koichi12 commited on
Commit
c060ea1
·
verified ·
1 Parent(s): 920167e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py +30 -0
  14. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py +28 -0
  15. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py +12 -0
  16. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py +14 -0
  17. .venv/lib/python3.11/site-packages/torchaudio/__init__.py +53 -0
  18. .venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py +61 -0
  22. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py +53 -0
  31. .venv/lib/python3.11/site-packages/torchaudio/_backend/common.py +52 -0
  32. .venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py +334 -0
  33. .venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py +54 -0
  34. .venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py +457 -0
  35. .venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py +91 -0
  36. .venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py +317 -0
  37. .venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py +8 -0
  38. .venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py +25 -0
  41. .venv/lib/python3.11/site-packages/torchaudio/backend/common.py +13 -0
  42. .venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py +14 -0
  43. .venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py +14 -0
  44. .venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py +127 -0
  45. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc +3 -0
  49. .venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py +128 -0
  50. .venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py +1669 -0
.gitattributes CHANGED
@@ -295,3 +295,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
295
  .venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text
296
  .venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
297
  .venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
295
  .venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text
296
  .venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
297
  .venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
298
+ .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (279 Bytes). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc ADDED
Binary file (2.18 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc ADDED
Binary file (2.15 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc ADDED
Binary file (1.02 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc ADDED
Binary file (719 Bytes). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc ADDED
Binary file (2.74 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc ADDED
Binary file (2.6 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc ADDED
Binary file (1.79 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc ADDED
Binary file (4.04 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc ADDED
Binary file (629 Bytes). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A benchmark for comparing equivalent validation of `const` and `enum`.
3
+ """
4
+
5
+ from pyperf import Runner
6
+
7
+ from jsonschema import Draft202012Validator
8
+
9
+ value = [37] * 100
10
+ const_schema = {"const": list(value)}
11
+ enum_schema = {"enum": [list(value)]}
12
+
13
+ valid = list(value)
14
+ invalid = [*valid, 73]
15
+
16
+ const = Draft202012Validator(const_schema)
17
+ enum = Draft202012Validator(enum_schema)
18
+
19
+ assert const.is_valid(valid)
20
+ assert enum.is_valid(valid)
21
+ assert not const.is_valid(invalid)
22
+ assert not enum.is_valid(invalid)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ runner = Runner()
27
+ runner.bench_func("const valid", lambda: const.is_valid(valid))
28
+ runner.bench_func("const invalid", lambda: const.is_valid(invalid))
29
+ runner.bench_func("enum valid", lambda: enum.is_valid(valid))
30
+ runner.bench_func("enum invalid", lambda: enum.is_valid(invalid))
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A benchmark for validation of the `contains` keyword.
3
+ """
4
+
5
+ from pyperf import Runner
6
+
7
+ from jsonschema import Draft202012Validator
8
+
9
+ schema = {
10
+ "type": "array",
11
+ "contains": {"const": 37},
12
+ }
13
+ validator = Draft202012Validator(schema)
14
+
15
+ size = 1000
16
+ beginning = [37] + [0] * (size - 1)
17
+ middle = [0] * (size // 2) + [37] + [0] * (size // 2)
18
+ end = [0] * (size - 1) + [37]
19
+ invalid = [0] * size
20
+
21
+
22
+ if __name__ == "__main__":
23
+ runner = Runner()
24
+ runner.bench_func("baseline", lambda: validator.is_valid([]))
25
+ runner.bench_func("beginning", lambda: validator.is_valid(beginning))
26
+ runner.bench_func("middle", lambda: validator.is_valid(middle))
27
+ runner.bench_func("end", lambda: validator.is_valid(end))
28
+ runner.bench_func("invalid", lambda: validator.is_valid(invalid))
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A performance benchmark using the official test suite.
3
+
4
+ This benchmarks jsonschema using every valid example in the
5
+ JSON-Schema-Test-Suite. It will take some time to complete.
6
+ """
7
+ from pyperf import Runner
8
+
9
+ from jsonschema.tests._suite import Suite
10
+
11
+ if __name__ == "__main__":
12
+ Suite().benchmark(runner=Runner())
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pyperf import Runner
2
+
3
+ from jsonschema import Draft202012Validator
4
+
5
+ schema = {
6
+ "type": "array",
7
+ "minLength": 1,
8
+ "maxLength": 1,
9
+ "items": {"type": "integer"},
10
+ }
11
+
12
+
13
+ if __name__ == "__main__":
14
+ Runner().bench_func("validator creation", Draft202012Validator, schema)
.venv/lib/python3.11/site-packages/torchaudio/__init__.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Initialize extension and backend first
2
+ from . import _extension # noqa # usort: skip
3
+ from ._backend import ( # noqa # usort: skip
4
+ AudioMetaData,
5
+ get_audio_backend,
6
+ info,
7
+ list_audio_backends,
8
+ load,
9
+ save,
10
+ set_audio_backend,
11
+ )
12
+
13
+ from . import ( # noqa: F401
14
+ compliance,
15
+ datasets,
16
+ functional,
17
+ io,
18
+ kaldi_io,
19
+ models,
20
+ pipelines,
21
+ sox_effects,
22
+ transforms,
23
+ utils,
24
+ )
25
+
26
+ # For BC
27
+ from . import backend # noqa # usort: skip
28
+
29
+ try:
30
+ from .version import __version__, git_version # noqa: F401
31
+ except ImportError:
32
+ pass
33
+
34
+
35
+ __all__ = [
36
+ "AudioMetaData",
37
+ "load",
38
+ "info",
39
+ "save",
40
+ "io",
41
+ "compliance",
42
+ "datasets",
43
+ "functional",
44
+ "models",
45
+ "pipelines",
46
+ "kaldi_io",
47
+ "utils",
48
+ "sox_effects",
49
+ "transforms",
50
+ "list_audio_backends",
51
+ "get_audio_backend",
52
+ "set_audio_backend",
53
+ ]
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.17 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc ADDED
Binary file (5.84 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc ADDED
Binary file (272 Bytes). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
+ from torchaudio._internal.module_utils import deprecated
4
+
5
+ from . import utils
6
+ from .common import AudioMetaData
7
+
8
+ __all__ = [
9
+ "AudioMetaData",
10
+ "load",
11
+ "info",
12
+ "save",
13
+ "list_audio_backends",
14
+ "get_audio_backend",
15
+ "set_audio_backend",
16
+ ]
17
+
18
+
19
+ info = utils.get_info_func()
20
+ load = utils.get_load_func()
21
+ save = utils.get_save_func()
22
+
23
+
24
+ def list_audio_backends() -> List[str]:
25
+ """List available backends
26
+
27
+ Returns:
28
+ list of str: The list of available backends.
29
+
30
+ The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``.
31
+ """
32
+
33
+ return list(utils.get_available_backends().keys())
34
+
35
+
36
+ # Temporary until global backend is removed
37
+ @deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
38
+ def get_audio_backend() -> Optional[str]:
39
+ """Get the name of the current global backend
40
+
41
+ Returns:
42
+ str or None:
43
+ If dispatcher mode is enabled, returns ``None`` otherwise,
44
+ the name of current backend or ``None`` (no backend is set).
45
+ """
46
+ return None
47
+
48
+
49
+ # Temporary until global backend is removed
50
+ @deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
51
+ def set_audio_backend(backend: Optional[str]): # noqa
52
+ """Set the global backend.
53
+
54
+ This is a no-op when dispatcher mode is enabled.
55
+
56
+ Args:
57
+ backend (str or None): Name of the backend.
58
+ One of ``"sox_io"`` or ``"soundfile"`` based on availability
59
+ of the system. If ``None`` is provided the current backend is unassigned.
60
+ """
61
+ pass
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.33 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc ADDED
Binary file (3.08 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc ADDED
Binary file (2.35 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc ADDED
Binary file (3.21 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc ADDED
Binary file (4.97 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc ADDED
Binary file (16.5 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from typing import BinaryIO, Optional, Tuple, Union
4
+
5
+ from torch import Tensor
6
+ from torchaudio.io import CodecConfig
7
+
8
+ from .common import AudioMetaData
9
+
10
+
11
+ class Backend(ABC):
12
+ @staticmethod
13
+ @abstractmethod
14
+ def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
15
+ raise NotImplementedError
16
+
17
+ @staticmethod
18
+ @abstractmethod
19
+ def load(
20
+ uri: Union[BinaryIO, str, os.PathLike],
21
+ frame_offset: int = 0,
22
+ num_frames: int = -1,
23
+ normalize: bool = True,
24
+ channels_first: bool = True,
25
+ format: Optional[str] = None,
26
+ buffer_size: int = 4096,
27
+ ) -> Tuple[Tensor, int]:
28
+ raise NotImplementedError
29
+
30
+ @staticmethod
31
+ @abstractmethod
32
+ def save(
33
+ uri: Union[BinaryIO, str, os.PathLike],
34
+ src: Tensor,
35
+ sample_rate: int,
36
+ channels_first: bool = True,
37
+ format: Optional[str] = None,
38
+ encoding: Optional[str] = None,
39
+ bits_per_sample: Optional[int] = None,
40
+ buffer_size: int = 4096,
41
+ compression: Optional[Union[CodecConfig, float, int]] = None,
42
+ ) -> None:
43
+ raise NotImplementedError
44
+
45
+ @staticmethod
46
+ @abstractmethod
47
+ def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
48
+ raise NotImplementedError
49
+
50
+ @staticmethod
51
+ @abstractmethod
52
+ def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
53
+ raise NotImplementedError
.venv/lib/python3.11/site-packages/torchaudio/_backend/common.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class AudioMetaData:
2
+ """AudioMetaData()
3
+
4
+ Return type of ``torchaudio.info`` function.
5
+
6
+ :ivar int sample_rate: Sample rate
7
+ :ivar int num_frames: The number of frames
8
+ :ivar int num_channels: The number of channels
9
+ :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
10
+ or when it cannot be accurately inferred.
11
+ :ivar str encoding: Audio encoding
12
+ The values encoding can take are one of the following:
13
+
14
+ * ``PCM_S``: Signed integer linear PCM
15
+ * ``PCM_U``: Unsigned integer linear PCM
16
+ * ``PCM_F``: Floating point linear PCM
17
+ * ``FLAC``: Flac, Free Lossless Audio Codec
18
+ * ``ULAW``: Mu-law
19
+ * ``ALAW``: A-law
20
+ * ``MP3`` : MP3, MPEG-1 Audio Layer III
21
+ * ``VORBIS``: OGG Vorbis
22
+ * ``AMR_WB``: Adaptive Multi-Rate Wideband
23
+ * ``AMR_NB``: Adaptive Multi-Rate Narrowband
24
+ * ``OPUS``: Opus
25
+ * ``HTK``: Single channel 16-bit PCM
26
+ * ``UNKNOWN`` : None of above
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ sample_rate: int,
32
+ num_frames: int,
33
+ num_channels: int,
34
+ bits_per_sample: int,
35
+ encoding: str,
36
+ ):
37
+ self.sample_rate = sample_rate
38
+ self.num_frames = num_frames
39
+ self.num_channels = num_channels
40
+ self.bits_per_sample = bits_per_sample
41
+ self.encoding = encoding
42
+
43
+ def __str__(self):
44
+ return (
45
+ f"AudioMetaData("
46
+ f"sample_rate={self.sample_rate}, "
47
+ f"num_frames={self.num_frames}, "
48
+ f"num_channels={self.num_channels}, "
49
+ f"bits_per_sample={self.bits_per_sample}, "
50
+ f"encoding={self.encoding}"
51
+ f")"
52
+ )
.venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from typing import BinaryIO, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torchaudio
8
+
9
+ from .backend import Backend
10
+ from .common import AudioMetaData
11
+
12
+ InputType = Union[BinaryIO, str, os.PathLike]
13
+
14
+
15
+ def info_audio(
16
+ src: InputType,
17
+ format: Optional[str],
18
+ buffer_size: int = 4096,
19
+ ) -> AudioMetaData:
20
+ s = torchaudio.io.StreamReader(src, format, None, buffer_size)
21
+ sinfo = s.get_src_stream_info(s.default_audio_stream)
22
+ if sinfo.num_frames == 0:
23
+ waveform = _load_audio(s)
24
+ num_frames = waveform.size(1)
25
+ else:
26
+ num_frames = sinfo.num_frames
27
+ return AudioMetaData(
28
+ int(sinfo.sample_rate),
29
+ num_frames,
30
+ sinfo.num_channels,
31
+ sinfo.bits_per_sample,
32
+ sinfo.codec.upper(),
33
+ )
34
+
35
+
36
+ def _get_load_filter(
37
+ frame_offset: int = 0,
38
+ num_frames: int = -1,
39
+ convert: bool = True,
40
+ ) -> Optional[str]:
41
+ if frame_offset < 0:
42
+ raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset))
43
+ if num_frames == 0 or num_frames < -1:
44
+ raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames))
45
+
46
+ # All default values -> no filter
47
+ if frame_offset == 0 and num_frames == -1 and not convert:
48
+ return None
49
+ # Only convert
50
+ aformat = "aformat=sample_fmts=fltp"
51
+ if frame_offset == 0 and num_frames == -1 and convert:
52
+ return aformat
53
+ # At least one of frame_offset or num_frames has non-default value
54
+ if num_frames > 0:
55
+ atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames)
56
+ else:
57
+ atrim = "atrim=start_sample={}".format(frame_offset)
58
+ if not convert:
59
+ return atrim
60
+ return "{},{}".format(atrim, aformat)
61
+
62
+
63
+ def _load_audio(
64
+ s: "torchaudio.io.StreamReader",
65
+ filter: Optional[str] = None,
66
+ channels_first: bool = True,
67
+ ) -> torch.Tensor:
68
+ s.add_audio_stream(-1, -1, filter_desc=filter)
69
+ s.process_all_packets()
70
+ chunk = s.pop_chunks()[0]
71
+ if chunk is None:
72
+ raise RuntimeError("Failed to decode audio.")
73
+ waveform = chunk._elem
74
+ return waveform.T if channels_first else waveform
75
+
76
+
77
+ def load_audio(
78
+ src: InputType,
79
+ frame_offset: int = 0,
80
+ num_frames: int = -1,
81
+ convert: bool = True,
82
+ channels_first: bool = True,
83
+ format: Optional[str] = None,
84
+ buffer_size: int = 4096,
85
+ ) -> Tuple[torch.Tensor, int]:
86
+ if hasattr(src, "read") and format == "vorbis":
87
+ format = "ogg"
88
+ s = torchaudio.io.StreamReader(src, format, None, buffer_size)
89
+ sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate)
90
+ filter = _get_load_filter(frame_offset, num_frames, convert)
91
+ waveform = _load_audio(s, filter, channels_first)
92
+ return waveform, sample_rate
93
+
94
+
95
+ def _get_sample_format(dtype: torch.dtype) -> str:
96
+ dtype_to_format = {
97
+ torch.uint8: "u8",
98
+ torch.int16: "s16",
99
+ torch.int32: "s32",
100
+ torch.int64: "s64",
101
+ torch.float32: "flt",
102
+ torch.float64: "dbl",
103
+ }
104
+ format = dtype_to_format.get(dtype)
105
+ if format is None:
106
+ raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.")
107
+ return format
108
+
109
+
110
+ def _native_endianness() -> str:
111
+ if sys.byteorder == "little":
112
+ return "le"
113
+ else:
114
+ return "be"
115
+
116
+
117
+ def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str:
118
+ if bits_per_sample not in {None, 8, 16, 24, 32, 64}:
119
+ raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.")
120
+ endianness = _native_endianness()
121
+ if not encoding:
122
+ if not bits_per_sample:
123
+ # default to PCM S16
124
+ return f"pcm_s16{endianness}"
125
+ if bits_per_sample == 8:
126
+ return "pcm_u8"
127
+ return f"pcm_s{bits_per_sample}{endianness}"
128
+ if encoding == "PCM_S":
129
+ if not bits_per_sample:
130
+ bits_per_sample = 16
131
+ if bits_per_sample == 8:
132
+ raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.")
133
+ return f"pcm_s{bits_per_sample}{endianness}"
134
+ if encoding == "PCM_U":
135
+ if bits_per_sample in (None, 8):
136
+ return "pcm_u8"
137
+ raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.")
138
+ if encoding == "PCM_F":
139
+ if not bits_per_sample:
140
+ bits_per_sample = 32
141
+ if bits_per_sample in (32, 64):
142
+ return f"pcm_f{bits_per_sample}{endianness}"
143
+ raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.")
144
+ if encoding == "ULAW":
145
+ if bits_per_sample in (None, 8):
146
+ return "pcm_mulaw"
147
+ raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.")
148
+ if encoding == "ALAW":
149
+ if bits_per_sample in (None, 8):
150
+ return "pcm_alaw"
151
+ raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.")
152
+ raise ValueError(f"WAV encoding {encoding} is not supported.")
153
+
154
+
155
+ def _get_flac_sample_fmt(bps):
156
+ if bps is None or bps == 16:
157
+ return "s16"
158
+ if bps == 24:
159
+ return "s32"
160
+ raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).")
161
+
162
+
163
+ def _parse_save_args(
164
+ ext: Optional[str],
165
+ format: Optional[str],
166
+ encoding: Optional[str],
167
+ bps: Optional[int],
168
+ ):
169
+ # torchaudio's save function accepts the followings, which do not 1to1 map
170
+ # to FFmpeg.
171
+ #
172
+ # - format: audio format
173
+ # - bits_per_sample: encoder sample format
174
+ # - encoding: such as PCM_U8.
175
+ #
176
+ # In FFmpeg, format is specified with the following three (and more)
177
+ #
178
+ # - muxer: could be audio format or container format.
179
+ # the one we passed to the constructor of StreamWriter
180
+ # - encoder: the audio encoder used to encode audio
181
+ # - encoder sample format: the format used by encoder to encode audio.
182
+ #
183
+ # If encoder sample format is different from source sample format, StreamWriter
184
+ # will insert a filter automatically.
185
+ #
186
+ def _type(spec):
187
+ # either format is exactly the specified one
188
+ # or extension matches to the spec AND there is no format override.
189
+ return format == spec or (format is None and ext == spec)
190
+
191
+ if _type("wav") or _type("amb"):
192
+ # wav is special because it supports different encoding through encoders
193
+ # each encoder only supports one encoder format
194
+ #
195
+ # amb format is a special case originated from libsox.
196
+ # It is basically a WAV format, with slight modification.
197
+ # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795
198
+ # It is a format so that decoders will recognize it as ambisonic.
199
+ # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/
200
+ # FFmpeg does not recognize amb because it is basically a WAV format.
201
+ muxer = "wav"
202
+ encoder = _get_encoder_for_wav(encoding, bps)
203
+ sample_fmt = None
204
+ elif _type("vorbis"):
205
+ # FFpmeg does not recognize vorbis extension, while libsox used to do.
206
+ # For the sake of bakward compatibility, (and the simplicity),
207
+ # we support the case where users want to do save("foo.vorbis")
208
+ muxer = "ogg"
209
+ encoder = "vorbis"
210
+ sample_fmt = None
211
+ else:
212
+ muxer = format
213
+ encoder = None
214
+ sample_fmt = None
215
+ if _type("flac"):
216
+ sample_fmt = _get_flac_sample_fmt(bps)
217
+ if _type("ogg"):
218
+ sample_fmt = _get_flac_sample_fmt(bps)
219
+ return muxer, encoder, sample_fmt
220
+
221
+
222
+ def save_audio(
223
+ uri: InputType,
224
+ src: torch.Tensor,
225
+ sample_rate: int,
226
+ channels_first: bool = True,
227
+ format: Optional[str] = None,
228
+ encoding: Optional[str] = None,
229
+ bits_per_sample: Optional[int] = None,
230
+ buffer_size: int = 4096,
231
+ compression: Optional[torchaudio.io.CodecConfig] = None,
232
+ ) -> None:
233
+ ext = None
234
+ if hasattr(uri, "write"):
235
+ if format is None:
236
+ raise RuntimeError("'format' is required when saving to file object.")
237
+ else:
238
+ uri = os.path.normpath(uri)
239
+ if tokens := str(uri).split(".")[1:]:
240
+ ext = tokens[-1].lower()
241
+
242
+ muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample)
243
+
244
+ if channels_first:
245
+ src = src.T
246
+
247
+ s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)
248
+ s.add_audio_stream(
249
+ sample_rate,
250
+ num_channels=src.size(-1),
251
+ format=_get_sample_format(src.dtype),
252
+ encoder=encoder,
253
+ encoder_format=enc_fmt,
254
+ codec_config=compression,
255
+ )
256
+ with s.open():
257
+ s.write_audio_chunk(0, src)
258
+
259
+
260
+ def _map_encoding(encoding: str) -> str:
261
+ for dst in ["PCM_S", "PCM_U", "PCM_F"]:
262
+ if dst in encoding:
263
+ return dst
264
+ if encoding == "PCM_MULAW":
265
+ return "ULAW"
266
+ elif encoding == "PCM_ALAW":
267
+ return "ALAW"
268
+ return encoding
269
+
270
+
271
+ def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
272
+ if m := re.search(r"PCM_\w(\d+)\w*", encoding):
273
+ return int(m.group(1))
274
+ elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
275
+ return 8
276
+ return bits_per_sample
277
+
278
+
279
+ class FFmpegBackend(Backend):
280
+ @staticmethod
281
+ def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
282
+ metadata = info_audio(uri, format, buffer_size)
283
+ metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
284
+ metadata.encoding = _map_encoding(metadata.encoding)
285
+ return metadata
286
+
287
+ @staticmethod
288
+ def load(
289
+ uri: InputType,
290
+ frame_offset: int = 0,
291
+ num_frames: int = -1,
292
+ normalize: bool = True,
293
+ channels_first: bool = True,
294
+ format: Optional[str] = None,
295
+ buffer_size: int = 4096,
296
+ ) -> Tuple[torch.Tensor, int]:
297
+ return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format)
298
+
299
+ @staticmethod
300
+ def save(
301
+ uri: InputType,
302
+ src: torch.Tensor,
303
+ sample_rate: int,
304
+ channels_first: bool = True,
305
+ format: Optional[str] = None,
306
+ encoding: Optional[str] = None,
307
+ bits_per_sample: Optional[int] = None,
308
+ buffer_size: int = 4096,
309
+ compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
310
+ ) -> None:
311
+ if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))):
312
+ raise ValueError(
313
+ "FFmpeg backend expects non-`None` value for argument `compression` to be of ",
314
+ f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}",
315
+ )
316
+ save_audio(
317
+ uri,
318
+ src,
319
+ sample_rate,
320
+ channels_first,
321
+ format,
322
+ encoding,
323
+ bits_per_sample,
324
+ buffer_size,
325
+ compression,
326
+ )
327
+
328
+ @staticmethod
329
+ def can_decode(uri: InputType, format: Optional[str]) -> bool:
330
+ return True
331
+
332
+ @staticmethod
333
+ def can_encode(uri: InputType, format: Optional[str]) -> bool:
334
+ return True
.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import BinaryIO, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torchaudio.io import CodecConfig
6
+
7
+ from . import soundfile_backend
8
+ from .backend import Backend
9
+ from .common import AudioMetaData
10
+
11
+
12
+ class SoundfileBackend(Backend):
13
+ @staticmethod
14
+ def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
15
+ return soundfile_backend.info(uri, format)
16
+
17
+ @staticmethod
18
+ def load(
19
+ uri: Union[BinaryIO, str, os.PathLike],
20
+ frame_offset: int = 0,
21
+ num_frames: int = -1,
22
+ normalize: bool = True,
23
+ channels_first: bool = True,
24
+ format: Optional[str] = None,
25
+ buffer_size: int = 4096,
26
+ ) -> Tuple[torch.Tensor, int]:
27
+ return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
28
+
29
+ @staticmethod
30
+ def save(
31
+ uri: Union[BinaryIO, str, os.PathLike],
32
+ src: torch.Tensor,
33
+ sample_rate: int,
34
+ channels_first: bool = True,
35
+ format: Optional[str] = None,
36
+ encoding: Optional[str] = None,
37
+ bits_per_sample: Optional[int] = None,
38
+ buffer_size: int = 4096,
39
+ compression: Optional[Union[CodecConfig, float, int]] = None,
40
+ ) -> None:
41
+ if compression:
42
+ raise ValueError("soundfile backend does not support argument `compression`.")
43
+
44
+ soundfile_backend.save(
45
+ uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
46
+ )
47
+
48
+ @staticmethod
49
+ def can_decode(uri, format) -> bool:
50
+ return True
51
+
52
+ @staticmethod
53
+ def can_encode(uri, format) -> bool:
54
+ return True
.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The new soundfile backend which will become default in 0.8.0 onward"""
2
+ import warnings
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ from torchaudio._internal import module_utils as _mod_utils
7
+
8
+ from .common import AudioMetaData
9
+
10
+
11
# Tracks whether the `soundfile` package imported successfully; consulted by
# the backend registry to decide whether to expose the soundfile backend.
_IS_SOUNDFILE_AVAILABLE = False

# TODO: import soundfile only when it is used.
if _mod_utils.is_module_available("soundfile"):
    try:
        import soundfile

        # Import succeeded: the decorator is a no-op and decorated functions
        # run normally.
        _requires_soundfile = _mod_utils.no_op
        _IS_SOUNDFILE_AVAILABLE = True
    except Exception:
        # The package is installed but failed to import (e.g. the native
        # libsndfile is missing); decorated functions raise this message
        # when called.
        _requires_soundfile = _mod_utils.fail_with_message(
            "requires soundfile, but we failed to import it. Please check the installation of soundfile."
        )
else:
    # Package not installed at all.
    _requires_soundfile = _mod_utils.fail_with_message(
        "requires soundfile, but it is not installed. Please install soundfile."
    )
28
+
29
+
30
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristical and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
# Subtypes absent from this table are reported as 0 by `_get_bit_depth`
# (with a warning).
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
}
67
+
68
+
69
def _get_bit_depth(subtype):
    """Return the bit depth for a libsndfile subtype string.

    Unknown subtypes produce a warning and a bit depth of 0, matching the
    convention used for lossy/uninferable subtypes in the table above.
    """
    try:
        return _SUBTYPE_TO_BITS_PER_SAMPLE[subtype]
    except KeyError:
        warnings.warn(
            f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
            "attribute will be set to 0. If you are seeing this warning, please "
            "report by opening an issue on github (after checking for existing/closed ones). "
            "You may otherwise ignore this warning."
        )
        return 0
78
+
79
+
80
# Mapping from soundfile subtype to the normalized encoding name reported in
# AudioMetaData. Subtypes absent from this table are reported as "UNKNOWN"
# by `_get_encoding`.
_SUBTYPE_TO_ENCODING = {
    "PCM_S8": "PCM_S",
    "PCM_16": "PCM_S",
    "PCM_24": "PCM_S",
    "PCM_32": "PCM_S",
    "PCM_U8": "PCM_U",
    "FLOAT": "PCM_F",
    "DOUBLE": "PCM_F",
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}
92
+
93
+
94
+ def _get_encoding(format: str, subtype: str):
95
+ if format == "FLAC":
96
+ return "FLAC"
97
+ return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
98
+
99
+
100
@_requires_soundfile
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
    """Get signal information of an audio file.

    Note:
        ``filepath`` is intentionally annotated as ``str`` only, even though
        it accepts ``pathlib.Path`` objects as well. This keeps the signature
        consistent with the ``"sox_io"`` backend, whose annotations are
        restricted by the TorchScript compiler.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept a format hint.

    Returns:
        AudioMetaData: metadata of the given audio.
    """
    sinfo = soundfile.info(filepath)
    metadata = AudioMetaData(
        sinfo.samplerate,
        sinfo.frames,
        sinfo.channels,
        bits_per_sample=_get_bit_depth(sinfo.subtype),
        encoding=_get_encoding(sinfo.format, sinfo.subtype),
    )
    return metadata
127
+
128
+
129
# Mapping from integer/float WAV subtypes to the numpy dtype string passed to
# `SoundFile.read` when `normalize=False`. Subtypes not listed here (e.g.
# PCM_24) cannot be loaded un-normalized and make `load` raise ValueError.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}
137
+
138
+
139
@_requires_soundfile
def load(
    filepath: str,
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
    ``float32`` dtype, and the shape of `[channel, time]`.

    .. warning::

        ``normalize`` argument does not perform volume normalization.
        It only converts the sample type to `torch.float32` from the native sample
        type.

        When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
        signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
        this function can return integer Tensor, where the samples are expressed within the whole range
        of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
        ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
        support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.

        ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
        ``flac`` and ``mp3``.

        For these formats, this function always returns ``float32`` Tensor with values.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
            This function may return fewer frames if there are not enough
            frames in the given file.
        normalize (bool, optional):
            When ``True``, this function converts the native sample type to ``float32``.
            Default: ``True``.

            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type.
            This argument has no effect for formats other than integer WAV type.

        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (torch.Tensor, int): Resulting Tensor and sample rate.
        If the input file has integer wav format and normalization is off, then it has
        integer type, else ``float32`` type. If ``channels_first=True``, it has
        `[channel, time]` else `[time, channel]`.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        # Integer WAV data may be returned in its native dtype only when
        # normalization is disabled; everything else decodes to float32.
        if file_.format != "WAV" or normalize:
            dtype = "float32"
        elif file_.subtype not in _SUBTYPE2DTYPE:
            raise ValueError(f"Unsupported subtype: {file_.subtype}")
        else:
            dtype = _SUBTYPE2DTYPE[file_.subtype]

        # NOTE(review): relies on soundfile's private `_prepare_read` to seek
        # to `frame_offset` and clamp the frame count — verify this still
        # holds when upgrading the soundfile dependency.
        frames = file_._prepare_read(frame_offset, None, num_frames)
        waveform = file_.read(frames, dtype, always_2d=True)
        sample_rate = file_.samplerate

    waveform = torch.from_numpy(waveform)
    if channels_first:
        # soundfile returns [time, channel]; transpose to [channel, time].
        waveform = waveform.t()
    return waveform, sample_rate
237
+
238
+
239
+ def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
240
+ if not encoding:
241
+ if not bits_per_sample:
242
+ subtype = {
243
+ torch.uint8: "PCM_U8",
244
+ torch.int16: "PCM_16",
245
+ torch.int32: "PCM_32",
246
+ torch.float32: "FLOAT",
247
+ torch.float64: "DOUBLE",
248
+ }.get(dtype)
249
+ if not subtype:
250
+ raise ValueError(f"Unsupported dtype for wav: {dtype}")
251
+ return subtype
252
+ if bits_per_sample == 8:
253
+ return "PCM_U8"
254
+ return f"PCM_{bits_per_sample}"
255
+ if encoding == "PCM_S":
256
+ if not bits_per_sample:
257
+ return "PCM_32"
258
+ if bits_per_sample == 8:
259
+ raise ValueError("wav does not support 8-bit signed PCM encoding.")
260
+ return f"PCM_{bits_per_sample}"
261
+ if encoding == "PCM_U":
262
+ if bits_per_sample in (None, 8):
263
+ return "PCM_U8"
264
+ raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
265
+ if encoding == "PCM_F":
266
+ if bits_per_sample in (None, 32):
267
+ return "FLOAT"
268
+ if bits_per_sample == 64:
269
+ return "DOUBLE"
270
+ raise ValueError("wav only supports 32/64-bit float PCM encoding.")
271
+ if encoding == "ULAW":
272
+ if bits_per_sample in (None, 8):
273
+ return "ULAW"
274
+ raise ValueError("wav only supports 8-bit mu-law encoding.")
275
+ if encoding == "ALAW":
276
+ if bits_per_sample in (None, 8):
277
+ return "ALAW"
278
+ raise ValueError("wav only supports 8-bit a-law encoding.")
279
+ raise ValueError(f"wav does not support {encoding}.")
280
+
281
+
282
+ def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
283
+ if encoding in (None, "PCM_S"):
284
+ return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
285
+ if encoding in ("PCM_U", "PCM_F"):
286
+ raise ValueError(f"sph does not support {encoding} encoding.")
287
+ if encoding == "ULAW":
288
+ if bits_per_sample in (None, 8):
289
+ return "ULAW"
290
+ raise ValueError("sph only supports 8-bit for mu-law encoding.")
291
+ if encoding == "ALAW":
292
+ return "ALAW"
293
+ raise ValueError(f"sph does not support {encoding}.")
294
+
295
+
296
+ def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
297
+ if format == "wav":
298
+ return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
299
+ if format == "flac":
300
+ if encoding:
301
+ raise ValueError("flac does not support encoding.")
302
+ if not bits_per_sample:
303
+ return "PCM_16"
304
+ if bits_per_sample > 24:
305
+ raise ValueError("flac does not support bits_per_sample > 24.")
306
+ return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
307
+ if format in ("ogg", "vorbis"):
308
+ if bits_per_sample:
309
+ raise ValueError("ogg/vorbis does not support bits_per_sample.")
310
+ if encoding is None or encoding == "vorbis":
311
+ return "VORBIS"
312
+ if encoding == "opus":
313
+ return "OPUS"
314
+ raise ValueError(f"Unexpected encoding: {encoding}")
315
+ if format == "mp3":
316
+ return "MPEG_LAYER_III"
317
+ if format == "sph":
318
+ return _get_subtype_for_sphere(encoding, bits_per_sample)
319
+ if format in ("nis", "nist"):
320
+ return "PCM_16"
321
+ raise ValueError(f"Unsupported format: {format}")
322
+
323
+
324
@_requires_soundfile
def save(
    filepath: str,
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (str or pathlib.Path): Path to audio file.
        src (torch.Tensor): Audio data to save. Must be 2D tensor.
        sample_rate (int): sampling rate
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reason with "sox_io" backend.
        format (str or None, optional): Override the audio format.
            When ``filepath`` argument is path-like object, audio format is
            inferred from file extension. If the file extension is missing or
            different, you can specify the correct format with this argument.

            When ``filepath`` argument is file-like object,
            this argument is required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

            - ``"PCM_S"`` (signed integer Linear PCM)
            - ``"PCM_U"`` (unsigned integer Linear PCM)
            - ``"PCM_F"`` (floating point PCM)
            - ``"ULAW"`` (mu-law)
            - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        # Accepted for interface parity with "sox_io" only; warn and ignore.
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )
    if hasattr(filepath, "write"):
        # File-like destination: the format cannot be inferred from a name.
        if format is None:
            raise RuntimeError("`format` is required when saving to file object.")
        ext = format.lower()
    else:
        ext = str(filepath).split(".")[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here
    if ext in ["nis", "nist", "sph"] and format is None:
        format = "NIST"

    if channels_first:
        # soundfile expects [time, channel].
        src = src.t()

    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
.venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import BinaryIO, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torchaudio
6
+
7
+ from .backend import Backend
8
+ from .common import AudioMetaData
9
+
10
# Resolve the native SoX extension lazily so that importing this module does
# not require the extension until it is actually used.
sox_ext = torchaudio._extension.lazy_import_sox_ext()
11
+
12
+
13
class SoXBackend(Backend):
    """I/O backend backed by libsox.

    Works with file paths only: file-like objects are rejected for info,
    load, and save. ``buffer_size`` parameters are accepted for interface
    parity with the other backends and are not used.
    """

    @staticmethod
    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
        """Fetch metadata for ``uri``.

        Raises:
            ValueError: if ``uri`` is a file-like object.
            RuntimeError: if libsox fails to read the metadata.
        """
        if hasattr(uri, "read"):
            # Pass ONE string: `raise ValueError("a", "b")` sets e.args to a
            # tuple, so str(e) renders as a tuple repr and garbles the message.
            raise ValueError(
                "SoX backend does not support reading from file-like objects. "
                "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg."
            )
        sinfo = sox_ext.get_info(uri, format)
        if not sinfo:
            raise RuntimeError(f"Failed to fetch metadata for {uri}.")
        return AudioMetaData(*sinfo)

    @staticmethod
    def load(
        uri: Union[BinaryIO, str, os.PathLike],
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
    ) -> Tuple[torch.Tensor, int]:
        """Load audio from ``uri`` via libsox.

        Raises:
            ValueError: if ``uri`` is a file-like object.
            RuntimeError: if libsox fails to decode the file.
        """
        if hasattr(uri, "read"):
            raise ValueError(
                "SoX backend does not support loading from file-like objects. "
                "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg."
            )
        ret = sox_ext.load_audio_file(uri, frame_offset, num_frames, normalize, channels_first, format)
        if not ret:
            raise RuntimeError(f"Failed to load audio from {uri}.")
        return ret

    @staticmethod
    def save(
        uri: Union[BinaryIO, str, os.PathLike],
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
    ) -> None:
        """Save audio to ``uri`` via libsox.

        ``compression``, if given, must be a plain number (SoX ``-C`` value);
        :py:class:`CodecConfig` instances are only valid for the FFmpeg backend.

        Raises:
            ValueError: if ``compression`` has an unsupported type, or if
                ``uri`` is a file-like object.
        """
        if not isinstance(compression, (float, int, type(None))):
            raise ValueError(
                "SoX backend expects non-`None` value for argument `compression` to be of "
                f"type `float` or `int`, but received value of type {type(compression)}"
            )
        if hasattr(uri, "write"):
            raise ValueError(
                "SoX backend does not support writing to file-like objects. "
                "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg."
            )
        sox_ext.save_audio_file(
            uri,
            src,
            sample_rate,
            channels_first,
            compression,
            format,
            encoding,
            bits_per_sample,
        )

    @staticmethod
    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
        # i.e. not a file-like object.
        return not hasattr(uri, "read")

    @staticmethod
    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
        # i.e. not a file-like object.
        return not hasattr(uri, "write")
.venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+ from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
4
+
5
+ import torch
6
+
7
+ from torchaudio._extension import lazy_import_sox_ext
8
+ from torchaudio.io import CodecConfig
9
+ from torio._extension import lazy_import_ffmpeg_ext
10
+
11
+ from . import soundfile_backend
12
+
13
+ from .backend import Backend
14
+ from .common import AudioMetaData
15
+ from .ffmpeg import FFmpegBackend
16
+ from .soundfile import SoundfileBackend
17
+ from .sox import SoXBackend
18
+
19
+
20
@lru_cache(None)
def get_available_backends() -> Dict[str, Type[Backend]]:
    """Return the usable I/O backends keyed by name.

    Availability is probed once per process (cached). Insertion order matters:
    the dispatchers try backends in this order (ffmpeg, sox, soundfile).
    """
    candidates = (
        ("ffmpeg", FFmpegBackend, lambda: lazy_import_ffmpeg_ext().is_available()),
        ("sox", SoXBackend, lambda: lazy_import_sox_ext().is_available()),
        ("soundfile", SoundfileBackend, lambda: soundfile_backend._IS_SOUNDFILE_AVAILABLE),
    )
    return {name: impl for name, impl, is_usable in candidates if is_usable()}
30
+
31
+
32
def get_backend(backend_name, backends) -> "Backend":
    """Look up a backend class by name.

    Args:
        backend_name (str): Name of the requested backend (e.g. ``"ffmpeg"``).
        backends (dict): Mapping of available backend names to backend classes.

    Returns:
        Backend: the backend registered under ``backend_name``.

    Raises:
        ValueError: if no backend with that name is available.
    """
    if backend := backends.get(backend_name):
        return backend
    # Pass a SINGLE formatted string: the previous comma-separated form made
    # ValueError carry a tuple of strings, so str(e) rendered a tuple repr.
    raise ValueError(
        f"Unsupported backend '{backend_name}' specified; "
        f"please select one of {list(backends.keys())} instead."
    )
40
+
41
+
42
def get_info_func():
    """Create the public ``torchaudio.info`` function.

    The set of available backends is resolved once, when this factory is
    called, and captured by the returned closure.
    """
    backends = get_available_backends()

    def dispatcher(
        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
    ) -> Backend:
        # An explicitly requested backend always wins (raises if unavailable).
        if backend_name is not None:
            return get_backend(backend_name, backends)

        # Otherwise the first backend, in registration order (ffmpeg, sox,
        # soundfile), that claims it can decode this input is used.
        for backend in backends.values():
            if backend.can_decode(uri, format):
                return backend
        raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")

    def info(
        uri: Union[BinaryIO, str, os.PathLike],
        format: Optional[str] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
    ) -> AudioMetaData:
        """Get signal information of an audio file.

        Note:
            When the input type is file-like object, this function cannot
            get the correct length (``num_samples``) for certain formats,
            such as ``vorbis``.
            In this case, the value of ``num_samples`` is ``0``.

        Args:
            uri (path-like object or file-like object):
                Source of audio data. The following types are accepted:

                * ``path-like``: File path or URL.
                * ``file-like``: Object with ``read(size: int) -> bytes`` method,
                  which returns byte string of at most ``size`` length.

            format (str or None, optional):
                If not ``None``, interpreted as hint that may allow backend to override the detected format.
                (Default: ``None``)

            buffer_size (int, optional):
                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)

            backend (str or None, optional):
                I/O backend to use.
                If ``None``, function selects backend given input and available backends.
                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
                with the corresponding backend available.
                (Default: ``None``)

                .. seealso::
                    :ref:`backend`

        Returns:
            AudioMetaData
        """
        backend = dispatcher(uri, format, backend)
        return backend.info(uri, format, buffer_size)

    return info
102
+
103
+
104
def get_load_func():
    """Create the public ``torchaudio.load`` function.

    The set of available backends is resolved once, when this factory is
    called, and captured by the returned closure.
    """
    backends = get_available_backends()

    def dispatcher(
        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
    ) -> Backend:
        # An explicitly requested backend always wins (raises if unavailable).
        if backend_name is not None:
            return get_backend(backend_name, backends)

        # Otherwise the first backend, in registration order (ffmpeg, sox,
        # soundfile), that claims it can decode this input is used.
        for backend in backends.values():
            if backend.can_decode(uri, format):
                return backend
        raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")

    def load(
        uri: Union[BinaryIO, str, os.PathLike],
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
    ) -> Tuple[torch.Tensor, int]:
        """Load audio data from source.

        By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
        ``float32`` dtype, and the shape of `[channel, time]`.

        Note:
            The formats this function can handle depend on the availability of backends.
            Please use the following functions to fetch the supported formats.

            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
            - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.

        .. warning::

            ``normalize`` argument does not perform volume normalization.
            It only converts the sample type to `torch.float32` from the native sample
            type.

            When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
            signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
            this function can return integer Tensor, where the samples are expressed within the whole range
            of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
            ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
            support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.

            ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
            ``flac`` and ``mp3``.

            For these formats, this function always returns ``float32`` Tensor with values.


        Args:
            uri (path-like object or file-like object):
                Source of audio data.
            frame_offset (int, optional):
                Number of frames to skip before start reading data.
            num_frames (int, optional):
                Maximum number of frames to read. ``-1`` reads all the remaining samples,
                starting from ``frame_offset``.
                This function may return fewer frames if there are not enough
                frames in the given file.
            normalize (bool, optional):
                When ``True``, this function converts the native sample type to ``float32``.
                Default: ``True``.

                If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
                integer type.
                This argument has no effect for formats other than integer WAV type.

            channels_first (bool, optional):
                When True, the returned Tensor has dimension `[channel, time]`.
                Otherwise, the returned Tensor's dimension is `[time, channel]`.

            format (str or None, optional):
                If not ``None``, interpreted as hint that may allow backend to override the detected format.
                (Default: ``None``)

            buffer_size (int, optional):
                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)

            backend (str or None, optional):
                I/O backend to use.
                If ``None``, function selects backend given input and available backends.
                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
                with the corresponding backend being available. (Default: ``None``)

                .. seealso::
                    :ref:`backend`

        Returns:
            (torch.Tensor, int): Resulting Tensor and sample rate.
            If the input file has integer wav format and normalization is off, then it has
            integer type, else ``float32`` type. If ``channels_first=True``, it has
            `[channel, time]` else `[time, channel]`.
        """
        backend = dispatcher(uri, format, backend)
        return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)

    return load
208
+
209
+
210
def get_save_func():
    """Create the public ``torchaudio.save`` function.

    The set of available backends is resolved once, when this factory is
    called, and captured by the returned closure.
    """
    backends = get_available_backends()

    def dispatcher(
        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
    ) -> Backend:
        # An explicitly requested backend always wins (raises if unavailable).
        if backend_name is not None:
            return get_backend(backend_name, backends)

        # Otherwise the first backend, in registration order (ffmpeg, sox,
        # soundfile), that claims it can encode to this target is used.
        for backend in backends.values():
            if backend.can_encode(uri, format):
                return backend
        raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")

    def save(
        uri: Union[BinaryIO, str, os.PathLike],
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
        compression: Optional[Union[CodecConfig, float, int]] = None,
    ):
        """Save audio data to file.

        Note:
            The formats this function can handle depend on the availability of backends.
            Please use the following functions to fetch the supported formats.

            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
            - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.

        Args:
            uri (str or pathlib.Path): Path to audio file.
            src (torch.Tensor): Audio data to save. Must be 2D tensor.
            sample_rate (int): sampling rate
            channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
                otherwise `[time, channel]`.
            format (str or None, optional): Override the audio format.
                When ``uri`` argument is path-like object, audio format is
                inferred from file extension. If the file extension is missing or
                different, you can specify the correct format with this argument.

                When ``uri`` argument is file-like object,
                this argument is required.

                Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``.
            encoding (str or None, optional): Changes the encoding for supported formats.
                This argument is effective only for supported formats, i.e.
                ``"wav"`` and ``"flac"``. Valid values are

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

            bits_per_sample (int or None, optional): Changes the bit depth for the
                supported formats.
                When ``format`` is one of ``"wav"`` and ``"flac"``,
                you can change the bit depth.
                Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

            buffer_size (int, optional):
                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)

            backend (str or None, optional):
                I/O backend to use.
                If ``None``, function selects backend given input and available backends.
                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
                with the corresponding backend being available.
                (Default: ``None``)

                .. seealso::
                    :ref:`backend`

            compression (CodecConfig, float, int, or None, optional):
                Compression configuration to apply.

                If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.

                Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
                ``sox`` command line interface must be provided. For instance:

                ``"mp3"``
                    Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
                    VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.

                ``"flac"``
                    Whole number from ``0`` to ``8``. ``8`` is default and highest compression.

                ``"ogg"``, ``"vorbis"``
                    Number from ``-1`` to ``10``; ``-1`` is the highest compression
                    and lowest quality. Default: ``3``.

                Refer to http://sox.sourceforge.net/soxformat.html for more details.

        """
        backend = dispatcher(uri, format, backend)
        return backend.save(
            uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
        )

    return save
.venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # NOTE:
2
+ # The entire `torchaudio.backend` module is deprecated.
3
+ # New things should be added to `torchaudio._backend`.
4
+ # Only things related to backward compatibility should be placed here.
5
+
6
+ from . import common, no_backend, soundfile_backend, sox_io_backend # noqa
7
+
8
+ __all__ = []
.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc ADDED
Binary file (12.7 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc ADDED
Binary file (889 Bytes). View file
 
.venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Callable, Optional, Tuple, Union
3
+
4
+ from torch import Tensor
5
+ from torchaudio import AudioMetaData
6
+
7
+
8
def load(
    filepath: Union[str, Path],
    out: Optional[Tensor] = None,
    normalization: Union[bool, float, Callable] = True,
    channels_first: bool = True,
    num_frames: int = 0,
    offset: int = 0,
    filetype: Optional[str] = None,
) -> Tuple[Tensor, int]:
    """Placeholder ``load`` installed when no audio I/O backend is available.

    The signature mirrors the real backend ``load`` functions so callers fail
    at call time (not import time) with a clear message.

    Raises:
        RuntimeError: always, regardless of the arguments.
    """
    raise RuntimeError("No audio I/O backend is available.")
18
+
19
+
20
def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
    """Placeholder ``save`` installed when no audio I/O backend is available.

    Raises:
        RuntimeError: always, regardless of the arguments.
    """
    raise RuntimeError("No audio I/O backend is available.")
22
+
23
+
24
def info(filepath: str) -> AudioMetaData:
    # Placeholder ``info`` installed when no audio I/O backend is available.
    # Always raises so that callers get a clear error at call time.
    raise RuntimeError("No audio I/O backend is available.")
.venv/lib/python3.11/site-packages/torchaudio/backend/common.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def __getattr__(name: str):
    """Module-level attribute hook (PEP 562) kept for backward compatibility.

    ``AudioMetaData`` used to live in this module; accessing it here emits a
    deprecation warning and forwards to its new home, ``torchaudio.AudioMetaData``.
    Any other name raises :class:`AttributeError`.
    """
    if name != "AudioMetaData":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    import warnings

    warnings.warn(
        "`torchaudio.backend.common.AudioMetaData` has been moved to "
        "`torchaudio.AudioMetaData`. Please update the import path.",
        stacklevel=2,
    )
    from torchaudio import AudioMetaData

    return AudioMetaData
.venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def __getattr__(name: str):
    """Deprecation shim (PEP 562): forward attribute access to the new backend module.

    Emits a warning on every access, then resolves ``name`` against
    ``torchaudio._backend.soundfile_backend``.

    Raises:
        AttributeError: if the new module has no such attribute (via ``getattr``).
    """
    import warnings

    # Fixed typos in the user-facing message: "par-call bakcend" -> "per-call
    # backend", "udnerlying" -> "underlying".
    warnings.warn(
        "Torchaudio's I/O functions now support per-call backend dispatch. "
        "Importing backend implementation directly is no longer guaranteed to work. "
        "Please use `backend` keyword with load/save/info function, instead of "
        "calling the underlying implementation directly.",
        stacklevel=2,
    )

    from torchaudio._backend import soundfile_backend

    return getattr(soundfile_backend, name)
.venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def __getattr__(name: str):
    """Deprecation shim (PEP 562): forward attribute access to ``_sox_io_backend``.

    Emits a warning on every access, then resolves ``name`` against the
    sibling ``_sox_io_backend`` module.

    Raises:
        AttributeError: if the underlying module has no such attribute (via ``getattr``).
    """
    import warnings

    # Fixed typos in the user-facing message: "par-call bakcend" -> "per-call
    # backend", "udnerlying" -> "underlying".
    warnings.warn(
        "Torchaudio's I/O functions now support per-call backend dispatch. "
        "Importing backend implementation directly is no longer guaranteed to work. "
        "Please use `backend` keyword with load/save/info function, instead of "
        "calling the underlying implementation directly.",
        stacklevel=2,
    )

    from . import _sox_io_backend

    return getattr(_sox_io_backend, name)
.venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._alignment import forced_align, merge_tokens, TokenSpan
2
+ from .filtering import (
3
+ allpass_biquad,
4
+ band_biquad,
5
+ bandpass_biquad,
6
+ bandreject_biquad,
7
+ bass_biquad,
8
+ biquad,
9
+ contrast,
10
+ dcshift,
11
+ deemph_biquad,
12
+ dither,
13
+ equalizer_biquad,
14
+ filtfilt,
15
+ flanger,
16
+ gain,
17
+ highpass_biquad,
18
+ lfilter,
19
+ lowpass_biquad,
20
+ overdrive,
21
+ phaser,
22
+ riaa_biquad,
23
+ treble_biquad,
24
+ vad,
25
+ )
26
+ from .functional import (
27
+ add_noise,
28
+ amplitude_to_DB,
29
+ apply_beamforming,
30
+ apply_codec,
31
+ compute_deltas,
32
+ convolve,
33
+ create_dct,
34
+ DB_to_amplitude,
35
+ deemphasis,
36
+ detect_pitch_frequency,
37
+ edit_distance,
38
+ fftconvolve,
39
+ frechet_distance,
40
+ griffinlim,
41
+ inverse_spectrogram,
42
+ linear_fbanks,
43
+ loudness,
44
+ mask_along_axis,
45
+ mask_along_axis_iid,
46
+ melscale_fbanks,
47
+ mu_law_decoding,
48
+ mu_law_encoding,
49
+ mvdr_weights_rtf,
50
+ mvdr_weights_souden,
51
+ phase_vocoder,
52
+ pitch_shift,
53
+ preemphasis,
54
+ psd,
55
+ resample,
56
+ rnnt_loss,
57
+ rtf_evd,
58
+ rtf_power,
59
+ sliding_window_cmn,
60
+ spectral_centroid,
61
+ spectrogram,
62
+ speed,
63
+ )
64
+
65
+ __all__ = [
66
+ "amplitude_to_DB",
67
+ "compute_deltas",
68
+ "create_dct",
69
+ "melscale_fbanks",
70
+ "linear_fbanks",
71
+ "DB_to_amplitude",
72
+ "loudness",
73
+ "detect_pitch_frequency",
74
+ "griffinlim",
75
+ "mask_along_axis",
76
+ "mask_along_axis_iid",
77
+ "mu_law_encoding",
78
+ "mu_law_decoding",
79
+ "phase_vocoder",
80
+ "sliding_window_cmn",
81
+ "spectrogram",
82
+ "inverse_spectrogram",
83
+ "spectral_centroid",
84
+ "allpass_biquad",
85
+ "band_biquad",
86
+ "bandpass_biquad",
87
+ "bandreject_biquad",
88
+ "bass_biquad",
89
+ "biquad",
90
+ "contrast",
91
+ "dither",
92
+ "dcshift",
93
+ "deemph_biquad",
94
+ "equalizer_biquad",
95
+ "filtfilt",
96
+ "flanger",
97
+ "forced_align",
98
+ "merge_tokens",
99
+ "TokenSpan",
100
+ "gain",
101
+ "highpass_biquad",
102
+ "lfilter",
103
+ "lowpass_biquad",
104
+ "overdrive",
105
+ "phaser",
106
+ "riaa_biquad",
107
+ "treble_biquad",
108
+ "vad",
109
+ "apply_codec",
110
+ "resample",
111
+ "edit_distance",
112
+ "pitch_shift",
113
+ "rnnt_loss",
114
+ "psd",
115
+ "mvdr_weights_souden",
116
+ "mvdr_weights_rtf",
117
+ "rtf_evd",
118
+ "rtf_power",
119
+ "apply_beamforming",
120
+ "fftconvolve",
121
+ "convolve",
122
+ "add_noise",
123
+ "speed",
124
+ "preemphasis",
125
+ "deemphasis",
126
+ "frechet_distance",
127
+ ]
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.63 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc ADDED
Binary file (6.77 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc ADDED
Binary file (74 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e5719c3daaa09433b5ece2431df353ef399f7678bc6bee1f1ebff9b16f9c13
3
+ size 115834
.venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Tuple
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from torchaudio._extension import fail_if_no_align
7
+
8
+ __all__ = []
9
+
10
+
11
@fail_if_no_align
def forced_align(
    log_probs: Tensor,
    targets: Tensor,
    input_lengths: Optional[Tensor] = None,
    target_lengths: Optional[Tensor] = None,
    blank: int = 0,
) -> Tuple[Tensor, Tensor]:
    r"""Align a CTC label sequence to an emission.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Args:
        log_probs (Tensor): log probability of CTC emission output.
            Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length,
            `C` is the number of characters in alphabet including blank.
        targets (Tensor): Target sequence. Tensor of shape `(B, L)`,
            where `L` is the target length.
        input_lengths (Tensor or None, optional):
            Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`.
        target_lengths (Tensor or None, optional):
            Lengths of the targets. 1-D Tensor of shape `(B,)`.
        blank (int, optional): The index of blank symbol in CTC emission. (Default: 0)

    Returns:
        Tuple(Tensor, Tensor):
            Tensor: Label for each time step in the alignment path computed using forced alignment.

            Tensor: Log probability scores of the labels for each time step.

    Note:
        The sequence length of `log_probs` must satisfy:


        .. math::
            L_{\text{log\_probs}} \ge L_{\text{label}} + N_{\text{repeat}}

        where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens.
        For example, in str `"aabbc"`, the number of repeats are `2`.

    Note:
        The current version only supports ``batch_size==1``.
    """
    # Validate up-front: the alignment kernel assumes blank-free targets and
    # target indices within the emission's class dimension.
    if blank in targets:
        raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")
    if torch.max(targets) >= log_probs.shape[-1]:
        raise ValueError("targets values must be less than the CTC dimension")

    # Default lengths: assume every sequence spans the full (padded) length.
    if input_lengths is None:
        batch_size, length = log_probs.size(0), log_probs.size(1)
        input_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=log_probs.device)
    if target_lengths is None:
        batch_size, length = targets.size(0), targets.size(1)
        target_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=targets.device)

    # For TorchScript compatibility
    assert input_lengths is not None
    assert target_lengths is not None

    # Dispatch to the native (C++) forced-alignment operator.
    paths, scores = torch.ops.torchaudio.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
    return paths, scores
74
+
75
+
76
@dataclass
class TokenSpan:
    """TokenSpan()
    Token with time stamps and score. Returned by :py:func:`merge_tokens`.
    """

    token: int
    """The token"""
    start: int
    """The start time (inclusive) in emission time axis."""
    end: int
    """The end time (exclusive) in emission time axis."""
    score: float
    """The score of the this token."""

    def __len__(self) -> int:
        """Returns the time span"""
        duration = self.end - self.start
        return duration
94
+
95
+
96
def merge_tokens(tokens: Tensor, scores: Tensor, blank: int = 0) -> List[TokenSpan]:
    """Removes repeated tokens and blank tokens from the given CTC token sequence.

    Args:
        tokens (Tensor): Alignment tokens (unbatched) returned from :py:func:`forced_align`.
            Shape: `(time, )`.
        scores (Tensor): Alignment scores (unbatched) returned from :py:func:`forced_align`.
            Shape: `(time, )`. When computing the token-size score, the given score is averaged
            across the corresponding time span.

    Returns:
        list of TokenSpan

    Example:
        >>> aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths)
        >>> token_spans = merge_tokens(aligned_tokens[0], scores[0])
    """
    if tokens.ndim != 1 or scores.ndim != 1:
        raise ValueError("`tokens` and `scores` must be 1D Tensor.")
    if len(tokens) != len(scores):
        raise ValueError("`tokens` and `scores` must be the same length.")

    # Pad both ends with a sentinel (-1) so the first and last runs also
    # produce change points in the diff below.
    sentinel = torch.tensor([-1], device=tokens.device)
    deltas = torch.diff(tokens, prepend=sentinel, append=sentinel)
    change_points = torch.nonzero(deltas != 0).squeeze().tolist()
    token_values = tokens.tolist()

    # Each consecutive pair of change points delimits one run of identical
    # tokens; keep the non-blank runs with their mean score.
    spans: List[TokenSpan] = []
    for begin, finish in zip(change_points[:-1], change_points[1:]):
        run_token = token_values[begin]
        if run_token == blank:
            continue
        mean_score = scores[begin:finish].mean().item()
        spans.append(TokenSpan(token=run_token, start=begin, end=finish, score=mean_score))
    return spans
.venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py ADDED
@@ -0,0 +1,1669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import warnings
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from torch import Tensor
7
+
8
+ from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE
9
+
10
+
11
+ def _dB2Linear(x: float) -> float:
12
+ return math.exp(x * math.log(10) / 20.0)
13
+
14
+
15
def _generate_wave_table(
    wave_type: str,
    data_type: str,
    table_size: int,
    min: float,
    max: float,
    phase: float,
    device: torch.device,
) -> Tensor:
    r"""A helper function for phaser. Generates a table with given parameters.

    Args:
        wave_type (str): SINE or TRIANGLE
        data_type (str): desired data_type ( `INT` or `FLOAT` )
        table_size (int): desired table size
        min (float): desired min value
        max (float): desired max value
        phase (float): desired phase
        device (torch.device): Torch device on which table must be generated
    Returns:
        Tensor: A 1D tensor with wave table values
    """

    # Phase offset in table samples (phase is in radians; one period == table_size).
    phase_offset = int(phase / math.pi / 2 * table_size + 0.5)

    t = torch.arange(table_size, device=device, dtype=torch.int32)

    # Index into the (circular) table, rotated by the phase offset.
    point = (t + phase_offset) % table_size

    d = torch.zeros_like(point, device=device, dtype=torch.float64)

    if wave_type == "SINE":
        # Sine mapped from [-1, 1] to [0, 1].
        d = (torch.sin(point.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2
    elif wave_type == "TRIANGLE":
        d = point.to(torch.float64) * 2 / table_size
        # `value` selects which quarter of the period each sample falls in;
        # each quarter gets its own linear segment of the triangle wave.
        value = torch.div(4 * point, table_size, rounding_mode="floor")
        d[value == 0] = d[value == 0] + 0.5
        d[value == 1] = 1.5 - d[value == 1]
        d[value == 2] = 1.5 - d[value == 2]
        d[value == 3] = d[value == 3] - 1.5

    # Scale from [0, 1] into the requested [min, max] range.
    d = d * (max - min) + min

    if data_type == "INT":
        # Round half away from zero before truncating to int32.
        mask = d < 0
        d[mask] = d[mask] - 0.5
        d[~mask] = d[~mask] + 0.5
        d = d.to(torch.int32)
    elif data_type == "FLOAT":
        d = d.to(torch.float32)

    return d
67
+
68
+
69
def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design two-pole all-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform(torch.Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    neg_two_cos_omega = -2 * torch.cos(omega)

    # All-pass: the numerator mirrors the denominator (cookbook APF form).
    a0 = 1 + alpha
    a1 = neg_two_cos_omega
    a2 = 1 - alpha
    b0 = 1 - alpha
    b1 = neg_two_cos_omega
    b2 = 1 + alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
105
+
106
+
107
def band_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    noise: bool = False,
) -> Tensor:
    r"""Design two-pole band filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).
        noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion).
            If ``False``, uses mode oriented to pitched audio, i.e. voice, singing,
            or instrumental music (Default: ``False``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    bandwidth_hz = central_freq / Q

    # Denominator (pole) coefficients derived from the filter bandwidth.
    a0 = 1.0
    a2 = torch.exp(-2 * math.pi * bandwidth_hz / sample_rate)
    a1 = -4 * a2 / (1 + a2) * torch.cos(omega)

    b0 = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2)

    if noise:
        # Alternate (un-pitched) mode: rescale the pass-band gain.
        mult = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / b0
        b0 = mult * b0

    b1 = 0.0
    b2 = 0.0

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
158
+
159
+
160
def bandpass_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    const_skirt_gain: bool = False,
) -> Tensor:
    r"""Design two-pole band-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
        const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q).
            If ``False``, uses a constant 0dB peak gain. (Default: ``False``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q

    # Constant-skirt variant uses sin(w0)/2 as the numerator gain (cookbook BPF).
    if const_skirt_gain:
        gain_term = torch.sin(omega) / 2
    else:
        gain_term = alpha

    b0 = gain_term
    b1 = 0.0
    b2 = -gain_term
    a0 = 1 + alpha
    a1 = -2 * torch.cos(omega)
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
204
+
205
+
206
def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design two-pole band-reject filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    neg_two_cos_omega = -2 * torch.cos(omega)

    # Notch filter: zeros on the unit circle at the center frequency.
    b0 = 1.0
    b1 = neg_two_cos_omega
    b2 = 1.0
    a0 = 1 + alpha
    a1 = neg_two_cos_omega
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
241
+
242
+
243
def bass_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 100,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a bass tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    # Shelf amplitude: A = 10^(gain/40).
    A = torch.exp(gain / 40 * math.log(10))

    sqrt_term = 2 * torch.sqrt(A) * alpha
    cos_minus = (A - 1) * torch.cos(omega)
    cos_plus = (A + 1) * torch.cos(omega)

    # Low-shelf coefficients (audio EQ cookbook).
    b0 = A * ((A + 1) - cos_minus + sqrt_term)
    b1 = 2 * A * ((A - 1) - cos_plus)
    b2 = A * ((A + 1) - cos_minus - sqrt_term)
    a0 = (A + 1) + cos_minus + sqrt_term
    a1 = -2 * ((A - 1) + cos_plus)
    a2 = (A + 1) + cos_minus - sqrt_term

    # Normalize all coefficients by a0 before delegating to the generic biquad.
    return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0)
292
+
293
+
294
def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor:
    r"""Perform a biquad filter of input tensor. Initial conditions set to 0.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        b0 (float or torch.Tensor): numerator coefficient of current input, x[n]
        b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1]
        b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2]
        a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1
        a1 (float or torch.Tensor): denominator coefficient of current output y[n-1]
        a2 (float or torch.Tensor): denominator coefficient of current output y[n-2]

    Returns:
        Tensor: Waveform with dimension of `(..., time)`

    Reference:
        - https://en.wikipedia.org/wiki/Digital_biquad_filter
    """

    device = waveform.device
    dtype = waveform.dtype

    # Promote each scalar/tensor coefficient to a 1-element tensor on the
    # waveform's device/dtype so they can be concatenated below.
    b0 = torch.as_tensor(b0, dtype=dtype, device=device).view(1)
    b1 = torch.as_tensor(b1, dtype=dtype, device=device).view(1)
    b2 = torch.as_tensor(b2, dtype=dtype, device=device).view(1)
    a0 = torch.as_tensor(a0, dtype=dtype, device=device).view(1)
    a1 = torch.as_tensor(a1, dtype=dtype, device=device).view(1)
    a2 = torch.as_tensor(a2, dtype=dtype, device=device).view(1)

    a_coeffs = torch.cat([a0, a1, a2])
    b_coeffs = torch.cat([b0, b1, b2])
    return lfilter(waveform, a_coeffs, b_coeffs)
333
+
334
+
335
def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
    r"""Apply contrast effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Comparable with compression, this effect modifies an audio signal to make it sound louder

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        enhancement_amount (float, optional): controls the amount of the enhancement
            Allowed range of values for enhancement_amount : 0-100
            Note that enhancement_amount = 0 still gives a significant contrast enhancement

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """

    if not 0 <= enhancement_amount <= 100:
        raise ValueError("Allowed range of values for enhancement_amount : 0-100")

    # SoX scales the user-facing 0-100 range down by 750.
    scaled_amount = enhancement_amount / 750.0

    half_pi_wave = waveform * (math.pi / 2)
    modulation = scaled_amount * torch.sin(half_pi_wave * 4)
    return torch.sin(half_pi_wave + modulation)
367
+
368
+
369
+ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor:
370
+ r"""Apply a DC shift to the audio. Similar to SoX implementation.
371
+
372
+ .. devices:: CPU CUDA
373
+
374
+ .. properties:: TorchScript
375
+
376
+ This can be useful to remove a DC offset
377
+ (caused perhaps by a hardware problem in the recording chain) from the audio
378
+
379
+ Args:
380
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
381
+ shift (float): indicates the amount to shift the audio
382
+ Allowed range of values for shift : -2.0 to +2.0
383
+ limiter_gain (float of None, optional): It is used only on peaks to prevent clipping
384
+ It should have a value much less than 1 (e.g. 0.05 or 0.02)
385
+
386
+ Returns:
387
+ Tensor: Waveform of dimension of `(..., time)`
388
+
389
+ Reference:
390
+ - http://sox.sourceforge.net/sox.html
391
+ """
392
+ output_waveform = waveform
393
+ limiter_threshold = 0.0
394
+
395
+ if limiter_gain is not None:
396
+ limiter_threshold = 1.0 - (abs(shift) - limiter_gain)
397
+
398
+ # Note:
399
+ # the following index-based update breaks auto-grad support
400
+ if limiter_gain is not None and shift > 0:
401
+ mask = waveform > limiter_threshold
402
+ temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold)
403
+ output_waveform[mask] = (temp + limiter_threshold + shift).clamp(max=limiter_threshold)
404
+ output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1)
405
+ elif limiter_gain is not None and shift < 0:
406
+ mask = waveform < -limiter_threshold
407
+ temp = (waveform[mask] + limiter_threshold) * limiter_gain / (1 - limiter_threshold)
408
+ output_waveform[mask] = (temp - limiter_threshold + shift).clamp(min=-limiter_threshold)
409
+ output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1)
410
+ else:
411
+ output_waveform = (waveform + shift).clamp(min=-1, max=1)
412
+
413
+ return output_waveform
414
+
415
+
416
def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
    r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000``

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """

    # SoX hard-codes the shelf parameters for the two supported rates.
    if sample_rate == 44100:
        central_freq, width_slope, gain = 5283, 0.4845, -9.477
    elif sample_rate == 48000:
        central_freq, width_slope, gain = 5356, 0.479, -9.62
    else:
        raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)")

    omega = 2 * math.pi * central_freq / sample_rate
    A = math.exp(gain / 40.0 * math.log(10))
    alpha = math.sin(omega) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2)

    sqrt_term = 2 * math.sqrt(A) * alpha
    cos_minus = (A - 1) * math.cos(omega)
    cos_plus = (A + 1) * math.cos(omega)

    # High-shelf coefficients (audio EQ cookbook).
    b0 = A * ((A + 1) + cos_minus + sqrt_term)
    b1 = -2 * A * ((A - 1) + cos_plus)
    b2 = A * ((A + 1) + cos_minus - sqrt_term)
    a0 = (A + 1) - cos_minus + sqrt_term
    a1 = 2 * ((A - 1) - cos_plus)
    a2 = (A + 1) - cos_minus - sqrt_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
462
+
463
+
464
+ def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor:
465
+ r"""Noise shaping is calculated by error:
466
+ error[n] = dithered[n] - original[n]
467
+ noise_shaped_waveform[n] = dithered[n] + error[n-1]
468
+ """
469
+ wf_shape = waveform.size()
470
+ waveform = waveform.reshape(-1, wf_shape[-1])
471
+
472
+ dithered_shape = dithered_waveform.size()
473
+ dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1])
474
+
475
+ error = dithered_waveform - waveform
476
+
477
+ # add error[n-1] to dithered_waveform[n], so offset the error by 1 index
478
+ zeros = torch.zeros(1, dtype=error.dtype, device=error.device)
479
+ for index in range(error.size()[0]):
480
+ err = error[index]
481
+ error_offset = torch.cat((zeros, err))
482
+ error[index] = error_offset[: waveform.size()[1]]
483
+
484
+ noise_shaped = dithered_waveform + error
485
+ return noise_shaped.reshape(dithered_shape[:-1] + noise_shaped.shape[-1:])
486
+
487
+
488
+ def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor:
489
+ r"""Apply a probability distribution function on a waveform.
490
+
491
+ Triangular probability density function (TPDF) dither noise has a
492
+ triangular distribution; values in the center of the range have a higher
493
+ probability of occurring.
494
+
495
+ Rectangular probability density function (RPDF) dither noise has a
496
+ uniform distribution; any value in the specified range has the same
497
+ probability of occurring.
498
+
499
+ Gaussian probability density function (GPDF) has a normal distribution.
500
+ The relationship of probabilities of results follows a bell-shaped,
501
+ or Gaussian curve, typical of dither generated by analog sources.
502
+ Args:
503
+ waveform (Tensor): Tensor of audio of dimension (..., time)
504
+ density_function (str, optional): The density function of a
505
+ continuous random variable (Default: ``"TPDF"``)
506
+ Options: Triangular Probability Density Function - `TPDF`
507
+ Rectangular Probability Density Function - `RPDF`
508
+ Gaussian Probability Density Function - `GPDF`
509
+ Returns:
510
+ Tensor: waveform dithered with TPDF
511
+ """
512
+
513
+ # pack batch
514
+ shape = waveform.size()
515
+ waveform = waveform.reshape(-1, shape[-1])
516
+
517
+ channel_size = waveform.size()[0] - 1
518
+ time_size = waveform.size()[-1] - 1
519
+
520
+ random_channel = (
521
+ int(
522
+ torch.randint(
523
+ channel_size,
524
+ [
525
+ 1,
526
+ ],
527
+ ).item()
528
+ )
529
+ if channel_size > 0
530
+ else 0
531
+ )
532
+ random_time = (
533
+ int(
534
+ torch.randint(
535
+ time_size,
536
+ [
537
+ 1,
538
+ ],
539
+ ).item()
540
+ )
541
+ if time_size > 0
542
+ else 0
543
+ )
544
+
545
+ number_of_bits = 16
546
+ up_scaling = 2 ** (number_of_bits - 1) - 2
547
+ signal_scaled = waveform * up_scaling
548
+ down_scaling = 2 ** (number_of_bits - 1)
549
+
550
+ signal_scaled_dis = waveform
551
+ if density_function == "RPDF":
552
+ RPDF = waveform[random_channel][random_time] - 0.5
553
+
554
+ signal_scaled_dis = signal_scaled + RPDF
555
+ elif density_function == "GPDF":
556
+ # TODO Replace by distribution code once
557
+ # https://github.com/pytorch/pytorch/issues/29843 is resolved
558
+ # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample()
559
+
560
+ num_rand_variables = 6
561
+
562
+ gaussian = waveform[random_channel][random_time]
563
+ for ws in num_rand_variables * [time_size]:
564
+ rand_chan = int(
565
+ torch.randint(
566
+ channel_size,
567
+ [
568
+ 1,
569
+ ],
570
+ ).item()
571
+ )
572
+ gaussian += waveform[rand_chan][
573
+ int(
574
+ torch.randint(
575
+ ws,
576
+ [
577
+ 1,
578
+ ],
579
+ ).item()
580
+ )
581
+ ]
582
+
583
+ signal_scaled_dis = signal_scaled + gaussian
584
+ else:
585
+ # dtype needed for https://github.com/pytorch/pytorch/issues/32358
586
+ TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device)
587
+ TPDF = TPDF.repeat((channel_size + 1), 1)
588
+ signal_scaled_dis = signal_scaled + TPDF
589
+
590
+ quantised_signal_scaled = torch.round(signal_scaled_dis)
591
+ quantised_signal = quantised_signal_scaled / down_scaling
592
+
593
+ # unpack batch
594
+ return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:])
595
+
596
+
597
+ def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor:
598
+ r"""Apply dither
599
+
600
+ .. devices:: CPU CUDA
601
+
602
+ .. properties:: TorchScript
603
+
604
+ Dither increases the perceived dynamic range of audio stored at a
605
+ particular bit-depth by eliminating nonlinear truncation distortion
606
+ (i.e. adding minimally perceived noise to mask distortion caused by quantization).
607
+
608
+ Args:
609
+ waveform (Tensor): Tensor of audio of dimension (..., time)
610
+ density_function (str, optional):
611
+ The density function of a continuous random variable. One of
612
+ ``"TPDF"`` (Triangular Probability Density Function),
613
+ ``"RPDF"`` (Rectangular Probability Density Function) or
614
+ ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``).
615
+ noise_shaping (bool, optional): a filtering process that shapes the spectral
616
+ energy of quantisation error (Default: ``False``)
617
+
618
+ Returns:
619
+ Tensor: waveform dithered
620
+ """
621
+ dithered = _apply_probability_distribution(waveform, density_function=density_function)
622
+
623
+ if noise_shaping:
624
+ return _add_noise_shaping(dithered, waveform)
625
+ else:
626
+ return dithered
627
+
628
+
629
+ def equalizer_biquad(
630
+ waveform: Tensor,
631
+ sample_rate: int,
632
+ center_freq: float,
633
+ gain: float,
634
+ Q: float = 0.707,
635
+ ) -> Tensor:
636
+ r"""Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation.
637
+
638
+ .. devices:: CPU CUDA
639
+
640
+ .. properties:: Autograd TorchScript
641
+
642
+ Args:
643
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
644
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
645
+ center_freq (float): filter's central frequency
646
+ gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB
647
+ Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
648
+
649
+ Returns:
650
+ Tensor: Waveform of dimension of `(..., time)`
651
+ """
652
+ dtype = waveform.dtype
653
+ device = waveform.device
654
+ center_freq = torch.as_tensor(center_freq, dtype=dtype, device=device)
655
+ Q = torch.as_tensor(Q, dtype=dtype, device=device)
656
+ gain = torch.as_tensor(gain, dtype=dtype, device=device)
657
+
658
+ w0 = 2 * math.pi * center_freq / sample_rate
659
+ A = torch.exp(gain / 40.0 * math.log(10))
660
+ alpha = torch.sin(w0) / 2 / Q
661
+
662
+ b0 = 1 + alpha * A
663
+ b1 = -2 * torch.cos(w0)
664
+ b2 = 1 - alpha * A
665
+ a0 = 1 + alpha / A
666
+ a1 = -2 * torch.cos(w0)
667
+ a2 = 1 - alpha / A
668
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
669
+
670
+
671
+ def filtfilt(
672
+ waveform: Tensor,
673
+ a_coeffs: Tensor,
674
+ b_coeffs: Tensor,
675
+ clamp: bool = True,
676
+ ) -> Tensor:
677
+ r"""Apply an IIR filter forward and backward to a waveform.
678
+
679
+ .. devices:: CPU CUDA
680
+
681
+ .. properties:: Autograd TorchScript
682
+
683
+ Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html
684
+
685
+ Args:
686
+ waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1.
687
+ a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
688
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
689
+ Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``.
690
+ Must be same size as b_coeffs (pad with 0's as necessary).
691
+ b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
692
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
693
+ Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``.
694
+ Must be same size as a_coeffs (pad with 0's as necessary).
695
+ clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
696
+
697
+ Returns:
698
+ Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
699
+ are 2D Tensors, or `(..., time)` otherwise.
700
+ """
701
+ forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True)
702
+ backward_filtered = lfilter(
703
+ forward_filtered.flip(-1),
704
+ a_coeffs,
705
+ b_coeffs,
706
+ clamp=clamp,
707
+ batching=True,
708
+ ).flip(-1)
709
+ return backward_filtered
710
+
711
+
712
+ def flanger(
713
+ waveform: Tensor,
714
+ sample_rate: int,
715
+ delay: float = 0.0,
716
+ depth: float = 2.0,
717
+ regen: float = 0.0,
718
+ width: float = 71.0,
719
+ speed: float = 0.5,
720
+ phase: float = 25.0,
721
+ modulation: str = "sinusoidal",
722
+ interpolation: str = "linear",
723
+ ) -> Tensor:
724
+ r"""Apply a flanger effect to the audio. Similar to SoX implementation.
725
+
726
+ .. devices:: CPU CUDA
727
+
728
+ .. properties:: Autograd TorchScript
729
+
730
+ Args:
731
+ waveform (Tensor): audio waveform of dimension of `(..., channel, time)` .
732
+ Max 4 channels allowed
733
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
734
+ delay (float, optional): desired delay in milliseconds(ms)
735
+ Allowed range of values are 0 to 30
736
+ depth (float, optional): desired delay depth in milliseconds(ms)
737
+ Allowed range of values are 0 to 10
738
+ regen (float, optional): desired regen(feedback gain) in dB
739
+ Allowed range of values are -95 to 95
740
+ width (float, optional): desired width(delay gain) in dB
741
+ Allowed range of values are 0 to 100
742
+ speed (float, optional): modulation speed in Hz
743
+ Allowed range of values are 0.1 to 10
744
+ phase (float, optional): percentage phase-shift for multi-channel
745
+ Allowed range of values are 0 to 100
746
+ modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``)
747
+ interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation.
748
+ (Default: ``linear``)
749
+
750
+ Returns:
751
+ Tensor: Waveform of dimension of `(..., channel, time)`
752
+
753
+ Reference:
754
+ - http://sox.sourceforge.net/sox.html
755
+
756
+ - Scott Lehman, `Effects Explained`_,
757
+
758
+ .. _Effects Explained:
759
+ https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
760
+ """
761
+
762
+ if modulation not in ("sinusoidal", "triangular"):
763
+ raise ValueError('Only "sinusoidal" or "triangular" modulation allowed')
764
+
765
+ if interpolation not in ("linear", "quadratic"):
766
+ raise ValueError('Only "linear" or "quadratic" interpolation allowed')
767
+
768
+ actual_shape = waveform.shape
769
+ device, dtype = waveform.device, waveform.dtype
770
+
771
+ if actual_shape[-2] > 4:
772
+ raise ValueError("Max 4 channels allowed")
773
+
774
+ # convert to 3D (batch, channels, time)
775
+ waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1])
776
+
777
+ # Scaling
778
+ feedback_gain = regen / 100
779
+ delay_gain = width / 100
780
+ channel_phase = phase / 100
781
+ delay_min = delay / 1000
782
+ delay_depth = depth / 1000
783
+
784
+ n_channels = waveform.shape[-2]
785
+
786
+ if modulation == "sinusoidal":
787
+ wave_type = "SINE"
788
+ else:
789
+ wave_type = "TRIANGLE"
790
+
791
+ # Balance output:
792
+ in_gain = 1.0 / (1 + delay_gain)
793
+ delay_gain = delay_gain / (1 + delay_gain)
794
+
795
+ # Balance feedback loop:
796
+ delay_gain = delay_gain * (1 - abs(feedback_gain))
797
+
798
+ delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5)
799
+ delay_buf_length = delay_buf_length + 2
800
+
801
+ delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device)
802
+ delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device)
803
+
804
+ lfo_length = int(sample_rate / speed)
805
+
806
+ table_min = math.floor(delay_min * sample_rate + 0.5)
807
+ table_max = delay_buf_length - 2.0
808
+
809
+ lfo = _generate_wave_table(
810
+ wave_type=wave_type,
811
+ data_type="FLOAT",
812
+ table_size=lfo_length,
813
+ min=float(table_min),
814
+ max=float(table_max),
815
+ phase=3 * math.pi / 2,
816
+ device=device,
817
+ )
818
+
819
+ output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
820
+
821
+ delay_buf_pos = 0
822
+ lfo_pos = 0
823
+ channel_idxs = torch.arange(0, n_channels, device=device)
824
+
825
+ for i in range(waveform.shape[-1]):
826
+
827
+ delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length
828
+
829
+ cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64)
830
+ delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length]
831
+ frac_delay = torch.frac(delay_tensor)
832
+ delay_tensor = torch.floor(delay_tensor)
833
+
834
+ int_delay = delay_tensor.to(torch.int64)
835
+
836
+ temp = waveform[:, :, i]
837
+
838
+ delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain
839
+
840
+ delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
841
+
842
+ int_delay = int_delay + 1
843
+
844
+ delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
845
+
846
+ int_delay = int_delay + 1
847
+
848
+ if interpolation == "linear":
849
+ delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay
850
+ else:
851
+ delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
852
+
853
+ int_delay = int_delay + 1
854
+
855
+ delayed_2 = delayed_2 - delayed_0
856
+ delayed_1 = delayed_1 - delayed_0
857
+ a = delayed_2 * 0.5 - delayed_1
858
+ b = delayed_1 * 2 - delayed_2 * 0.5
859
+
860
+ delayed = delayed_0 + (a * frac_delay + b) * frac_delay
861
+
862
+ delay_last = delayed
863
+ output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain
864
+
865
+ lfo_pos = (lfo_pos + 1) % lfo_length
866
+
867
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
868
+
869
+
870
+ def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
871
+ r"""Apply amplification or attenuation to the whole waveform.
872
+
873
+ .. devices:: CPU CUDA
874
+
875
+ .. properties:: Autograd TorchScript
876
+
877
+ Args:
878
+ waveform (Tensor): Tensor of audio of dimension (..., time).
879
+ gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``).
880
+
881
+ Returns:
882
+ Tensor: the whole waveform amplified by gain_db.
883
+ """
884
+ if gain_db == 0:
885
+ return waveform
886
+
887
+ ratio = 10 ** (gain_db / 20)
888
+
889
+ return waveform * ratio
890
+
891
+
892
+ def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
893
+ r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.
894
+
895
+ .. devices:: CPU CUDA
896
+
897
+ .. properties:: Autograd TorchScript
898
+
899
+ Args:
900
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
901
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
902
+ cutoff_freq (float or torch.Tensor): filter cutoff frequency
903
+ Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
904
+
905
+ Returns:
906
+ Tensor: Waveform dimension of `(..., time)`
907
+ """
908
+ dtype = waveform.dtype
909
+ device = waveform.device
910
+ cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device)
911
+ Q = torch.as_tensor(Q, dtype=dtype, device=device)
912
+
913
+ w0 = 2 * math.pi * cutoff_freq / sample_rate
914
+ alpha = torch.sin(w0) / 2.0 / Q
915
+
916
+ b0 = (1 + torch.cos(w0)) / 2
917
+ b1 = -1 - torch.cos(w0)
918
+ b2 = b0
919
+ a0 = 1 + alpha
920
+ a1 = -2 * torch.cos(w0)
921
+ a2 = 1 - alpha
922
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
923
+
924
+
925
+ def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor):
926
+ n_order = a_coeffs_flipped.size(1)
927
+ a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2)
928
+ for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)):
929
+ windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order]
930
+ o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t()
931
+ padded_output_waveform[:, :, i_sample + n_order - 1] = o0
932
+
933
+
934
+ if _IS_TORCHAUDIO_EXT_AVAILABLE:
935
+ _lfilter_core_cpu_loop = torch.ops.torchaudio._lfilter_core_loop
936
+ else:
937
+ _lfilter_core_cpu_loop = _lfilter_core_generic_loop
938
+
939
+
940
+ def _lfilter_core(
941
+ waveform: Tensor,
942
+ a_coeffs: Tensor,
943
+ b_coeffs: Tensor,
944
+ ) -> Tensor:
945
+
946
+ if a_coeffs.size() != b_coeffs.size():
947
+ raise ValueError(
948
+ "Expected coeffs to be the same size."
949
+ f"Found a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
950
+ )
951
+ if waveform.ndim != 3:
952
+ raise ValueError(f"Expected waveform to be 3 dimensional. Found: {waveform.ndim}")
953
+ if not (waveform.device == a_coeffs.device == b_coeffs.device):
954
+ raise ValueError(
955
+ "Expected waveform and coeffs to be on the same device."
956
+ f"Found: waveform device:{waveform.device}, a_coeffs device: {a_coeffs.device}, "
957
+ f"b_coeffs device: {b_coeffs.device}"
958
+ )
959
+
960
+ n_batch, n_channel, n_sample = waveform.size()
961
+ n_order = a_coeffs.size(1)
962
+ if n_order <= 0:
963
+ raise ValueError(f"Expected n_order to be positive. Found: {n_order}")
964
+
965
+ # Pad the input and create output
966
+
967
+ padded_waveform = torch.nn.functional.pad(waveform, [n_order - 1, 0])
968
+ padded_output_waveform = torch.zeros_like(padded_waveform)
969
+
970
+ # Set up the coefficients matrix
971
+ # Flip coefficients' order
972
+ a_coeffs_flipped = a_coeffs.flip(1)
973
+ b_coeffs_flipped = b_coeffs.flip(1)
974
+
975
+ # calculate windowed_input_signal in parallel using convolution
976
+ input_signal_windows = torch.nn.functional.conv1d(padded_waveform, b_coeffs_flipped.unsqueeze(1), groups=n_channel)
977
+
978
+ input_signal_windows.div_(a_coeffs[:, :1])
979
+ a_coeffs_flipped.div_(a_coeffs[:, :1])
980
+
981
+ if (
982
+ input_signal_windows.device == torch.device("cpu")
983
+ and a_coeffs_flipped.device == torch.device("cpu")
984
+ and padded_output_waveform.device == torch.device("cpu")
985
+ ):
986
+ _lfilter_core_cpu_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform)
987
+ else:
988
+ _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform)
989
+
990
+ output = padded_output_waveform[:, :, n_order - 1 :]
991
+ return output
992
+
993
+
994
+ if _IS_TORCHAUDIO_EXT_AVAILABLE:
995
+ _lfilter = torch.ops.torchaudio._lfilter
996
+ else:
997
+ _lfilter = _lfilter_core
998
+
999
+
1000
+ def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor:
1001
+ r"""Perform an IIR filter by evaluating difference equation, using differentiable implementation
1002
+ developed independently by *Yu et al.* :cite:`ismir_YuF23` and *Forgione et al.* :cite:`forgione2021dynonet`.
1003
+
1004
+ .. devices:: CPU CUDA
1005
+
1006
+ .. properties:: Autograd TorchScript
1007
+
1008
+ Note:
1009
+ To avoid numerical problems, small filter order is preferred.
1010
+ Using double precision could also minimize numerical precision errors.
1011
+
1012
+ Args:
1013
+ waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1.
1014
+ a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
1015
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
1016
+ Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``.
1017
+ Must be same size as b_coeffs (pad with 0's as necessary).
1018
+ b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
1019
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
1020
+ Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``.
1021
+ Must be same size as a_coeffs (pad with 0's as necessary).
1022
+ clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
1023
+ batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at
1024
+ least 2D, and the size of second axis from last should equals to ``num_filters``.
1025
+ The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :],
1026
+ a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``)
1027
+
1028
+ Returns:
1029
+ Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
1030
+ are 2D Tensors, or `(..., time)` otherwise.
1031
+ """
1032
+ if a_coeffs.size() != b_coeffs.size():
1033
+ raise ValueError(
1034
+ "Expected coeffs to be the same size."
1035
+ f"Found: a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
1036
+ )
1037
+ if a_coeffs.ndim > 2:
1038
+ raise ValueError(f"Expected coeffs to have greater than 1 dimension. Found: {a_coeffs.ndim}")
1039
+
1040
+ if a_coeffs.ndim > 1:
1041
+ if batching:
1042
+ if waveform.ndim <= 0:
1043
+ raise ValueError("Expected waveform to have a positive number of dimensions." f"Found: {waveform.ndim}")
1044
+ if waveform.shape[-2] != a_coeffs.shape[0]:
1045
+ raise ValueError(
1046
+ "Expected number of batches in waveform and coeffs to be the same."
1047
+ f"Found: coeffs batches: {a_coeffs.shape[0]}, waveform batches: {waveform.shape[-2]}"
1048
+ )
1049
+ else:
1050
+ waveform = torch.stack([waveform] * a_coeffs.shape[0], -2)
1051
+ else:
1052
+ a_coeffs = a_coeffs.unsqueeze(0)
1053
+ b_coeffs = b_coeffs.unsqueeze(0)
1054
+
1055
+ # pack batch
1056
+ shape = waveform.size()
1057
+ waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1])
1058
+ output = _lfilter(waveform, a_coeffs, b_coeffs)
1059
+
1060
+ if clamp:
1061
+ output = torch.clamp(output, min=-1.0, max=1.0)
1062
+
1063
+ # unpack batch
1064
+ output = output.reshape(shape[:-1] + output.shape[-1:])
1065
+
1066
+ return output
1067
+
1068
+
1069
+ def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
1070
+ r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
1071
+
1072
+ .. devices:: CPU CUDA
1073
+
1074
+ .. properties:: Autograd TorchScript
1075
+
1076
+ Args:
1077
+ waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
1078
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
1079
+ cutoff_freq (float or torch.Tensor): filter cutoff frequency
1080
+ Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
1081
+
1082
+ Returns:
1083
+ Tensor: Waveform of dimension of `(..., time)`
1084
+ """
1085
+ dtype = waveform.dtype
1086
+ device = waveform.device
1087
+ cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device)
1088
+ Q = torch.as_tensor(Q, dtype=dtype, device=device)
1089
+
1090
+ w0 = 2 * math.pi * cutoff_freq / sample_rate
1091
+ alpha = torch.sin(w0) / 2 / Q
1092
+
1093
+ b0 = (1 - torch.cos(w0)) / 2
1094
+ b1 = 1 - torch.cos(w0)
1095
+ b2 = b0
1096
+ a0 = 1 + alpha
1097
+ a1 = -2 * torch.cos(w0)
1098
+ a2 = 1 - alpha
1099
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
1100
+
1101
+
1102
+ def _overdrive_core_loop_generic(
1103
+ waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor
1104
+ ):
1105
+ for i in range(waveform.shape[-1]):
1106
+ last_out = temp[:, i] - last_in + 0.995 * last_out
1107
+ last_in = temp[:, i]
1108
+ output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
1109
+
1110
+
1111
+ if _IS_TORCHAUDIO_EXT_AVAILABLE:
1112
+ _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
1113
+ else:
1114
+ _overdrive_core_loop_cpu = _overdrive_core_loop_generic
1115
+
1116
+
1117
+ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
1118
+ r"""Apply a overdrive effect to the audio. Similar to SoX implementation.
1119
+
1120
+ .. devices:: CPU CUDA
1121
+
1122
+ .. properties:: Autograd TorchScript
1123
+
1124
+ This effect applies a non linear distortion to the audio signal.
1125
+
1126
+ Args:
1127
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
1128
+ gain (float, optional): desired gain at the boost (or attenuation) in dB
1129
+ Allowed range of values are 0 to 100
1130
+ colour (float, optional): controls the amount of even harmonic content in the over-driven output
1131
+ Allowed range of values are 0 to 100
1132
+
1133
+ Returns:
1134
+ Tensor: Waveform of dimension of `(..., time)`
1135
+
1136
+ Reference:
1137
+ - http://sox.sourceforge.net/sox.html
1138
+ """
1139
+ actual_shape = waveform.shape
1140
+ device, dtype = waveform.device, waveform.dtype
1141
+
1142
+ # convert to 2D (..,time)
1143
+ waveform = waveform.view(-1, actual_shape[-1])
1144
+
1145
+ gain = _dB2Linear(gain)
1146
+ colour = colour / 200
1147
+ last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)
1148
+ last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)
1149
+
1150
+ temp = waveform * gain + colour
1151
+
1152
+ mask1 = temp < -1
1153
+ temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device)
1154
+ # Wrapping the constant with Tensor is required for Torchscript
1155
+
1156
+ mask2 = temp > 1
1157
+ temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device)
1158
+
1159
+ mask3 = ~mask1 & ~mask2
1160
+ temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3)
1161
+
1162
+ output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
1163
+
1164
+ # Uses CPU optimized loop function if available for CPU device
1165
+ if device == torch.device("cpu"):
1166
+ _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
1167
+ else:
1168
+ _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform)
1169
+
1170
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
1171
+
1172
+
1173
+ def phaser(
1174
+ waveform: Tensor,
1175
+ sample_rate: int,
1176
+ gain_in: float = 0.4,
1177
+ gain_out: float = 0.74,
1178
+ delay_ms: float = 3.0,
1179
+ decay: float = 0.4,
1180
+ mod_speed: float = 0.5,
1181
+ sinusoidal: bool = True,
1182
+ ) -> Tensor:
1183
+ r"""Apply a phasing effect to the audio. Similar to SoX implementation.
1184
+
1185
+ .. devices:: CPU CUDA
1186
+
1187
+ .. properties:: Autograd TorchScript
1188
+
1189
+ Args:
1190
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
1191
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
1192
+ gain_in (float, optional): desired input gain at the boost (or attenuation) in dB
1193
+ Allowed range of values are 0 to 1
1194
+ gain_out (float, optional): desired output gain at the boost (or attenuation) in dB
1195
+ Allowed range of values are 0 to 1e9
1196
+ delay_ms (float, optional): desired delay in milliseconds
1197
+ Allowed range of values are 0 to 5.0
1198
+ decay (float, optional): desired decay relative to gain-in
1199
+ Allowed range of values are 0 to 0.99
1200
+ mod_speed (float, optional): modulation speed in Hz
1201
+ Allowed range of values are 0.1 to 2
1202
+ sinusoidal (bool, optional): If ``True``, uses sinusoidal modulation (preferable for multiple instruments)
1203
+ If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect)
1204
+ (Default: ``True``)
1205
+
1206
+ Returns:
1207
+ Tensor: Waveform of dimension of `(..., time)`
1208
+
1209
+ Reference:
1210
+ - http://sox.sourceforge.net/sox.html
1211
+ - Scott Lehman, `Effects Explained`_.
1212
+
1213
+ .. _Effects Explained:
1214
+ https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
1215
+ """
1216
+ actual_shape = waveform.shape
1217
+ device, dtype = waveform.device, waveform.dtype
1218
+
1219
+ # convert to 2D (channels,time)
1220
+ waveform = waveform.view(-1, actual_shape[-1])
1221
+
1222
+ delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5)
1223
+ delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device)
1224
+
1225
+ mod_buf_len = int(sample_rate / mod_speed + 0.5)
1226
+
1227
+ if sinusoidal:
1228
+ wave_type = "SINE"
1229
+ else:
1230
+ wave_type = "TRIANGLE"
1231
+
1232
+ mod_buf = _generate_wave_table(
1233
+ wave_type=wave_type,
1234
+ data_type="INT",
1235
+ table_size=mod_buf_len,
1236
+ min=1.0,
1237
+ max=float(delay_buf_len),
1238
+ phase=math.pi / 2,
1239
+ device=device,
1240
+ )
1241
+
1242
+ delay_pos = 0
1243
+ mod_pos = 0
1244
+
1245
+ output_waveform_pre_gain_list = []
1246
+ waveform = waveform * gain_in
1247
+ delay_buf = delay_buf * decay
1248
+ waveform_list = [waveform[:, i] for i in range(waveform.size(1))]
1249
+ delay_buf_list = [delay_buf[:, i] for i in range(delay_buf.size(1))]
1250
+ mod_buf_list = [mod_buf[i] for i in range(mod_buf.size(0))]
1251
+
1252
+ for i in range(waveform.shape[-1]):
1253
+ idx = int((delay_pos + mod_buf_list[mod_pos]) % delay_buf_len)
1254
+ mod_pos = (mod_pos + 1) % mod_buf_len
1255
+ delay_pos = (delay_pos + 1) % delay_buf_len
1256
+ temp = (waveform_list[i]) + (delay_buf_list[idx])
1257
+ delay_buf_list[delay_pos] = temp * decay
1258
+ output_waveform_pre_gain_list.append(temp)
1259
+
1260
+ output_waveform = torch.stack(output_waveform_pre_gain_list, dim=1).to(dtype=dtype, device=device)
1261
+ output_waveform.mul_(gain_out)
1262
+
1263
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
1264
+
1265
+
1266
+ def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
1267
+ r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation.
1268
+
1269
+ .. devices:: CPU CUDA
1270
+
1271
+ .. properties:: Autograd TorchScript
1272
+
1273
+ Args:
1274
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
1275
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz).
1276
+ Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000``
1277
+
1278
+ Returns:
1279
+ Tensor: Waveform of dimension of `(..., time)`
1280
+
1281
+ Reference:
1282
+ - http://sox.sourceforge.net/sox.html
1283
+ - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
1284
+ """
1285
+
1286
+ if sample_rate == 44100:
1287
+ zeros = [-0.2014898, 0.9233820]
1288
+ poles = [0.7083149, 0.9924091]
1289
+
1290
+ elif sample_rate == 48000:
1291
+ zeros = [-0.1766069, 0.9321590]
1292
+ poles = [0.7396325, 0.9931330]
1293
+
1294
+ elif sample_rate == 88200:
1295
+ zeros = [-0.1168735, 0.9648312]
1296
+ poles = [0.8590646, 0.9964002]
1297
+
1298
+ elif sample_rate == 96000:
1299
+ zeros = [-0.1141486, 0.9676817]
1300
+ poles = [0.8699137, 0.9966946]
1301
+
1302
+ else:
1303
+ raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k")
1304
+
1305
+ # polynomial coefficients with roots zeros[0] and zeros[1]
1306
+ b0 = 1.0
1307
+ b1 = -(zeros[0] + zeros[1])
1308
+ b2 = zeros[0] * zeros[1]
1309
+
1310
+ # polynomial coefficients with roots poles[0] and poles[1]
1311
+ a0 = 1.0
1312
+ a1 = -(poles[0] + poles[1])
1313
+ a2 = poles[0] * poles[1]
1314
+
1315
+ # Normalize to 0dB at 1kHz
1316
+ y = 2 * math.pi * 1000 / sample_rate
1317
+ b_re = b0 + b1 * math.cos(-y) + b2 * math.cos(-2 * y)
1318
+ a_re = a0 + a1 * math.cos(-y) + a2 * math.cos(-2 * y)
1319
+ b_im = b1 * math.sin(-y) + b2 * math.sin(-2 * y)
1320
+ a_im = a1 * math.sin(-y) + a2 * math.sin(-2 * y)
1321
+ g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2))
1322
+
1323
+ b0 *= g
1324
+ b1 *= g
1325
+ b2 *= g
1326
+
1327
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
1328
+
1329
+
1330
def treble_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 3000,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a treble tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    # Promote the scalar parameters to tensors on the input's device/dtype so
    # the coefficient arithmetic below stays tensor-valued (autograd-friendly).
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    # Normalized angular frequency and shelf amplitude (A = 10 ** (gain / 40)).
    w0 = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(w0) / 2 / Q
    A = torch.exp(gain / 40 * math.log(10))

    # Shared sub-expressions of the cookbook high-shelf formulas.
    cos_w0 = torch.cos(w0)
    sqrt_term = 2 * torch.sqrt(A) * alpha
    shelf_minus = (A - 1) * cos_w0
    shelf_plus = (A + 1) * cos_w0

    # Biquad numerator (b*) and denominator (a*) coefficients.
    b0 = A * ((A + 1) + shelf_minus + sqrt_term)
    b1 = -2 * A * ((A - 1) + shelf_plus)
    b2 = A * ((A + 1) + shelf_minus - sqrt_term)
    a0 = (A + 1) - shelf_minus + sqrt_term
    a1 = 2 * ((A - 1) - shelf_plus)
    a2 = (A + 1) - shelf_minus - sqrt_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
1379
+
1380
+
1381
def _measure(
    measure_len_ws: int,
    samples: Tensor,
    spectrum: Tensor,
    noise_spectrum: Tensor,
    spectrum_window: Tensor,
    spectrum_start: int,
    spectrum_end: int,
    cepstrum_window: Tensor,
    cepstrum_start: int,
    cepstrum_end: int,
    noise_reduction_amount: float,
    measure_smooth_time_mult: float,
    noise_up_time_mult: Tensor,
    noise_down_time_mult: Tensor,
    boot_count: int,
) -> float:
    """Compute one cepstral power measurement for a single measurement window.

    Port of the measurement step of SoX's ``vad`` effect. NOTE: this function
    mutates ``spectrum`` and ``noise_spectrum`` in place (running smoothed
    estimates carried across successive calls).

    Args:
        measure_len_ws (int): number of samples in the measurement window.
        samples (Tensor): 1D window of audio samples of length ``measure_len_ws``.
        spectrum (Tensor): running smoothed magnitude spectrum; updated in place.
        noise_spectrum (Tensor): running noise-power estimate; updated in place.
        spectrum_window (Tensor): analysis window applied to ``samples``.
        spectrum_start (int): first spectrum bin used (high-pass bound).
        spectrum_end (int): one past the last spectrum bin used (low-pass bound).
        cepstrum_window (Tensor): window applied before the second transform.
        cepstrum_start (int): first cepstrum bin summed.
        cepstrum_end (int): one past the last cepstrum bin summed.
        noise_reduction_amount (float): multiple of the noise estimate subtracted
            before the cepstral transform.
        measure_smooth_time_mult (float): smoothing coefficient used once booted.
        noise_up_time_mult (Tensor): smoothing coefficient when noise is rising.
        noise_down_time_mult (Tensor): smoothing coefficient when noise is falling.
        boot_count (int): >= 0 while still in the boot (initial noise estimation)
            phase; -1 afterwards.

    Returns:
        float: non-negative measurement value (0 when no cepstral power).
    """
    device = samples.device

    if spectrum.size(-1) != noise_spectrum.size(-1):
        raise ValueError(
            "Expected spectrum size to match noise spectrum size in final dimension."
            f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}"
        )

    dft_len_ws = spectrum.size()[-1]

    # Zero-padded buffer of DFT length; only the first measure_len_ws entries
    # carry the windowed samples.
    dftBuf = torch.zeros(dft_len_ws, device=device)

    dftBuf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws]

    # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf);
    _dftBuf = torch.fft.rfft(dftBuf)

    # During boot, weight history progressively (count/(count+1)); afterwards
    # use the fixed smoothing constant.
    mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult

    # Exponentially smooth the magnitude spectrum in place, then square it.
    _d = _dftBuf[spectrum_start:spectrum_end].abs()
    spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult))
    _d = spectrum[spectrum_start:spectrum_end] ** 2

    # Noise-estimate smoothing: zero multiplier during boot (estimate tracks
    # the power directly); otherwise pick per-bin up/down time constants.
    _zeros = torch.zeros(spectrum_end - spectrum_start, device=device)
    _mult = (
        _zeros
        if boot_count >= 0
        else torch.where(
            _d > noise_spectrum[spectrum_start:spectrum_end],
            noise_up_time_mult,  # if
            noise_down_time_mult,  # else,
        )
    )

    noise_spectrum[spectrum_start:spectrum_end].mul_(_mult).add_(_d * (1 - _mult))
    # Spectral subtraction, clamped at zero before the square root.
    _d = torch.sqrt(
        torch.max(
            _zeros,
            _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end],
        ),
    )

    # Second (half-length) transform of the liftered spectrum -> cepstrum.
    _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device)
    _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window
    _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_()

    # lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf);
    _cepstrum_Buf = torch.fft.rfft(_cepstrum_Buf)

    # Mean log cepstral power over the lifter band, offset by 21 and floored
    # at 0 (the constant comes from the SoX implementation).
    result: float = float(torch.sum(_cepstrum_Buf[cepstrum_start:cepstrum_end].abs().pow(2)))
    result = math.log(result / (cepstrum_end - cepstrum_start)) if result > 0 else -math.inf
    return max(0, 21 + result)
1450
+
1451
+
1452
def vad(
    waveform: Tensor,
    sample_rate: int,
    trigger_level: float = 7.0,
    trigger_time: float = 0.25,
    search_time: float = 1.0,
    allowed_gap: float = 0.25,
    pre_trigger_time: float = 0.0,
    # Fine-tuning parameters
    boot_time: float = 0.35,
    noise_up_time: float = 0.1,
    noise_down_time: float = 0.01,
    noise_reduction_amount: float = 1.35,
    measure_freq: float = 20.0,
    measure_duration: Optional[float] = None,
    measure_smooth_time: float = 0.4,
    hp_filter_freq: float = 50.0,
    lp_filter_freq: float = 6000.0,
    hp_lifter_freq: float = 150.0,
    lp_lifter_freq: float = 2000.0,
) -> Tensor:
    r"""Voice Activity Detector. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
    The algorithm currently uses a simple cepstral power measurement to detect voice,
    so may be fooled by other things, especially music.

    The effect can trim only from the front of the audio,
    so in order to trim from the back, the reverse effect must also be used.

    Args:
        waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`
            Tensor of shape `(channels, time)` is treated as a multi-channel recording
            of the same event and the resulting output will be trimmed to the earliest
            voice activity in any channel.
        sample_rate (int): Sample rate of audio signal.
        trigger_level (float, optional): The measurement level used to trigger activity detection.
            This may need to be changed depending on the noise level, signal level,
            and other characteristics of the input audio. (Default: 7.0)
        trigger_time (float, optional): The time constant (in seconds)
            used to help ignore short bursts of sound. (Default: 0.25)
        search_time (float, optional): The amount of audio (in seconds)
            to search for quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 1.0)
        allowed_gap (float, optional): The allowed gap (in seconds) between
            quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 0.25)
        pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
            before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
        boot_time (float, optional): The algorithm (internally) uses adaptive noise
            estimation/reduction in order to detect the start of the wanted audio.
            This option sets the time for the initial noise estimate. (Default: 0.35)
        noise_up_time (float, optional): Time constant used by the adaptive noise estimator
            for when the noise level is increasing. (Default: 0.1)
        noise_down_time (float, optional): Time constant used by the adaptive noise estimator
            for when the noise level is decreasing. (Default: 0.01)
        noise_reduction_amount (float, optional): Amount of noise reduction to use in
            the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
        measure_freq (float, optional): Frequency of the algorithm's
            processing/measurements. (Default: 20.0)
        measure_duration (float, optional): Measurement duration.
            (Default: Twice the measurement period; i.e. with overlap.)
        measure_smooth_time (float, optional): Time constant used to smooth
            spectral measurements. (Default: 0.4)
        hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied
            at the input to the detector algorithm. (Default: 50.0)
        lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied
            at the input to the detector algorithm. (Default: 6000.0)
        hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used
            in the detector algorithm. (Default: 150.0)
        lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used
            in the detector algorithm. (Default: 2000.0)

    Returns:
        Tensor: Tensor of audio of dimension `(..., time)`.

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    device = waveform.device

    if waveform.ndim > 2:
        warnings.warn(
            "Expected input tensor dimension of 1 for single channel"
            f" or 2 for multi-channel. Got {waveform.ndim} instead. "
            "Batch semantics is not supported. "
            "Please refer to https://github.com/pytorch/audio/issues/1348"
            " and https://github.com/pytorch/audio/issues/1468."
        )

    # Default measurement duration is twice the measurement period (50% overlap).
    measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration

    # Suffixes mirror the SoX source: _ws = window samples, _ns = number of samples.
    measure_len_ws = int(sample_rate * measure_duration + 0.5)
    measure_len_ns = measure_len_ws
    # Smallest power of two >= measure_len_ws, starting at 16.
    # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1);
    dft_len_ws = 16
    while dft_len_ws < measure_len_ws:
        dft_len_ws *= 2

    measure_period_ns = int(sample_rate / measure_freq + 0.5)
    measures_len = math.ceil(search_time * measure_freq)
    search_pre_trigger_len_ns = measures_len * measure_period_ns
    gap_len = int(allowed_gap * measure_freq + 0.5)

    fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5)
    samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns

    # Constant-amplitude window scaled by 2/sqrt(N), then shaped by a Hann window.
    spectrum_window = torch.zeros(measure_len_ws, device=device)
    for i in range(measure_len_ws):
        # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32)
        spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws))
    # lsx_apply_hann(spectrum_window, (int)measure_len_ws);
    spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float)

    # Spectrum bin range implementing the brick-wall HP/LP input filters.
    spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5)
    spectrum_start: int = max(spectrum_start, 1)
    spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5)
    spectrum_end: int = min(spectrum_end, dft_len_ws // 2)

    # Cepstrum window built the same way over the retained spectrum band.
    cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device)
    for i in range(spectrum_end - spectrum_start):
        cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start)
    # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start));
    cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float)

    # Cepstrum bin range implementing the brick-wall HP/LP lifters.
    cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq)
    cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq)
    cepstrum_end = min(cepstrum_end, dft_len_ws // 4)

    if cepstrum_end <= cepstrum_start:
        raise ValueError(
            "Expected cepstrum_start to be smaller than cepstrum_end."
            f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}."
        )

    # Exponential smoothing coefficients derived from the time constants.
    noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device)
    noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device)
    measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq))
    trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq))

    boot_count_max = int(boot_time * measure_freq - 0.5)
    boot_count = measures_index = flushedLen_ns = 0

    # pack batch
    shape = waveform.size()
    waveform = waveform.view(-1, shape[-1])

    n_channels, ilen = waveform.size()

    # Per-channel running state: smoothed measurement, smoothed spectrum,
    # noise estimate, and a ring buffer of the last measures_len measurements.
    mean_meas = torch.zeros(n_channels, device=device)
    spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
    noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
    measures = torch.zeros(n_channels, measures_len, device=device)

    has_triggered: bool = False
    num_measures_to_flush: int = 0

    # Slide a measurement window across the recording; stop at the first trigger.
    pos = 0
    for pos in range(measure_len_ns, ilen, measure_period_ns):
        for i in range(n_channels):
            meas: float = _measure(
                measure_len_ws=measure_len_ws,
                samples=waveform[i, pos - measure_len_ws : pos],
                spectrum=spectrum[i],
                noise_spectrum=noise_spectrum[i],
                spectrum_window=spectrum_window,
                spectrum_start=spectrum_start,
                spectrum_end=spectrum_end,
                cepstrum_window=cepstrum_window,
                cepstrum_start=cepstrum_start,
                cepstrum_end=cepstrum_end,
                noise_reduction_amount=noise_reduction_amount,
                measure_smooth_time_mult=measure_smooth_time_mult,
                noise_up_time_mult=noise_up_time_mult,
                noise_down_time_mult=noise_down_time_mult,
                boot_count=boot_count,
            )
            measures[i, measures_index] = meas
            # Exponentially smoothed measurement used for the trigger decision.
            mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult)

            has_triggered = has_triggered or (mean_meas[i] >= trigger_level)
            if has_triggered:
                # Walk the ring buffer backwards from the newest entry to find
                # how many past measurements (within allowed gaps) to keep.
                n: int = measures_len
                k: int = measures_index
                jTrigger: int = n
                jZero: int = n
                j: int = 0

                for j in range(n):
                    if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len):
                        jZero = jTrigger = j
                    elif (measures[i, k] == 0) and (jTrigger >= jZero):
                        jZero = j
                    k = (k + n - 1) % n
                j = min(j, jZero)
                # num_measures_to_flush = range_limit(j, num_measures_to_flush, n);
                num_measures_to_flush = min(max(num_measures_to_flush, j), n)
            # end if has_triggered
        # end for channel
        measures_index += 1
        measures_index = measures_index % measures_len
        # boot_count counts up to boot_count_max, then stays at -1 (booted).
        if boot_count >= 0:
            boot_count = -1 if boot_count == boot_count_max else boot_count + 1

        if has_triggered:
            flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns
            break
    # end for window
    # No voice activity found anywhere: return an empty (zero-length) result.
    if not has_triggered:
        return waveform[..., :0].view(shape[:-1] + torch.Size([0]))

    # Keep audio from the (adjusted) trigger point onward.
    res = waveform[:, pos - samplesLen_ns + flushedLen_ns :]
    # unpack batch
    return res.view(shape[:-1] + res.shape[-1:])