koichi12 commited on
Commit
c060ea1
·
verified ·
1 Parent(s): 920167e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py +30 -0
  14. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py +28 -0
  15. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py +12 -0
  16. .venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py +14 -0
  17. .venv/lib/python3.11/site-packages/torchaudio/__init__.py +53 -0
  18. .venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py +61 -0
  22. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py +53 -0
  31. .venv/lib/python3.11/site-packages/torchaudio/_backend/common.py +52 -0
  32. .venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py +334 -0
  33. .venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py +54 -0
  34. .venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py +457 -0
  35. .venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py +91 -0
  36. .venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py +317 -0
  37. .venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py +8 -0
  38. .venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py +25 -0
  41. .venv/lib/python3.11/site-packages/torchaudio/backend/common.py +13 -0
  42. .venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py +14 -0
  43. .venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py +14 -0
  44. .venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py +127 -0
  45. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc +3 -0
  49. .venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py +128 -0
  50. .venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py +1669 -0
.gitattributes CHANGED
@@ -295,3 +295,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
295
  .venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text
296
  .venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
297
  .venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
295
  .venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text
296
  .venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
297
  .venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
298
+ .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (279 Bytes). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc ADDED
Binary file (2.18 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc ADDED
Binary file (2.15 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc ADDED
Binary file (1.02 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc ADDED
Binary file (719 Bytes). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc ADDED
Binary file (2.74 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc ADDED
Binary file (2.6 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc ADDED
Binary file (1.79 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc ADDED
Binary file (4.04 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc ADDED
Binary file (629 Bytes). View file
 
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A benchmark for comparing equivalent validation of `const` and `enum`.
3
+ """
4
+
5
+ from pyperf import Runner
6
+
7
+ from jsonschema import Draft202012Validator
8
+
9
+ value = [37] * 100
10
+ const_schema = {"const": list(value)}
11
+ enum_schema = {"enum": [list(value)]}
12
+
13
+ valid = list(value)
14
+ invalid = [*valid, 73]
15
+
16
+ const = Draft202012Validator(const_schema)
17
+ enum = Draft202012Validator(enum_schema)
18
+
19
+ assert const.is_valid(valid)
20
+ assert enum.is_valid(valid)
21
+ assert not const.is_valid(invalid)
22
+ assert not enum.is_valid(invalid)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ runner = Runner()
27
+ runner.bench_func("const valid", lambda: const.is_valid(valid))
28
+ runner.bench_func("const invalid", lambda: const.is_valid(invalid))
29
+ runner.bench_func("enum valid", lambda: enum.is_valid(valid))
30
+ runner.bench_func("enum invalid", lambda: enum.is_valid(invalid))
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A benchmark for validation of the `contains` keyword.
3
+ """
4
+
5
+ from pyperf import Runner
6
+
7
+ from jsonschema import Draft202012Validator
8
+
9
+ schema = {
10
+ "type": "array",
11
+ "contains": {"const": 37},
12
+ }
13
+ validator = Draft202012Validator(schema)
14
+
15
+ size = 1000
16
+ beginning = [37] + [0] * (size - 1)
17
+ middle = [0] * (size // 2) + [37] + [0] * (size // 2)
18
+ end = [0] * (size - 1) + [37]
19
+ invalid = [0] * size
20
+
21
+
22
+ if __name__ == "__main__":
23
+ runner = Runner()
24
+ runner.bench_func("baseline", lambda: validator.is_valid([]))
25
+ runner.bench_func("beginning", lambda: validator.is_valid(beginning))
26
+ runner.bench_func("middle", lambda: validator.is_valid(middle))
27
+ runner.bench_func("end", lambda: validator.is_valid(end))
28
+ runner.bench_func("invalid", lambda: validator.is_valid(invalid))
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A performance benchmark using the official test suite.
3
+
4
+ This benchmarks jsonschema using every valid example in the
5
+ JSON-Schema-Test-Suite. It will take some time to complete.
6
+ """
7
+ from pyperf import Runner
8
+
9
+ from jsonschema.tests._suite import Suite
10
+
11
+ if __name__ == "__main__":
12
+ Suite().benchmark(runner=Runner())
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pyperf import Runner
2
+
3
+ from jsonschema import Draft202012Validator
4
+
5
+ schema = {
6
+ "type": "array",
7
+ "minLength": 1,
8
+ "maxLength": 1,
9
+ "items": {"type": "integer"},
10
+ }
11
+
12
+
13
+ if __name__ == "__main__":
14
+ Runner().bench_func("validator creation", Draft202012Validator, schema)
.venv/lib/python3.11/site-packages/torchaudio/__init__.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Initialize extension and backend first
2
+ from . import _extension # noqa # usort: skip
3
+ from ._backend import ( # noqa # usort: skip
4
+ AudioMetaData,
5
+ get_audio_backend,
6
+ info,
7
+ list_audio_backends,
8
+ load,
9
+ save,
10
+ set_audio_backend,
11
+ )
12
+
13
+ from . import ( # noqa: F401
14
+ compliance,
15
+ datasets,
16
+ functional,
17
+ io,
18
+ kaldi_io,
19
+ models,
20
+ pipelines,
21
+ sox_effects,
22
+ transforms,
23
+ utils,
24
+ )
25
+
26
+ # For BC
27
+ from . import backend # noqa # usort: skip
28
+
29
+ try:
30
+ from .version import __version__, git_version # noqa: F401
31
+ except ImportError:
32
+ pass
33
+
34
+
35
+ __all__ = [
36
+ "AudioMetaData",
37
+ "load",
38
+ "info",
39
+ "save",
40
+ "io",
41
+ "compliance",
42
+ "datasets",
43
+ "functional",
44
+ "models",
45
+ "pipelines",
46
+ "kaldi_io",
47
+ "utils",
48
+ "sox_effects",
49
+ "transforms",
50
+ "list_audio_backends",
51
+ "get_audio_backend",
52
+ "set_audio_backend",
53
+ ]
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.17 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc ADDED
Binary file (5.84 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc ADDED
Binary file (272 Bytes). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
+ from torchaudio._internal.module_utils import deprecated
4
+
5
+ from . import utils
6
+ from .common import AudioMetaData
7
+
8
+ __all__ = [
9
+ "AudioMetaData",
10
+ "load",
11
+ "info",
12
+ "save",
13
+ "list_audio_backends",
14
+ "get_audio_backend",
15
+ "set_audio_backend",
16
+ ]
17
+
18
+
19
+ info = utils.get_info_func()
20
+ load = utils.get_load_func()
21
+ save = utils.get_save_func()
22
+
23
+
24
+ def list_audio_backends() -> List[str]:
25
+ """List available backends
26
+
27
+ Returns:
28
+ list of str: The list of available backends.
29
+
30
+ The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``.
31
+ """
32
+
33
+ return list(utils.get_available_backends().keys())
34
+
35
+
36
+ # Temporary until global backend is removed
37
+ @deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
38
+ def get_audio_backend() -> Optional[str]:
39
+ """Get the name of the current global backend
40
+
41
+ Returns:
42
+ str or None:
43
+ If dispatcher mode is enabled, returns ``None`` otherwise,
44
+ the name of current backend or ``None`` (no backend is set).
45
+ """
46
+ return None
47
+
48
+
49
+ # Temporary until global backend is removed
50
+ @deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
51
+ def set_audio_backend(backend: Optional[str]): # noqa
52
+ """Set the global backend.
53
+
54
+ This is a no-op when dispatcher mode is enabled.
55
+
56
+ Args:
57
+ backend (str or None): Name of the backend.
58
+ One of ``"sox_io"`` or ``"soundfile"`` based on availability
59
+ of the system. If ``None`` is provided the current backend is unassigned.
60
+ """
61
+ pass
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.33 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc ADDED
Binary file (3.08 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc ADDED
Binary file (2.35 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc ADDED
Binary file (3.21 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc ADDED
Binary file (4.97 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc ADDED
Binary file (16.5 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from typing import BinaryIO, Optional, Tuple, Union
4
+
5
+ from torch import Tensor
6
+ from torchaudio.io import CodecConfig
7
+
8
+ from .common import AudioMetaData
9
+
10
+
11
+ class Backend(ABC):
12
+ @staticmethod
13
+ @abstractmethod
14
+ def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
15
+ raise NotImplementedError
16
+
17
+ @staticmethod
18
+ @abstractmethod
19
+ def load(
20
+ uri: Union[BinaryIO, str, os.PathLike],
21
+ frame_offset: int = 0,
22
+ num_frames: int = -1,
23
+ normalize: bool = True,
24
+ channels_first: bool = True,
25
+ format: Optional[str] = None,
26
+ buffer_size: int = 4096,
27
+ ) -> Tuple[Tensor, int]:
28
+ raise NotImplementedError
29
+
30
+ @staticmethod
31
+ @abstractmethod
32
+ def save(
33
+ uri: Union[BinaryIO, str, os.PathLike],
34
+ src: Tensor,
35
+ sample_rate: int,
36
+ channels_first: bool = True,
37
+ format: Optional[str] = None,
38
+ encoding: Optional[str] = None,
39
+ bits_per_sample: Optional[int] = None,
40
+ buffer_size: int = 4096,
41
+ compression: Optional[Union[CodecConfig, float, int]] = None,
42
+ ) -> None:
43
+ raise NotImplementedError
44
+
45
+ @staticmethod
46
+ @abstractmethod
47
+ def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
48
+ raise NotImplementedError
49
+
50
+ @staticmethod
51
+ @abstractmethod
52
+ def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
53
+ raise NotImplementedError
.venv/lib/python3.11/site-packages/torchaudio/_backend/common.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class AudioMetaData:
2
+ """AudioMetaData()
3
+
4
+ Return type of ``torchaudio.info`` function.
5
+
6
+ :ivar int sample_rate: Sample rate
7
+ :ivar int num_frames: The number of frames
8
+ :ivar int num_channels: The number of channels
9
+ :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
10
+ or when it cannot be accurately inferred.
11
+ :ivar str encoding: Audio encoding
12
+ The values encoding can take are one of the following:
13
+
14
+ * ``PCM_S``: Signed integer linear PCM
15
+ * ``PCM_U``: Unsigned integer linear PCM
16
+ * ``PCM_F``: Floating point linear PCM
17
+ * ``FLAC``: Flac, Free Lossless Audio Codec
18
+ * ``ULAW``: Mu-law
19
+ * ``ALAW``: A-law
20
+ * ``MP3`` : MP3, MPEG-1 Audio Layer III
21
+ * ``VORBIS``: OGG Vorbis
22
+ * ``AMR_WB``: Adaptive Multi-Rate Wideband
23
+ * ``AMR_NB``: Adaptive Multi-Rate Narrowband
24
+ * ``OPUS``: Opus
25
+ * ``HTK``: Single channel 16-bit PCM
26
+ * ``UNKNOWN`` : None of above
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ sample_rate: int,
32
+ num_frames: int,
33
+ num_channels: int,
34
+ bits_per_sample: int,
35
+ encoding: str,
36
+ ):
37
+ self.sample_rate = sample_rate
38
+ self.num_frames = num_frames
39
+ self.num_channels = num_channels
40
+ self.bits_per_sample = bits_per_sample
41
+ self.encoding = encoding
42
+
43
+ def __str__(self):
44
+ return (
45
+ f"AudioMetaData("
46
+ f"sample_rate={self.sample_rate}, "
47
+ f"num_frames={self.num_frames}, "
48
+ f"num_channels={self.num_channels}, "
49
+ f"bits_per_sample={self.bits_per_sample}, "
50
+ f"encoding={self.encoding}"
51
+ f")"
52
+ )
.venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ from typing import BinaryIO, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torchaudio
8
+
9
+ from .backend import Backend
10
+ from .common import AudioMetaData
11
+
12
+ InputType = Union[BinaryIO, str, os.PathLike]
13
+
14
+
15
+ def info_audio(
16
+ src: InputType,
17
+ format: Optional[str],
18
+ buffer_size: int = 4096,
19
+ ) -> AudioMetaData:
20
+ s = torchaudio.io.StreamReader(src, format, None, buffer_size)
21
+ sinfo = s.get_src_stream_info(s.default_audio_stream)
22
+ if sinfo.num_frames == 0:
23
+ waveform = _load_audio(s)
24
+ num_frames = waveform.size(1)
25
+ else:
26
+ num_frames = sinfo.num_frames
27
+ return AudioMetaData(
28
+ int(sinfo.sample_rate),
29
+ num_frames,
30
+ sinfo.num_channels,
31
+ sinfo.bits_per_sample,
32
+ sinfo.codec.upper(),
33
+ )
34
+
35
+
36
+ def _get_load_filter(
37
+ frame_offset: int = 0,
38
+ num_frames: int = -1,
39
+ convert: bool = True,
40
+ ) -> Optional[str]:
41
+ if frame_offset < 0:
42
+ raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset))
43
+ if num_frames == 0 or num_frames < -1:
44
+ raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames))
45
+
46
+ # All default values -> no filter
47
+ if frame_offset == 0 and num_frames == -1 and not convert:
48
+ return None
49
+ # Only convert
50
+ aformat = "aformat=sample_fmts=fltp"
51
+ if frame_offset == 0 and num_frames == -1 and convert:
52
+ return aformat
53
+ # At least one of frame_offset or num_frames has non-default value
54
+ if num_frames > 0:
55
+ atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames)
56
+ else:
57
+ atrim = "atrim=start_sample={}".format(frame_offset)
58
+ if not convert:
59
+ return atrim
60
+ return "{},{}".format(atrim, aformat)
61
+
62
+
63
+ def _load_audio(
64
+ s: "torchaudio.io.StreamReader",
65
+ filter: Optional[str] = None,
66
+ channels_first: bool = True,
67
+ ) -> torch.Tensor:
68
+ s.add_audio_stream(-1, -1, filter_desc=filter)
69
+ s.process_all_packets()
70
+ chunk = s.pop_chunks()[0]
71
+ if chunk is None:
72
+ raise RuntimeError("Failed to decode audio.")
73
+ waveform = chunk._elem
74
+ return waveform.T if channels_first else waveform
75
+
76
+
77
+ def load_audio(
78
+ src: InputType,
79
+ frame_offset: int = 0,
80
+ num_frames: int = -1,
81
+ convert: bool = True,
82
+ channels_first: bool = True,
83
+ format: Optional[str] = None,
84
+ buffer_size: int = 4096,
85
+ ) -> Tuple[torch.Tensor, int]:
86
+ if hasattr(src, "read") and format == "vorbis":
87
+ format = "ogg"
88
+ s = torchaudio.io.StreamReader(src, format, None, buffer_size)
89
+ sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate)
90
+ filter = _get_load_filter(frame_offset, num_frames, convert)
91
+ waveform = _load_audio(s, filter, channels_first)
92
+ return waveform, sample_rate
93
+
94
+
95
+ def _get_sample_format(dtype: torch.dtype) -> str:
96
+ dtype_to_format = {
97
+ torch.uint8: "u8",
98
+ torch.int16: "s16",
99
+ torch.int32: "s32",
100
+ torch.int64: "s64",
101
+ torch.float32: "flt",
102
+ torch.float64: "dbl",
103
+ }
104
+ format = dtype_to_format.get(dtype)
105
+ if format is None:
106
+ raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.")
107
+ return format
108
+
109
+
110
+ def _native_endianness() -> str:
111
+ if sys.byteorder == "little":
112
+ return "le"
113
+ else:
114
+ return "be"
115
+
116
+
117
+ def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str:
118
+ if bits_per_sample not in {None, 8, 16, 24, 32, 64}:
119
+ raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.")
120
+ endianness = _native_endianness()
121
+ if not encoding:
122
+ if not bits_per_sample:
123
+ # default to PCM S16
124
+ return f"pcm_s16{endianness}"
125
+ if bits_per_sample == 8:
126
+ return "pcm_u8"
127
+ return f"pcm_s{bits_per_sample}{endianness}"
128
+ if encoding == "PCM_S":
129
+ if not bits_per_sample:
130
+ bits_per_sample = 16
131
+ if bits_per_sample == 8:
132
+ raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.")
133
+ return f"pcm_s{bits_per_sample}{endianness}"
134
+ if encoding == "PCM_U":
135
+ if bits_per_sample in (None, 8):
136
+ return "pcm_u8"
137
+ raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.")
138
+ if encoding == "PCM_F":
139
+ if not bits_per_sample:
140
+ bits_per_sample = 32
141
+ if bits_per_sample in (32, 64):
142
+ return f"pcm_f{bits_per_sample}{endianness}"
143
+ raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.")
144
+ if encoding == "ULAW":
145
+ if bits_per_sample in (None, 8):
146
+ return "pcm_mulaw"
147
+ raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.")
148
+ if encoding == "ALAW":
149
+ if bits_per_sample in (None, 8):
150
+ return "pcm_alaw"
151
+ raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.")
152
+ raise ValueError(f"WAV encoding {encoding} is not supported.")
153
+
154
+
155
+ def _get_flac_sample_fmt(bps):
156
+ if bps is None or bps == 16:
157
+ return "s16"
158
+ if bps == 24:
159
+ return "s32"
160
+ raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).")
161
+
162
+
163
+ def _parse_save_args(
164
+ ext: Optional[str],
165
+ format: Optional[str],
166
+ encoding: Optional[str],
167
+ bps: Optional[int],
168
+ ):
169
+ # torchaudio's save function accepts the followings, which do not 1to1 map
170
+ # to FFmpeg.
171
+ #
172
+ # - format: audio format
173
+ # - bits_per_sample: encoder sample format
174
+ # - encoding: such as PCM_U8.
175
+ #
176
+ # In FFmpeg, format is specified with the following three (and more)
177
+ #
178
+ # - muxer: could be audio format or container format.
179
+ # the one we passed to the constructor of StreamWriter
180
+ # - encoder: the audio encoder used to encode audio
181
+ # - encoder sample format: the format used by encoder to encode audio.
182
+ #
183
+ # If encoder sample format is different from source sample format, StreamWriter
184
+ # will insert a filter automatically.
185
+ #
186
+ def _type(spec):
187
+ # either format is exactly the specified one
188
+ # or extension matches to the spec AND there is no format override.
189
+ return format == spec or (format is None and ext == spec)
190
+
191
+ if _type("wav") or _type("amb"):
192
+ # wav is special because it supports different encoding through encoders
193
+ # each encoder only supports one encoder format
194
+ #
195
+ # amb format is a special case originated from libsox.
196
+ # It is basically a WAV format, with slight modification.
197
+ # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795
198
+ # It is a format so that decoders will recognize it as ambisonic.
199
+ # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/
200
+ # FFmpeg does not recognize amb because it is basically a WAV format.
201
+ muxer = "wav"
202
+ encoder = _get_encoder_for_wav(encoding, bps)
203
+ sample_fmt = None
204
+ elif _type("vorbis"):
205
+ # FFpmeg does not recognize vorbis extension, while libsox used to do.
206
+ # For the sake of bakward compatibility, (and the simplicity),
207
+ # we support the case where users want to do save("foo.vorbis")
208
+ muxer = "ogg"
209
+ encoder = "vorbis"
210
+ sample_fmt = None
211
+ else:
212
+ muxer = format
213
+ encoder = None
214
+ sample_fmt = None
215
+ if _type("flac"):
216
+ sample_fmt = _get_flac_sample_fmt(bps)
217
+ if _type("ogg"):
218
+ sample_fmt = _get_flac_sample_fmt(bps)
219
+ return muxer, encoder, sample_fmt
220
+
221
+
222
+ def save_audio(
223
+ uri: InputType,
224
+ src: torch.Tensor,
225
+ sample_rate: int,
226
+ channels_first: bool = True,
227
+ format: Optional[str] = None,
228
+ encoding: Optional[str] = None,
229
+ bits_per_sample: Optional[int] = None,
230
+ buffer_size: int = 4096,
231
+ compression: Optional[torchaudio.io.CodecConfig] = None,
232
+ ) -> None:
233
+ ext = None
234
+ if hasattr(uri, "write"):
235
+ if format is None:
236
+ raise RuntimeError("'format' is required when saving to file object.")
237
+ else:
238
+ uri = os.path.normpath(uri)
239
+ if tokens := str(uri).split(".")[1:]:
240
+ ext = tokens[-1].lower()
241
+
242
+ muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample)
243
+
244
+ if channels_first:
245
+ src = src.T
246
+
247
+ s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)
248
+ s.add_audio_stream(
249
+ sample_rate,
250
+ num_channels=src.size(-1),
251
+ format=_get_sample_format(src.dtype),
252
+ encoder=encoder,
253
+ encoder_format=enc_fmt,
254
+ codec_config=compression,
255
+ )
256
+ with s.open():
257
+ s.write_audio_chunk(0, src)
258
+
259
+
260
+ def _map_encoding(encoding: str) -> str:
261
+ for dst in ["PCM_S", "PCM_U", "PCM_F"]:
262
+ if dst in encoding:
263
+ return dst
264
+ if encoding == "PCM_MULAW":
265
+ return "ULAW"
266
+ elif encoding == "PCM_ALAW":
267
+ return "ALAW"
268
+ return encoding
269
+
270
+
271
+ def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
272
+ if m := re.search(r"PCM_\w(\d+)\w*", encoding):
273
+ return int(m.group(1))
274
+ elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
275
+ return 8
276
+ return bits_per_sample
277
+
278
+
279
+ class FFmpegBackend(Backend):
280
+ @staticmethod
281
+ def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
282
+ metadata = info_audio(uri, format, buffer_size)
283
+ metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
284
+ metadata.encoding = _map_encoding(metadata.encoding)
285
+ return metadata
286
+
287
+ @staticmethod
288
+ def load(
289
+ uri: InputType,
290
+ frame_offset: int = 0,
291
+ num_frames: int = -1,
292
+ normalize: bool = True,
293
+ channels_first: bool = True,
294
+ format: Optional[str] = None,
295
+ buffer_size: int = 4096,
296
+ ) -> Tuple[torch.Tensor, int]:
297
+ return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format)
298
+
299
+ @staticmethod
300
+ def save(
301
+ uri: InputType,
302
+ src: torch.Tensor,
303
+ sample_rate: int,
304
+ channels_first: bool = True,
305
+ format: Optional[str] = None,
306
+ encoding: Optional[str] = None,
307
+ bits_per_sample: Optional[int] = None,
308
+ buffer_size: int = 4096,
309
+ compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
310
+ ) -> None:
311
+ if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))):
312
+ raise ValueError(
313
+ "FFmpeg backend expects non-`None` value for argument `compression` to be of ",
314
+ f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}",
315
+ )
316
+ save_audio(
317
+ uri,
318
+ src,
319
+ sample_rate,
320
+ channels_first,
321
+ format,
322
+ encoding,
323
+ bits_per_sample,
324
+ buffer_size,
325
+ compression,
326
+ )
327
+
328
+ @staticmethod
329
+ def can_decode(uri: InputType, format: Optional[str]) -> bool:
330
+ return True
331
+
332
+ @staticmethod
333
+ def can_encode(uri: InputType, format: Optional[str]) -> bool:
334
+ return True
.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import BinaryIO, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torchaudio.io import CodecConfig
6
+
7
+ from . import soundfile_backend
8
+ from .backend import Backend
9
+ from .common import AudioMetaData
10
+
11
+
12
+ class SoundfileBackend(Backend):
13
+ @staticmethod
14
+ def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
15
+ return soundfile_backend.info(uri, format)
16
+
17
+ @staticmethod
18
+ def load(
19
+ uri: Union[BinaryIO, str, os.PathLike],
20
+ frame_offset: int = 0,
21
+ num_frames: int = -1,
22
+ normalize: bool = True,
23
+ channels_first: bool = True,
24
+ format: Optional[str] = None,
25
+ buffer_size: int = 4096,
26
+ ) -> Tuple[torch.Tensor, int]:
27
+ return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
28
+
29
+ @staticmethod
30
+ def save(
31
+ uri: Union[BinaryIO, str, os.PathLike],
32
+ src: torch.Tensor,
33
+ sample_rate: int,
34
+ channels_first: bool = True,
35
+ format: Optional[str] = None,
36
+ encoding: Optional[str] = None,
37
+ bits_per_sample: Optional[int] = None,
38
+ buffer_size: int = 4096,
39
+ compression: Optional[Union[CodecConfig, float, int]] = None,
40
+ ) -> None:
41
+ if compression:
42
+ raise ValueError("soundfile backend does not support argument `compression`.")
43
+
44
+ soundfile_backend.save(
45
+ uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
46
+ )
47
+
48
+ @staticmethod
49
+ def can_decode(uri, format) -> bool:
50
+ return True
51
+
52
+ @staticmethod
53
+ def can_encode(uri, format) -> bool:
54
+ return True
.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The new soundfile backend which will become default in 0.8.0 onward"""
2
+ import warnings
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ from torchaudio._internal import module_utils as _mod_utils
7
+
8
+ from .common import AudioMetaData
9
+
10
+
11
# Tracks whether the `soundfile` package imported successfully; consulted by
# the backend registry to decide whether to expose the soundfile backend.
_IS_SOUNDFILE_AVAILABLE = False

# TODO: import soundfile only when it is used.
if _mod_utils.is_module_available("soundfile"):
    try:
        import soundfile

        # Import succeeded: the decorator is a no-op and decorated functions
        # run normally.
        _requires_soundfile = _mod_utils.no_op
        _IS_SOUNDFILE_AVAILABLE = True
    except Exception:
        # The package is installed but failed to import (e.g. the native
        # libsndfile is missing); decorated functions raise this message
        # when called.
        _requires_soundfile = _mod_utils.fail_with_message(
            "requires soundfile, but we failed to import it. Please check the installation of soundfile."
        )
else:
    # Package not installed at all.
    _requires_soundfile = _mod_utils.fail_with_message(
        "requires soundfile, but it is not installed. Please install soundfile."
    )
28
+
29
+
30
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristical and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
# Subtypes absent from this table are reported as 0 by `_get_bit_depth`
# (with a warning).
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
}
67
+
68
+
69
def _get_bit_depth(subtype):
    """Return the bit depth for a libsndfile subtype string.

    Unknown subtypes produce a warning and a bit depth of 0, matching the
    convention used for lossy/uninferable subtypes in the table above.
    """
    try:
        return _SUBTYPE_TO_BITS_PER_SAMPLE[subtype]
    except KeyError:
        warnings.warn(
            f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
            "attribute will be set to 0. If you are seeing this warning, please "
            "report by opening an issue on github (after checking for existing/closed ones). "
            "You may otherwise ignore this warning."
        )
        return 0
78
+
79
+
80
# Mapping from soundfile subtype to the normalized encoding name reported in
# AudioMetaData. Subtypes absent from this table are reported as "UNKNOWN"
# by `_get_encoding`.
_SUBTYPE_TO_ENCODING = {
    "PCM_S8": "PCM_S",
    "PCM_16": "PCM_S",
    "PCM_24": "PCM_S",
    "PCM_32": "PCM_S",
    "PCM_U8": "PCM_U",
    "FLOAT": "PCM_F",
    "DOUBLE": "PCM_F",
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}
92
+
93
+
94
+ def _get_encoding(format: str, subtype: str):
95
+ if format == "FLAC":
96
+ return "FLAC"
97
+ return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
98
+
99
+
100
@_requires_soundfile
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
    """Get signal information of an audio file.

    Note:
        ``filepath`` is intentionally annotated as ``str`` only, even though
        it accepts ``pathlib.Path`` objects as well. This keeps the signature
        consistent with the ``"sox_io"`` backend, whose annotations are
        restricted by the TorchScript compiler.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept a format hint.

    Returns:
        AudioMetaData: metadata of the given audio.
    """
    sinfo = soundfile.info(filepath)
    metadata = AudioMetaData(
        sinfo.samplerate,
        sinfo.frames,
        sinfo.channels,
        bits_per_sample=_get_bit_depth(sinfo.subtype),
        encoding=_get_encoding(sinfo.format, sinfo.subtype),
    )
    return metadata
127
+
128
+
129
# Mapping from integer/float WAV subtypes to the numpy dtype string passed to
# `SoundFile.read` when `normalize=False`. Subtypes not listed here (e.g.
# PCM_24) cannot be loaded un-normalized and make `load` raise ValueError.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}
137
+
138
+
139
@_requires_soundfile
def load(
    filepath: str,
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
    ``float32`` dtype, and the shape of `[channel, time]`.

    .. warning::

        ``normalize`` argument does not perform volume normalization.
        It only converts the sample type to `torch.float32` from the native sample
        type.

        When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
        signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
        this function can return integer Tensor, where the samples are expressed within the whole range
        of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
        ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
        support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.

        ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
        ``flac`` and ``mp3``.

        For these formats, this function always returns ``float32`` Tensor with values.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
            This function may return fewer frames if there are not enough
            frames in the given file.
        normalize (bool, optional):
            When ``True``, this function converts the native sample type to ``float32``.
            Default: ``True``.

            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type.
            This argument has no effect for formats other than integer WAV type.

        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (torch.Tensor, int): Resulting Tensor and sample rate.
        If the input file has integer wav format and normalization is off, then it has
        integer type, else ``float32`` type. If ``channels_first=True``, it has
        `[channel, time]` else `[time, channel]`.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        # Integer WAV data may be returned in its native dtype only when
        # normalization is disabled; everything else decodes to float32.
        if file_.format != "WAV" or normalize:
            dtype = "float32"
        elif file_.subtype not in _SUBTYPE2DTYPE:
            raise ValueError(f"Unsupported subtype: {file_.subtype}")
        else:
            dtype = _SUBTYPE2DTYPE[file_.subtype]

        # NOTE(review): relies on soundfile's private `_prepare_read` to seek
        # to `frame_offset` and clamp the frame count — verify this still
        # holds when upgrading the soundfile dependency.
        frames = file_._prepare_read(frame_offset, None, num_frames)
        waveform = file_.read(frames, dtype, always_2d=True)
        sample_rate = file_.samplerate

    waveform = torch.from_numpy(waveform)
    if channels_first:
        # soundfile returns [time, channel]; transpose to [channel, time].
        waveform = waveform.t()
    return waveform, sample_rate
237
+
238
+
239
+ def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
240
+ if not encoding:
241
+ if not bits_per_sample:
242
+ subtype = {
243
+ torch.uint8: "PCM_U8",
244
+ torch.int16: "PCM_16",
245
+ torch.int32: "PCM_32",
246
+ torch.float32: "FLOAT",
247
+ torch.float64: "DOUBLE",
248
+ }.get(dtype)
249
+ if not subtype:
250
+ raise ValueError(f"Unsupported dtype for wav: {dtype}")
251
+ return subtype
252
+ if bits_per_sample == 8:
253
+ return "PCM_U8"
254
+ return f"PCM_{bits_per_sample}"
255
+ if encoding == "PCM_S":
256
+ if not bits_per_sample:
257
+ return "PCM_32"
258
+ if bits_per_sample == 8:
259
+ raise ValueError("wav does not support 8-bit signed PCM encoding.")
260
+ return f"PCM_{bits_per_sample}"
261
+ if encoding == "PCM_U":
262
+ if bits_per_sample in (None, 8):
263
+ return "PCM_U8"
264
+ raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
265
+ if encoding == "PCM_F":
266
+ if bits_per_sample in (None, 32):
267
+ return "FLOAT"
268
+ if bits_per_sample == 64:
269
+ return "DOUBLE"
270
+ raise ValueError("wav only supports 32/64-bit float PCM encoding.")
271
+ if encoding == "ULAW":
272
+ if bits_per_sample in (None, 8):
273
+ return "ULAW"
274
+ raise ValueError("wav only supports 8-bit mu-law encoding.")
275
+ if encoding == "ALAW":
276
+ if bits_per_sample in (None, 8):
277
+ return "ALAW"
278
+ raise ValueError("wav only supports 8-bit a-law encoding.")
279
+ raise ValueError(f"wav does not support {encoding}.")
280
+
281
+
282
+ def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
283
+ if encoding in (None, "PCM_S"):
284
+ return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
285
+ if encoding in ("PCM_U", "PCM_F"):
286
+ raise ValueError(f"sph does not support {encoding} encoding.")
287
+ if encoding == "ULAW":
288
+ if bits_per_sample in (None, 8):
289
+ return "ULAW"
290
+ raise ValueError("sph only supports 8-bit for mu-law encoding.")
291
+ if encoding == "ALAW":
292
+ return "ALAW"
293
+ raise ValueError(f"sph does not support {encoding}.")
294
+
295
+
296
+ def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
297
+ if format == "wav":
298
+ return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
299
+ if format == "flac":
300
+ if encoding:
301
+ raise ValueError("flac does not support encoding.")
302
+ if not bits_per_sample:
303
+ return "PCM_16"
304
+ if bits_per_sample > 24:
305
+ raise ValueError("flac does not support bits_per_sample > 24.")
306
+ return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
307
+ if format in ("ogg", "vorbis"):
308
+ if bits_per_sample:
309
+ raise ValueError("ogg/vorbis does not support bits_per_sample.")
310
+ if encoding is None or encoding == "vorbis":
311
+ return "VORBIS"
312
+ if encoding == "opus":
313
+ return "OPUS"
314
+ raise ValueError(f"Unexpected encoding: {encoding}")
315
+ if format == "mp3":
316
+ return "MPEG_LAYER_III"
317
+ if format == "sph":
318
+ return _get_subtype_for_sphere(encoding, bits_per_sample)
319
+ if format in ("nis", "nist"):
320
+ return "PCM_16"
321
+ raise ValueError(f"Unsupported format: {format}")
322
+
323
+
324
@_requires_soundfile
def save(
    filepath: str,
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (str or pathlib.Path): Path to audio file.
        src (torch.Tensor): Audio data to save. Must be 2D tensor.
        sample_rate (int): sampling rate
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reason with "sox_io" backend.
        format (str or None, optional): Override the audio format.
            When ``filepath`` argument is path-like object, audio format is
            inferred from file extension. If the file extension is missing or
            different, you can specify the correct format with this argument.

            When ``filepath`` argument is file-like object,
            this argument is required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

            - ``"PCM_S"`` (signed integer Linear PCM)
            - ``"PCM_U"`` (unsigned integer Linear PCM)
            - ``"PCM_F"`` (floating point PCM)
            - ``"ULAW"`` (mu-law)
            - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        # Accepted for interface parity with "sox_io" only; warn and ignore.
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )
    if hasattr(filepath, "write"):
        # File-like destination: the format cannot be inferred from a name.
        if format is None:
            raise RuntimeError("`format` is required when saving to file object.")
        ext = format.lower()
    else:
        ext = str(filepath).split(".")[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here
    if ext in ["nis", "nist", "sph"] and format is None:
        format = "NIST"

    if channels_first:
        # soundfile expects [time, channel].
        src = src.t()

    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
.venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import BinaryIO, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torchaudio
6
+
7
+ from .backend import Backend
8
+ from .common import AudioMetaData
9
+
10
# Resolve the native SoX extension lazily so that importing this module does
# not require the extension until it is actually used.
sox_ext = torchaudio._extension.lazy_import_sox_ext()
11
+
12
+
13
class SoXBackend(Backend):
    """I/O backend backed by libsox.

    Works with file paths only: file-like objects are rejected for info,
    load, and save. ``buffer_size`` parameters are accepted for interface
    parity with the other backends and are not used.
    """

    @staticmethod
    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
        """Fetch metadata for ``uri``.

        Raises:
            ValueError: if ``uri`` is a file-like object.
            RuntimeError: if libsox fails to read the metadata.
        """
        if hasattr(uri, "read"):
            # Pass ONE string: `raise ValueError("a", "b")` sets e.args to a
            # tuple, so str(e) renders as a tuple repr and garbles the message.
            raise ValueError(
                "SoX backend does not support reading from file-like objects. "
                "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg."
            )
        sinfo = sox_ext.get_info(uri, format)
        if not sinfo:
            raise RuntimeError(f"Failed to fetch metadata for {uri}.")
        return AudioMetaData(*sinfo)

    @staticmethod
    def load(
        uri: Union[BinaryIO, str, os.PathLike],
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
    ) -> Tuple[torch.Tensor, int]:
        """Load audio from ``uri`` via libsox.

        Raises:
            ValueError: if ``uri`` is a file-like object.
            RuntimeError: if libsox fails to decode the file.
        """
        if hasattr(uri, "read"):
            raise ValueError(
                "SoX backend does not support loading from file-like objects. "
                "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg."
            )
        ret = sox_ext.load_audio_file(uri, frame_offset, num_frames, normalize, channels_first, format)
        if not ret:
            raise RuntimeError(f"Failed to load audio from {uri}.")
        return ret

    @staticmethod
    def save(
        uri: Union[BinaryIO, str, os.PathLike],
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
    ) -> None:
        """Save audio to ``uri`` via libsox.

        ``compression``, if given, must be a plain number (SoX ``-C`` value);
        :py:class:`CodecConfig` instances are only valid for the FFmpeg backend.

        Raises:
            ValueError: if ``compression`` has an unsupported type, or if
                ``uri`` is a file-like object.
        """
        if not isinstance(compression, (float, int, type(None))):
            raise ValueError(
                "SoX backend expects non-`None` value for argument `compression` to be of "
                f"type `float` or `int`, but received value of type {type(compression)}"
            )
        if hasattr(uri, "write"):
            raise ValueError(
                "SoX backend does not support writing to file-like objects. "
                "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg."
            )
        sox_ext.save_audio_file(
            uri,
            src,
            sample_rate,
            channels_first,
            compression,
            format,
            encoding,
            bits_per_sample,
        )

    @staticmethod
    def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
        # i.e. not a file-like object.
        return not hasattr(uri, "read")

    @staticmethod
    def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
        # i.e. not a file-like object.
        return not hasattr(uri, "write")
.venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+ from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
4
+
5
+ import torch
6
+
7
+ from torchaudio._extension import lazy_import_sox_ext
8
+ from torchaudio.io import CodecConfig
9
+ from torio._extension import lazy_import_ffmpeg_ext
10
+
11
+ from . import soundfile_backend
12
+
13
+ from .backend import Backend
14
+ from .common import AudioMetaData
15
+ from .ffmpeg import FFmpegBackend
16
+ from .soundfile import SoundfileBackend
17
+ from .sox import SoXBackend
18
+
19
+
20
@lru_cache(None)
def get_available_backends() -> Dict[str, Type[Backend]]:
    """Return the usable I/O backends keyed by name.

    Availability is probed once per process (cached). Insertion order matters:
    the dispatchers try backends in this order (ffmpeg, sox, soundfile).
    """
    candidates = (
        ("ffmpeg", FFmpegBackend, lambda: lazy_import_ffmpeg_ext().is_available()),
        ("sox", SoXBackend, lambda: lazy_import_sox_ext().is_available()),
        ("soundfile", SoundfileBackend, lambda: soundfile_backend._IS_SOUNDFILE_AVAILABLE),
    )
    return {name: impl for name, impl, is_usable in candidates if is_usable()}
30
+
31
+
32
def get_backend(backend_name, backends) -> "Backend":
    """Look up a backend class by name.

    Args:
        backend_name (str): Name of the requested backend (e.g. ``"ffmpeg"``).
        backends (dict): Mapping of available backend names to backend classes.

    Returns:
        Backend: the backend registered under ``backend_name``.

    Raises:
        ValueError: if no backend with that name is available.
    """
    if backend := backends.get(backend_name):
        return backend
    # Pass a SINGLE formatted string: the previous comma-separated form made
    # ValueError carry a tuple of strings, so str(e) rendered a tuple repr.
    raise ValueError(
        f"Unsupported backend '{backend_name}' specified; "
        f"please select one of {list(backends.keys())} instead."
    )
40
+
41
+
42
def get_info_func():
    """Create the public ``torchaudio.info`` function.

    The set of available backends is resolved once, when this factory is
    called, and captured by the returned closure.
    """
    backends = get_available_backends()

    def dispatcher(
        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
    ) -> Backend:
        # An explicitly requested backend always wins (raises if unavailable).
        if backend_name is not None:
            return get_backend(backend_name, backends)

        # Otherwise the first backend, in registration order (ffmpeg, sox,
        # soundfile), that claims it can decode this input is used.
        for backend in backends.values():
            if backend.can_decode(uri, format):
                return backend
        raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")

    def info(
        uri: Union[BinaryIO, str, os.PathLike],
        format: Optional[str] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
    ) -> AudioMetaData:
        """Get signal information of an audio file.

        Note:
            When the input type is file-like object, this function cannot
            get the correct length (``num_samples``) for certain formats,
            such as ``vorbis``.
            In this case, the value of ``num_samples`` is ``0``.

        Args:
            uri (path-like object or file-like object):
                Source of audio data. The following types are accepted:

                * ``path-like``: File path or URL.
                * ``file-like``: Object with ``read(size: int) -> bytes`` method,
                  which returns byte string of at most ``size`` length.

            format (str or None, optional):
                If not ``None``, interpreted as hint that may allow backend to override the detected format.
                (Default: ``None``)

            buffer_size (int, optional):
                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)

            backend (str or None, optional):
                I/O backend to use.
                If ``None``, function selects backend given input and available backends.
                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
                with the corresponding backend available.
                (Default: ``None``)

                .. seealso::
                    :ref:`backend`

        Returns:
            AudioMetaData
        """
        backend = dispatcher(uri, format, backend)
        return backend.info(uri, format, buffer_size)

    return info
102
+
103
+
104
def get_load_func():
    """Create the public ``torchaudio.load`` function.

    The set of available backends is resolved once, when this factory is
    called, and captured by the returned closure.
    """
    backends = get_available_backends()

    def dispatcher(
        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
    ) -> Backend:
        # An explicitly requested backend always wins (raises if unavailable).
        if backend_name is not None:
            return get_backend(backend_name, backends)

        # Otherwise the first backend, in registration order (ffmpeg, sox,
        # soundfile), that claims it can decode this input is used.
        for backend in backends.values():
            if backend.can_decode(uri, format):
                return backend
        raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")

    def load(
        uri: Union[BinaryIO, str, os.PathLike],
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
    ) -> Tuple[torch.Tensor, int]:
        """Load audio data from source.

        By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
        ``float32`` dtype, and the shape of `[channel, time]`.

        Note:
            The formats this function can handle depend on the availability of backends.
            Please use the following functions to fetch the supported formats.

            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
            - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.

        .. warning::

            ``normalize`` argument does not perform volume normalization.
            It only converts the sample type to `torch.float32` from the native sample
            type.

            When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
            signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
            this function can return integer Tensor, where the samples are expressed within the whole range
            of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
            ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
            support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.

            ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
            ``flac`` and ``mp3``.

            For these formats, this function always returns ``float32`` Tensor with values.


        Args:
            uri (path-like object or file-like object):
                Source of audio data.
            frame_offset (int, optional):
                Number of frames to skip before start reading data.
            num_frames (int, optional):
                Maximum number of frames to read. ``-1`` reads all the remaining samples,
                starting from ``frame_offset``.
                This function may return fewer frames if there are not enough
                frames in the given file.
            normalize (bool, optional):
                When ``True``, this function converts the native sample type to ``float32``.
                Default: ``True``.

                If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
                integer type.
                This argument has no effect for formats other than integer WAV type.

            channels_first (bool, optional):
                When True, the returned Tensor has dimension `[channel, time]`.
                Otherwise, the returned Tensor's dimension is `[time, channel]`.

            format (str or None, optional):
                If not ``None``, interpreted as hint that may allow backend to override the detected format.
                (Default: ``None``)

            buffer_size (int, optional):
                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)

            backend (str or None, optional):
                I/O backend to use.
                If ``None``, function selects backend given input and available backends.
                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
                with the corresponding backend being available. (Default: ``None``)

                .. seealso::
                    :ref:`backend`

        Returns:
            (torch.Tensor, int): Resulting Tensor and sample rate.
            If the input file has integer wav format and normalization is off, then it has
            integer type, else ``float32`` type. If ``channels_first=True``, it has
            `[channel, time]` else `[time, channel]`.
        """
        backend = dispatcher(uri, format, backend)
        return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)

    return load
208
+
209
+
210
def get_save_func():
    """Create the public ``torchaudio.save`` function.

    The set of available backends is resolved once, when this factory is
    called, and captured by the returned closure.
    """
    backends = get_available_backends()

    def dispatcher(
        uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
    ) -> Backend:
        # An explicitly requested backend always wins (raises if unavailable).
        if backend_name is not None:
            return get_backend(backend_name, backends)

        # Otherwise the first backend, in registration order (ffmpeg, sox,
        # soundfile), that claims it can encode to this target is used.
        for backend in backends.values():
            if backend.can_encode(uri, format):
                return backend
        raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")

    def save(
        uri: Union[BinaryIO, str, os.PathLike],
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        backend: Optional[str] = None,
        compression: Optional[Union[CodecConfig, float, int]] = None,
    ):
        """Save audio data to file.

        Note:
            The formats this function can handle depend on the availability of backends.
            Please use the following functions to fetch the supported formats.

            - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
            - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
            - SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.

        Args:
            uri (str or pathlib.Path): Path to audio file.
            src (torch.Tensor): Audio data to save. Must be 2D tensor.
            sample_rate (int): sampling rate
            channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
                otherwise `[time, channel]`.
            format (str or None, optional): Override the audio format.
                When ``uri`` argument is path-like object, audio format is
                inferred from file extension. If the file extension is missing or
                different, you can specify the correct format with this argument.

                When ``uri`` argument is file-like object,
                this argument is required.

                Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``.
            encoding (str or None, optional): Changes the encoding for supported formats.
                This argument is effective only for supported formats, i.e.
                ``"wav"`` and ``"flac"``. Valid values are

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

            bits_per_sample (int or None, optional): Changes the bit depth for the
                supported formats.
                When ``format`` is one of ``"wav"`` and ``"flac"``,
                you can change the bit depth.
                Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

            buffer_size (int, optional):
                Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)

            backend (str or None, optional):
                I/O backend to use.
                If ``None``, function selects backend given input and available backends.
                Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
                with the corresponding backend being available.
                (Default: ``None``)

                .. seealso::
                    :ref:`backend`

            compression (CodecConfig, float, int, or None, optional):
                Compression configuration to apply.

                If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.

                Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
                ``sox`` command line interface must be provided. For instance:

                ``"mp3"``
                    Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
                    VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.

                ``"flac"``
                    Whole number from ``0`` to ``8``. ``8`` is default and highest compression.

                ``"ogg"``, ``"vorbis"``
                    Number from ``-1`` to ``10``; ``-1`` is the highest compression
                    and lowest quality. Default: ``3``.

                Refer to http://sox.sourceforge.net/soxformat.html for more details.

        """
        backend = dispatcher(uri, format, backend)
        return backend.save(
            uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
        )

    return save
.venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # NOTE:
2
+ # The entire `torchaudio.backend` module is deprecated.
3
+ # New things should be added to `torchaudio._backend`.
4
+ # Only things related to backward compatibility should be placed here.
5
+
6
+ from . import common, no_backend, soundfile_backend, sox_io_backend # noqa
7
+
8
+ __all__ = []
.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc ADDED
Binary file (12.7 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc ADDED
Binary file (889 Bytes). View file
 
.venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Callable, Optional, Tuple, Union
3
+
4
+ from torch import Tensor
5
+ from torchaudio import AudioMetaData
6
+
7
+
8
def load(
    filepath: Union[str, Path],
    out: Optional[Tensor] = None,
    normalization: Union[bool, float, Callable] = True,
    channels_first: bool = True,
    num_frames: int = 0,
    offset: int = 0,
    filetype: Optional[str] = None,
) -> Tuple[Tensor, int]:
    """Placeholder ``load`` installed when no audio I/O backend is available.

    The signature mirrors the real backend ``load`` functions so callers fail
    at call time (not import time) with a clear message.

    Raises:
        RuntimeError: always, regardless of the arguments.
    """
    raise RuntimeError("No audio I/O backend is available.")
18
+
19
+
20
def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
    """Placeholder ``save`` installed when no audio I/O backend is available.

    Raises:
        RuntimeError: always, regardless of the arguments.
    """
    raise RuntimeError("No audio I/O backend is available.")
22
+
23
+
24
def info(filepath: str) -> AudioMetaData:
    # Placeholder ``info`` installed when no audio I/O backend is available.
    # Always raises so that callers get a clear error at call time.
    raise RuntimeError("No audio I/O backend is available.")
.venv/lib/python3.11/site-packages/torchaudio/backend/common.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def __getattr__(name: str):
    """Module-level attribute hook (PEP 562) kept for backward compatibility.

    ``AudioMetaData`` used to live in this module; accessing it here emits a
    deprecation warning and forwards to its new home, ``torchaudio.AudioMetaData``.
    Any other name raises :class:`AttributeError`.
    """
    if name != "AudioMetaData":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    import warnings

    warnings.warn(
        "`torchaudio.backend.common.AudioMetaData` has been moved to "
        "`torchaudio.AudioMetaData`. Please update the import path.",
        stacklevel=2,
    )
    from torchaudio import AudioMetaData

    return AudioMetaData
.venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def __getattr__(name: str):
    """Deprecation shim (PEP 562): forward attribute access to the new backend module.

    Emits a warning on every access, then resolves ``name`` against
    ``torchaudio._backend.soundfile_backend``.

    Raises:
        AttributeError: if the new module has no such attribute (via ``getattr``).
    """
    import warnings

    # Fixed typos in the user-facing message: "par-call bakcend" -> "per-call
    # backend", "udnerlying" -> "underlying".
    warnings.warn(
        "Torchaudio's I/O functions now support per-call backend dispatch. "
        "Importing backend implementation directly is no longer guaranteed to work. "
        "Please use `backend` keyword with load/save/info function, instead of "
        "calling the underlying implementation directly.",
        stacklevel=2,
    )

    from torchaudio._backend import soundfile_backend

    return getattr(soundfile_backend, name)
.venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def __getattr__(name: str):
    """Deprecation shim (PEP 562): forward attribute access to ``_sox_io_backend``.

    Emits a warning on every access, then resolves ``name`` against the
    sibling ``_sox_io_backend`` module.

    Raises:
        AttributeError: if the underlying module has no such attribute (via ``getattr``).
    """
    import warnings

    # Fixed typos in the user-facing message: "par-call bakcend" -> "per-call
    # backend", "udnerlying" -> "underlying".
    warnings.warn(
        "Torchaudio's I/O functions now support per-call backend dispatch. "
        "Importing backend implementation directly is no longer guaranteed to work. "
        "Please use `backend` keyword with load/save/info function, instead of "
        "calling the underlying implementation directly.",
        stacklevel=2,
    )

    from . import _sox_io_backend

    return getattr(_sox_io_backend, name)
.venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._alignment import forced_align, merge_tokens, TokenSpan
2
+ from .filtering import (
3
+ allpass_biquad,
4
+ band_biquad,
5
+ bandpass_biquad,
6
+ bandreject_biquad,
7
+ bass_biquad,
8
+ biquad,
9
+ contrast,
10
+ dcshift,
11
+ deemph_biquad,
12
+ dither,
13
+ equalizer_biquad,
14
+ filtfilt,
15
+ flanger,
16
+ gain,
17
+ highpass_biquad,
18
+ lfilter,
19
+ lowpass_biquad,
20
+ overdrive,
21
+ phaser,
22
+ riaa_biquad,
23
+ treble_biquad,
24
+ vad,
25
+ )
26
+ from .functional import (
27
+ add_noise,
28
+ amplitude_to_DB,
29
+ apply_beamforming,
30
+ apply_codec,
31
+ compute_deltas,
32
+ convolve,
33
+ create_dct,
34
+ DB_to_amplitude,
35
+ deemphasis,
36
+ detect_pitch_frequency,
37
+ edit_distance,
38
+ fftconvolve,
39
+ frechet_distance,
40
+ griffinlim,
41
+ inverse_spectrogram,
42
+ linear_fbanks,
43
+ loudness,
44
+ mask_along_axis,
45
+ mask_along_axis_iid,
46
+ melscale_fbanks,
47
+ mu_law_decoding,
48
+ mu_law_encoding,
49
+ mvdr_weights_rtf,
50
+ mvdr_weights_souden,
51
+ phase_vocoder,
52
+ pitch_shift,
53
+ preemphasis,
54
+ psd,
55
+ resample,
56
+ rnnt_loss,
57
+ rtf_evd,
58
+ rtf_power,
59
+ sliding_window_cmn,
60
+ spectral_centroid,
61
+ spectrogram,
62
+ speed,
63
+ )
64
+
65
+ __all__ = [
66
+ "amplitude_to_DB",
67
+ "compute_deltas",
68
+ "create_dct",
69
+ "melscale_fbanks",
70
+ "linear_fbanks",
71
+ "DB_to_amplitude",
72
+ "loudness",
73
+ "detect_pitch_frequency",
74
+ "griffinlim",
75
+ "mask_along_axis",
76
+ "mask_along_axis_iid",
77
+ "mu_law_encoding",
78
+ "mu_law_decoding",
79
+ "phase_vocoder",
80
+ "sliding_window_cmn",
81
+ "spectrogram",
82
+ "inverse_spectrogram",
83
+ "spectral_centroid",
84
+ "allpass_biquad",
85
+ "band_biquad",
86
+ "bandpass_biquad",
87
+ "bandreject_biquad",
88
+ "bass_biquad",
89
+ "biquad",
90
+ "contrast",
91
+ "dither",
92
+ "dcshift",
93
+ "deemph_biquad",
94
+ "equalizer_biquad",
95
+ "filtfilt",
96
+ "flanger",
97
+ "forced_align",
98
+ "merge_tokens",
99
+ "TokenSpan",
100
+ "gain",
101
+ "highpass_biquad",
102
+ "lfilter",
103
+ "lowpass_biquad",
104
+ "overdrive",
105
+ "phaser",
106
+ "riaa_biquad",
107
+ "treble_biquad",
108
+ "vad",
109
+ "apply_codec",
110
+ "resample",
111
+ "edit_distance",
112
+ "pitch_shift",
113
+ "rnnt_loss",
114
+ "psd",
115
+ "mvdr_weights_souden",
116
+ "mvdr_weights_rtf",
117
+ "rtf_evd",
118
+ "rtf_power",
119
+ "apply_beamforming",
120
+ "fftconvolve",
121
+ "convolve",
122
+ "add_noise",
123
+ "speed",
124
+ "preemphasis",
125
+ "deemphasis",
126
+ "frechet_distance",
127
+ ]
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.63 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc ADDED
Binary file (6.77 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc ADDED
Binary file (74 kB). View file
 
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e5719c3daaa09433b5ece2431df353ef399f7678bc6bee1f1ebff9b16f9c13
3
+ size 115834
.venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Tuple
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from torchaudio._extension import fail_if_no_align
7
+
8
+ __all__ = []
9
+
10
+
11
@fail_if_no_align
def forced_align(
    log_probs: Tensor,
    targets: Tensor,
    input_lengths: Optional[Tensor] = None,
    target_lengths: Optional[Tensor] = None,
    blank: int = 0,
) -> Tuple[Tensor, Tensor]:
    r"""Align a CTC label sequence to an emission.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Args:
        log_probs (Tensor): log probability of CTC emission output.
            Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length,
            `C` is the number of characters in alphabet including blank.
        targets (Tensor): Target sequence. Tensor of shape `(B, L)`,
            where `L` is the target length.
        input_lengths (Tensor or None, optional):
            Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`.
        target_lengths (Tensor or None, optional):
            Lengths of the targets. 1-D Tensor of shape `(B,)`.
        blank (int, optional): The index of blank symbol in CTC emission. (Default: 0)

    Returns:
        Tuple(Tensor, Tensor):
            Tensor: Label for each time step in the alignment path computed using forced alignment.

            Tensor: Log probability scores of the labels for each time step.

    Note:
        The sequence length of `log_probs` must satisfy:


        .. math::
            L_{\text{log\_probs}} \ge L_{\text{label}} + N_{\text{repeat}}

        where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens.
        For example, in str `"aabbc"`, the number of repeats are `2`.

    Note:
        The current version only supports ``batch_size==1``.
    """
    # Validate up-front: the alignment kernel assumes blank-free targets and
    # target indices within the emission's class dimension.
    if blank in targets:
        raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")
    if torch.max(targets) >= log_probs.shape[-1]:
        raise ValueError("targets values must be less than the CTC dimension")

    # Default lengths: assume every sequence spans the full (padded) length.
    if input_lengths is None:
        batch_size, length = log_probs.size(0), log_probs.size(1)
        input_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=log_probs.device)
    if target_lengths is None:
        batch_size, length = targets.size(0), targets.size(1)
        target_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=targets.device)

    # For TorchScript compatibility
    assert input_lengths is not None
    assert target_lengths is not None

    # Dispatch to the native (C++) forced-alignment operator.
    paths, scores = torch.ops.torchaudio.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
    return paths, scores
74
+
75
+
76
@dataclass
class TokenSpan:
    """TokenSpan()
    Token with time stamps and score. Returned by :py:func:`merge_tokens`.
    """

    token: int
    """The token"""
    start: int
    """The start time (inclusive) in emission time axis."""
    end: int
    """The end time (exclusive) in emission time axis."""
    score: float
    """The score of the this token."""

    def __len__(self) -> int:
        """Returns the time span"""
        duration = self.end - self.start
        return duration
94
+
95
+
96
def merge_tokens(tokens: Tensor, scores: Tensor, blank: int = 0) -> List[TokenSpan]:
    """Removes repeated tokens and blank tokens from the given CTC token sequence.

    Args:
        tokens (Tensor): Alignment tokens (unbatched) returned from :py:func:`forced_align`.
            Shape: `(time, )`.
        scores (Tensor): Alignment scores (unbatched) returned from :py:func:`forced_align`.
            Shape: `(time, )`. When computing the token-size score, the given score is averaged
            across the corresponding time span.

    Returns:
        list of TokenSpan

    Example:
        >>> aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths)
        >>> token_spans = merge_tokens(aligned_tokens[0], scores[0])
    """
    if tokens.ndim != 1 or scores.ndim != 1:
        raise ValueError("`tokens` and `scores` must be 1D Tensor.")
    if len(tokens) != len(scores):
        raise ValueError("`tokens` and `scores` must be the same length.")

    # Pad both ends with a sentinel (-1) so the first and last runs also
    # produce change points in the diff below.
    sentinel = torch.tensor([-1], device=tokens.device)
    deltas = torch.diff(tokens, prepend=sentinel, append=sentinel)
    change_points = torch.nonzero(deltas != 0).squeeze().tolist()
    token_values = tokens.tolist()

    # Each consecutive pair of change points delimits one run of identical
    # tokens; keep the non-blank runs with their mean score.
    spans: List[TokenSpan] = []
    for begin, finish in zip(change_points[:-1], change_points[1:]):
        run_token = token_values[begin]
        if run_token == blank:
            continue
        mean_score = scores[begin:finish].mean().item()
        spans.append(TokenSpan(token=run_token, start=begin, end=finish, score=mean_score))
    return spans
.venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py ADDED
@@ -0,0 +1,1669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import warnings
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from torch import Tensor
7
+
8
+ from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE
9
+
10
+
11
+ def _dB2Linear(x: float) -> float:
12
+ return math.exp(x * math.log(10) / 20.0)
13
+
14
+
15
def _generate_wave_table(
    wave_type: str,
    data_type: str,
    table_size: int,
    min: float,
    max: float,
    phase: float,
    device: torch.device,
) -> Tensor:
    r"""A helper function for phaser. Generates a table with given parameters.

    Args:
        wave_type (str): SINE or TRIANGLE
        data_type (str): desired data_type ( `INT` or `FLOAT` )
        table_size (int): desired table size
        min (float): desired min value
        max (float): desired max value
        phase (float): desired phase
        device (torch.device): Torch device on which table must be generated
    Returns:
        Tensor: A 1D tensor with wave table values
    """

    # Phase offset in table samples (phase is in radians; one period == table_size).
    phase_offset = int(phase / math.pi / 2 * table_size + 0.5)

    t = torch.arange(table_size, device=device, dtype=torch.int32)

    # Index into the (circular) table, rotated by the phase offset.
    point = (t + phase_offset) % table_size

    d = torch.zeros_like(point, device=device, dtype=torch.float64)

    if wave_type == "SINE":
        # Sine mapped from [-1, 1] to [0, 1].
        d = (torch.sin(point.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2
    elif wave_type == "TRIANGLE":
        d = point.to(torch.float64) * 2 / table_size
        # `value` selects which quarter of the period each sample falls in;
        # each quarter gets its own linear segment of the triangle wave.
        value = torch.div(4 * point, table_size, rounding_mode="floor")
        d[value == 0] = d[value == 0] + 0.5
        d[value == 1] = 1.5 - d[value == 1]
        d[value == 2] = 1.5 - d[value == 2]
        d[value == 3] = d[value == 3] - 1.5

    # Scale from [0, 1] into the requested [min, max] range.
    d = d * (max - min) + min

    if data_type == "INT":
        # Round half away from zero before truncating to int32.
        mask = d < 0
        d[mask] = d[mask] - 0.5
        d[~mask] = d[~mask] + 0.5
        d = d.to(torch.int32)
    elif data_type == "FLOAT":
        d = d.to(torch.float32)

    return d
67
+
68
+
69
def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design two-pole all-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform(torch.Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    neg_two_cos_omega = -2 * torch.cos(omega)

    # All-pass: the numerator mirrors the denominator (cookbook APF form).
    a0 = 1 + alpha
    a1 = neg_two_cos_omega
    a2 = 1 - alpha
    b0 = 1 - alpha
    b1 = neg_two_cos_omega
    b2 = 1 + alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
105
+
106
+
107
def band_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    noise: bool = False,
) -> Tensor:
    r"""Design two-pole band filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).
        noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion).
            If ``False``, uses mode oriented to pitched audio, i.e. voice, singing,
            or instrumental music (Default: ``False``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    bandwidth_hz = central_freq / Q

    # Denominator (pole) coefficients derived from the filter bandwidth.
    a0 = 1.0
    a2 = torch.exp(-2 * math.pi * bandwidth_hz / sample_rate)
    a1 = -4 * a2 / (1 + a2) * torch.cos(omega)

    b0 = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2)

    if noise:
        # Alternate (un-pitched) mode: rescale the pass-band gain.
        mult = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / b0
        b0 = mult * b0

    b1 = 0.0
    b2 = 0.0

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
158
+
159
+
160
def bandpass_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    const_skirt_gain: bool = False,
) -> Tensor:
    r"""Design two-pole band-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
        const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q).
            If ``False``, uses a constant 0dB peak gain. (Default: ``False``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q

    # Constant-skirt variant uses sin(w0)/2 as the numerator gain (cookbook BPF).
    if const_skirt_gain:
        gain_term = torch.sin(omega) / 2
    else:
        gain_term = alpha

    b0 = gain_term
    b1 = 0.0
    b2 = -gain_term
    a0 = 1 + alpha
    a1 = -2 * torch.cos(omega)
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
204
+
205
+
206
def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design two-pole band-reject filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    neg_two_cos_omega = -2 * torch.cos(omega)

    # Notch filter: zeros on the unit circle at the center frequency.
    b0 = 1.0
    b1 = neg_two_cos_omega
    b2 = 1.0
    a0 = 1 + alpha
    a1 = neg_two_cos_omega
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
241
+
242
+
243
def bass_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 100,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a bass tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    omega = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    # Shelf amplitude: A = 10^(gain/40).
    A = torch.exp(gain / 40 * math.log(10))

    sqrt_term = 2 * torch.sqrt(A) * alpha
    cos_minus = (A - 1) * torch.cos(omega)
    cos_plus = (A + 1) * torch.cos(omega)

    # Low-shelf coefficients (audio EQ cookbook).
    b0 = A * ((A + 1) - cos_minus + sqrt_term)
    b1 = 2 * A * ((A - 1) - cos_plus)
    b2 = A * ((A + 1) - cos_minus - sqrt_term)
    a0 = (A + 1) + cos_minus + sqrt_term
    a1 = -2 * ((A - 1) + cos_plus)
    a2 = (A + 1) + cos_minus - sqrt_term

    # Normalize all coefficients by a0 before delegating to the generic biquad.
    return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0)
292
+
293
+
294
def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor:
    r"""Perform a biquad filter of input tensor. Initial conditions set to 0.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        b0 (float or torch.Tensor): numerator coefficient of current input, x[n]
        b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1]
        b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2]
        a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1
        a1 (float or torch.Tensor): denominator coefficient of current output y[n-1]
        a2 (float or torch.Tensor): denominator coefficient of current output y[n-2]

    Returns:
        Tensor: Waveform with dimension of `(..., time)`

    Reference:
        - https://en.wikipedia.org/wiki/Digital_biquad_filter
    """

    device = waveform.device
    dtype = waveform.dtype

    # Promote each scalar/tensor coefficient to a 1-element tensor on the
    # waveform's device/dtype so they can be concatenated below.
    b0 = torch.as_tensor(b0, dtype=dtype, device=device).view(1)
    b1 = torch.as_tensor(b1, dtype=dtype, device=device).view(1)
    b2 = torch.as_tensor(b2, dtype=dtype, device=device).view(1)
    a0 = torch.as_tensor(a0, dtype=dtype, device=device).view(1)
    a1 = torch.as_tensor(a1, dtype=dtype, device=device).view(1)
    a2 = torch.as_tensor(a2, dtype=dtype, device=device).view(1)

    a_coeffs = torch.cat([a0, a1, a2])
    b_coeffs = torch.cat([b0, b1, b2])
    return lfilter(waveform, a_coeffs, b_coeffs)
333
+
334
+
335
def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
    r"""Apply contrast effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Comparable with compression, this effect modifies an audio signal to make it sound louder

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        enhancement_amount (float, optional): controls the amount of the enhancement
            Allowed range of values for enhancement_amount : 0-100
            Note that enhancement_amount = 0 still gives a significant contrast enhancement

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """

    if not 0 <= enhancement_amount <= 100:
        raise ValueError("Allowed range of values for enhancement_amount : 0-100")

    # SoX scales the user-facing 0-100 range down by 750.
    scaled_amount = enhancement_amount / 750.0

    half_pi_wave = waveform * (math.pi / 2)
    modulation = scaled_amount * torch.sin(half_pi_wave * 4)
    return torch.sin(half_pi_wave + modulation)
367
+
368
+
369
+ def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor:
370
+ r"""Apply a DC shift to the audio. Similar to SoX implementation.
371
+
372
+ .. devices:: CPU CUDA
373
+
374
+ .. properties:: TorchScript
375
+
376
+ This can be useful to remove a DC offset
377
+ (caused perhaps by a hardware problem in the recording chain) from the audio
378
+
379
+ Args:
380
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
381
+ shift (float): indicates the amount to shift the audio
382
+ Allowed range of values for shift : -2.0 to +2.0
383
+ limiter_gain (float of None, optional): It is used only on peaks to prevent clipping
384
+ It should have a value much less than 1 (e.g. 0.05 or 0.02)
385
+
386
+ Returns:
387
+ Tensor: Waveform of dimension of `(..., time)`
388
+
389
+ Reference:
390
+ - http://sox.sourceforge.net/sox.html
391
+ """
392
+ output_waveform = waveform
393
+ limiter_threshold = 0.0
394
+
395
+ if limiter_gain is not None:
396
+ limiter_threshold = 1.0 - (abs(shift) - limiter_gain)
397
+
398
+ # Note:
399
+ # the following index-based update breaks auto-grad support
400
+ if limiter_gain is not None and shift > 0:
401
+ mask = waveform > limiter_threshold
402
+ temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold)
403
+ output_waveform[mask] = (temp + limiter_threshold + shift).clamp(max=limiter_threshold)
404
+ output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1)
405
+ elif limiter_gain is not None and shift < 0:
406
+ mask = waveform < -limiter_threshold
407
+ temp = (waveform[mask] + limiter_threshold) * limiter_gain / (1 - limiter_threshold)
408
+ output_waveform[mask] = (temp - limiter_threshold + shift).clamp(min=-limiter_threshold)
409
+ output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1)
410
+ else:
411
+ output_waveform = (waveform + shift).clamp(min=-1, max=1)
412
+
413
+ return output_waveform
414
+
415
+
416
def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
    r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000``

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """

    # SoX hard-codes the shelf parameters for the two supported rates.
    if sample_rate == 44100:
        central_freq, width_slope, gain = 5283, 0.4845, -9.477
    elif sample_rate == 48000:
        central_freq, width_slope, gain = 5356, 0.479, -9.62
    else:
        raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)")

    omega = 2 * math.pi * central_freq / sample_rate
    A = math.exp(gain / 40.0 * math.log(10))
    alpha = math.sin(omega) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2)

    sqrt_term = 2 * math.sqrt(A) * alpha
    cos_minus = (A - 1) * math.cos(omega)
    cos_plus = (A + 1) * math.cos(omega)

    # High-shelf coefficients (audio EQ cookbook).
    b0 = A * ((A + 1) + cos_minus + sqrt_term)
    b1 = -2 * A * ((A - 1) + cos_plus)
    b2 = A * ((A + 1) + cos_minus - sqrt_term)
    a0 = (A + 1) - cos_minus + sqrt_term
    a1 = 2 * ((A - 1) - cos_plus)
    a2 = (A + 1) - cos_minus - sqrt_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
462
+
463
+
464
+ def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor:
465
+ r"""Noise shaping is calculated by error:
466
+ error[n] = dithered[n] - original[n]
467
+ noise_shaped_waveform[n] = dithered[n] + error[n-1]
468
+ """
469
+ wf_shape = waveform.size()
470
+ waveform = waveform.reshape(-1, wf_shape[-1])
471
+
472
+ dithered_shape = dithered_waveform.size()
473
+ dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1])
474
+
475
+ error = dithered_waveform - waveform
476
+
477
+ # add error[n-1] to dithered_waveform[n], so offset the error by 1 index
478
+ zeros = torch.zeros(1, dtype=error.dtype, device=error.device)
479
+ for index in range(error.size()[0]):
480
+ err = error[index]
481
+ error_offset = torch.cat((zeros, err))
482
+ error[index] = error_offset[: waveform.size()[1]]
483
+
484
+ noise_shaped = dithered_waveform + error
485
+ return noise_shaped.reshape(dithered_shape[:-1] + noise_shaped.shape[-1:])
486
+
487
+
488
+ def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor:
489
+ r"""Apply a probability distribution function on a waveform.
490
+
491
+ Triangular probability density function (TPDF) dither noise has a
492
+ triangular distribution; values in the center of the range have a higher
493
+ probability of occurring.
494
+
495
+ Rectangular probability density function (RPDF) dither noise has a
496
+ uniform distribution; any value in the specified range has the same
497
+ probability of occurring.
498
+
499
+ Gaussian probability density function (GPDF) has a normal distribution.
500
+ The relationship of probabilities of results follows a bell-shaped,
501
+ or Gaussian curve, typical of dither generated by analog sources.
502
+ Args:
503
+ waveform (Tensor): Tensor of audio of dimension (..., time)
504
+ density_function (str, optional): The density function of a
505
+ continuous random variable (Default: ``"TPDF"``)
506
+ Options: Triangular Probability Density Function - `TPDF`
507
+ Rectangular Probability Density Function - `RPDF`
508
+ Gaussian Probability Density Function - `GPDF`
509
+ Returns:
510
+ Tensor: waveform dithered with TPDF
511
+ """
512
+
513
+ # pack batch
514
+ shape = waveform.size()
515
+ waveform = waveform.reshape(-1, shape[-1])
516
+
517
+ channel_size = waveform.size()[0] - 1
518
+ time_size = waveform.size()[-1] - 1
519
+
520
+ random_channel = (
521
+ int(
522
+ torch.randint(
523
+ channel_size,
524
+ [
525
+ 1,
526
+ ],
527
+ ).item()
528
+ )
529
+ if channel_size > 0
530
+ else 0
531
+ )
532
+ random_time = (
533
+ int(
534
+ torch.randint(
535
+ time_size,
536
+ [
537
+ 1,
538
+ ],
539
+ ).item()
540
+ )
541
+ if time_size > 0
542
+ else 0
543
+ )
544
+
545
+ number_of_bits = 16
546
+ up_scaling = 2 ** (number_of_bits - 1) - 2
547
+ signal_scaled = waveform * up_scaling
548
+ down_scaling = 2 ** (number_of_bits - 1)
549
+
550
+ signal_scaled_dis = waveform
551
+ if density_function == "RPDF":
552
+ RPDF = waveform[random_channel][random_time] - 0.5
553
+
554
+ signal_scaled_dis = signal_scaled + RPDF
555
+ elif density_function == "GPDF":
556
+ # TODO Replace by distribution code once
557
+ # https://github.com/pytorch/pytorch/issues/29843 is resolved
558
+ # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample()
559
+
560
+ num_rand_variables = 6
561
+
562
+ gaussian = waveform[random_channel][random_time]
563
+ for ws in num_rand_variables * [time_size]:
564
+ rand_chan = int(
565
+ torch.randint(
566
+ channel_size,
567
+ [
568
+ 1,
569
+ ],
570
+ ).item()
571
+ )
572
+ gaussian += waveform[rand_chan][
573
+ int(
574
+ torch.randint(
575
+ ws,
576
+ [
577
+ 1,
578
+ ],
579
+ ).item()
580
+ )
581
+ ]
582
+
583
+ signal_scaled_dis = signal_scaled + gaussian
584
+ else:
585
+ # dtype needed for https://github.com/pytorch/pytorch/issues/32358
586
+ TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device)
587
+ TPDF = TPDF.repeat((channel_size + 1), 1)
588
+ signal_scaled_dis = signal_scaled + TPDF
589
+
590
+ quantised_signal_scaled = torch.round(signal_scaled_dis)
591
+ quantised_signal = quantised_signal_scaled / down_scaling
592
+
593
+ # unpack batch
594
+ return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:])
595
+
596
+
597
+ def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor:
598
+ r"""Apply dither
599
+
600
+ .. devices:: CPU CUDA
601
+
602
+ .. properties:: TorchScript
603
+
604
+ Dither increases the perceived dynamic range of audio stored at a
605
+ particular bit-depth by eliminating nonlinear truncation distortion
606
+ (i.e. adding minimally perceived noise to mask distortion caused by quantization).
607
+
608
+ Args:
609
+ waveform (Tensor): Tensor of audio of dimension (..., time)
610
+ density_function (str, optional):
611
+ The density function of a continuous random variable. One of
612
+ ``"TPDF"`` (Triangular Probability Density Function),
613
+ ``"RPDF"`` (Rectangular Probability Density Function) or
614
+ ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``).
615
+ noise_shaping (bool, optional): a filtering process that shapes the spectral
616
+ energy of quantisation error (Default: ``False``)
617
+
618
+ Returns:
619
+ Tensor: waveform dithered
620
+ """
621
+ dithered = _apply_probability_distribution(waveform, density_function=density_function)
622
+
623
+ if noise_shaping:
624
+ return _add_noise_shaping(dithered, waveform)
625
+ else:
626
+ return dithered
627
+
628
+
629
+ def equalizer_biquad(
630
+ waveform: Tensor,
631
+ sample_rate: int,
632
+ center_freq: float,
633
+ gain: float,
634
+ Q: float = 0.707,
635
+ ) -> Tensor:
636
+ r"""Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation.
637
+
638
+ .. devices:: CPU CUDA
639
+
640
+ .. properties:: Autograd TorchScript
641
+
642
+ Args:
643
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
644
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
645
+ center_freq (float): filter's central frequency
646
+ gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB
647
+ Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
648
+
649
+ Returns:
650
+ Tensor: Waveform of dimension of `(..., time)`
651
+ """
652
+ dtype = waveform.dtype
653
+ device = waveform.device
654
+ center_freq = torch.as_tensor(center_freq, dtype=dtype, device=device)
655
+ Q = torch.as_tensor(Q, dtype=dtype, device=device)
656
+ gain = torch.as_tensor(gain, dtype=dtype, device=device)
657
+
658
+ w0 = 2 * math.pi * center_freq / sample_rate
659
+ A = torch.exp(gain / 40.0 * math.log(10))
660
+ alpha = torch.sin(w0) / 2 / Q
661
+
662
+ b0 = 1 + alpha * A
663
+ b1 = -2 * torch.cos(w0)
664
+ b2 = 1 - alpha * A
665
+ a0 = 1 + alpha / A
666
+ a1 = -2 * torch.cos(w0)
667
+ a2 = 1 - alpha / A
668
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
669
+
670
+
671
+ def filtfilt(
672
+ waveform: Tensor,
673
+ a_coeffs: Tensor,
674
+ b_coeffs: Tensor,
675
+ clamp: bool = True,
676
+ ) -> Tensor:
677
+ r"""Apply an IIR filter forward and backward to a waveform.
678
+
679
+ .. devices:: CPU CUDA
680
+
681
+ .. properties:: Autograd TorchScript
682
+
683
+ Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html
684
+
685
+ Args:
686
+ waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1.
687
+ a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
688
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
689
+ Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``.
690
+ Must be same size as b_coeffs (pad with 0's as necessary).
691
+ b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
692
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
693
+ Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``.
694
+ Must be same size as a_coeffs (pad with 0's as necessary).
695
+ clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
696
+
697
+ Returns:
698
+ Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
699
+ are 2D Tensors, or `(..., time)` otherwise.
700
+ """
701
+ forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True)
702
+ backward_filtered = lfilter(
703
+ forward_filtered.flip(-1),
704
+ a_coeffs,
705
+ b_coeffs,
706
+ clamp=clamp,
707
+ batching=True,
708
+ ).flip(-1)
709
+ return backward_filtered
710
+
711
+
712
+ def flanger(
713
+ waveform: Tensor,
714
+ sample_rate: int,
715
+ delay: float = 0.0,
716
+ depth: float = 2.0,
717
+ regen: float = 0.0,
718
+ width: float = 71.0,
719
+ speed: float = 0.5,
720
+ phase: float = 25.0,
721
+ modulation: str = "sinusoidal",
722
+ interpolation: str = "linear",
723
+ ) -> Tensor:
724
+ r"""Apply a flanger effect to the audio. Similar to SoX implementation.
725
+
726
+ .. devices:: CPU CUDA
727
+
728
+ .. properties:: Autograd TorchScript
729
+
730
+ Args:
731
+ waveform (Tensor): audio waveform of dimension of `(..., channel, time)` .
732
+ Max 4 channels allowed
733
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
734
+ delay (float, optional): desired delay in milliseconds(ms)
735
+ Allowed range of values are 0 to 30
736
+ depth (float, optional): desired delay depth in milliseconds(ms)
737
+ Allowed range of values are 0 to 10
738
+ regen (float, optional): desired regen(feedback gain) in dB
739
+ Allowed range of values are -95 to 95
740
+ width (float, optional): desired width(delay gain) in dB
741
+ Allowed range of values are 0 to 100
742
+ speed (float, optional): modulation speed in Hz
743
+ Allowed range of values are 0.1 to 10
744
+ phase (float, optional): percentage phase-shift for multi-channel
745
+ Allowed range of values are 0 to 100
746
+ modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``)
747
+ interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation.
748
+ (Default: ``linear``)
749
+
750
+ Returns:
751
+ Tensor: Waveform of dimension of `(..., channel, time)`
752
+
753
+ Reference:
754
+ - http://sox.sourceforge.net/sox.html
755
+
756
+ - Scott Lehman, `Effects Explained`_,
757
+
758
+ .. _Effects Explained:
759
+ https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
760
+ """
761
+
762
+ if modulation not in ("sinusoidal", "triangular"):
763
+ raise ValueError('Only "sinusoidal" or "triangular" modulation allowed')
764
+
765
+ if interpolation not in ("linear", "quadratic"):
766
+ raise ValueError('Only "linear" or "quadratic" interpolation allowed')
767
+
768
+ actual_shape = waveform.shape
769
+ device, dtype = waveform.device, waveform.dtype
770
+
771
+ if actual_shape[-2] > 4:
772
+ raise ValueError("Max 4 channels allowed")
773
+
774
+ # convert to 3D (batch, channels, time)
775
+ waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1])
776
+
777
+ # Scaling
778
+ feedback_gain = regen / 100
779
+ delay_gain = width / 100
780
+ channel_phase = phase / 100
781
+ delay_min = delay / 1000
782
+ delay_depth = depth / 1000
783
+
784
+ n_channels = waveform.shape[-2]
785
+
786
+ if modulation == "sinusoidal":
787
+ wave_type = "SINE"
788
+ else:
789
+ wave_type = "TRIANGLE"
790
+
791
+ # Balance output:
792
+ in_gain = 1.0 / (1 + delay_gain)
793
+ delay_gain = delay_gain / (1 + delay_gain)
794
+
795
+ # Balance feedback loop:
796
+ delay_gain = delay_gain * (1 - abs(feedback_gain))
797
+
798
+ delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5)
799
+ delay_buf_length = delay_buf_length + 2
800
+
801
+ delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device)
802
+ delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device)
803
+
804
+ lfo_length = int(sample_rate / speed)
805
+
806
+ table_min = math.floor(delay_min * sample_rate + 0.5)
807
+ table_max = delay_buf_length - 2.0
808
+
809
+ lfo = _generate_wave_table(
810
+ wave_type=wave_type,
811
+ data_type="FLOAT",
812
+ table_size=lfo_length,
813
+ min=float(table_min),
814
+ max=float(table_max),
815
+ phase=3 * math.pi / 2,
816
+ device=device,
817
+ )
818
+
819
+ output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
820
+
821
+ delay_buf_pos = 0
822
+ lfo_pos = 0
823
+ channel_idxs = torch.arange(0, n_channels, device=device)
824
+
825
+ for i in range(waveform.shape[-1]):
826
+
827
+ delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length
828
+
829
+ cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64)
830
+ delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length]
831
+ frac_delay = torch.frac(delay_tensor)
832
+ delay_tensor = torch.floor(delay_tensor)
833
+
834
+ int_delay = delay_tensor.to(torch.int64)
835
+
836
+ temp = waveform[:, :, i]
837
+
838
+ delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain
839
+
840
+ delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
841
+
842
+ int_delay = int_delay + 1
843
+
844
+ delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
845
+
846
+ int_delay = int_delay + 1
847
+
848
+ if interpolation == "linear":
849
+ delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay
850
+ else:
851
+ delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]
852
+
853
+ int_delay = int_delay + 1
854
+
855
+ delayed_2 = delayed_2 - delayed_0
856
+ delayed_1 = delayed_1 - delayed_0
857
+ a = delayed_2 * 0.5 - delayed_1
858
+ b = delayed_1 * 2 - delayed_2 * 0.5
859
+
860
+ delayed = delayed_0 + (a * frac_delay + b) * frac_delay
861
+
862
+ delay_last = delayed
863
+ output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain
864
+
865
+ lfo_pos = (lfo_pos + 1) % lfo_length
866
+
867
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
868
+
869
+
870
+ def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
871
+ r"""Apply amplification or attenuation to the whole waveform.
872
+
873
+ .. devices:: CPU CUDA
874
+
875
+ .. properties:: Autograd TorchScript
876
+
877
+ Args:
878
+ waveform (Tensor): Tensor of audio of dimension (..., time).
879
+ gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``).
880
+
881
+ Returns:
882
+ Tensor: the whole waveform amplified by gain_db.
883
+ """
884
+ if gain_db == 0:
885
+ return waveform
886
+
887
+ ratio = 10 ** (gain_db / 20)
888
+
889
+ return waveform * ratio
890
+
891
+
892
+ def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
893
+ r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.
894
+
895
+ .. devices:: CPU CUDA
896
+
897
+ .. properties:: Autograd TorchScript
898
+
899
+ Args:
900
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
901
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
902
+ cutoff_freq (float or torch.Tensor): filter cutoff frequency
903
+ Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
904
+
905
+ Returns:
906
+ Tensor: Waveform dimension of `(..., time)`
907
+ """
908
+ dtype = waveform.dtype
909
+ device = waveform.device
910
+ cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device)
911
+ Q = torch.as_tensor(Q, dtype=dtype, device=device)
912
+
913
+ w0 = 2 * math.pi * cutoff_freq / sample_rate
914
+ alpha = torch.sin(w0) / 2.0 / Q
915
+
916
+ b0 = (1 + torch.cos(w0)) / 2
917
+ b1 = -1 - torch.cos(w0)
918
+ b2 = b0
919
+ a0 = 1 + alpha
920
+ a1 = -2 * torch.cos(w0)
921
+ a2 = 1 - alpha
922
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
923
+
924
+
925
+ def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor):
926
+ n_order = a_coeffs_flipped.size(1)
927
+ a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2)
928
+ for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)):
929
+ windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order]
930
+ o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t()
931
+ padded_output_waveform[:, :, i_sample + n_order - 1] = o0
932
+
933
+
934
+ if _IS_TORCHAUDIO_EXT_AVAILABLE:
935
+ _lfilter_core_cpu_loop = torch.ops.torchaudio._lfilter_core_loop
936
+ else:
937
+ _lfilter_core_cpu_loop = _lfilter_core_generic_loop
938
+
939
+
940
+ def _lfilter_core(
941
+ waveform: Tensor,
942
+ a_coeffs: Tensor,
943
+ b_coeffs: Tensor,
944
+ ) -> Tensor:
945
+
946
+ if a_coeffs.size() != b_coeffs.size():
947
+ raise ValueError(
948
+ "Expected coeffs to be the same size."
949
+ f"Found a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
950
+ )
951
+ if waveform.ndim != 3:
952
+ raise ValueError(f"Expected waveform to be 3 dimensional. Found: {waveform.ndim}")
953
+ if not (waveform.device == a_coeffs.device == b_coeffs.device):
954
+ raise ValueError(
955
+ "Expected waveform and coeffs to be on the same device."
956
+ f"Found: waveform device:{waveform.device}, a_coeffs device: {a_coeffs.device}, "
957
+ f"b_coeffs device: {b_coeffs.device}"
958
+ )
959
+
960
+ n_batch, n_channel, n_sample = waveform.size()
961
+ n_order = a_coeffs.size(1)
962
+ if n_order <= 0:
963
+ raise ValueError(f"Expected n_order to be positive. Found: {n_order}")
964
+
965
+ # Pad the input and create output
966
+
967
+ padded_waveform = torch.nn.functional.pad(waveform, [n_order - 1, 0])
968
+ padded_output_waveform = torch.zeros_like(padded_waveform)
969
+
970
+ # Set up the coefficients matrix
971
+ # Flip coefficients' order
972
+ a_coeffs_flipped = a_coeffs.flip(1)
973
+ b_coeffs_flipped = b_coeffs.flip(1)
974
+
975
+ # calculate windowed_input_signal in parallel using convolution
976
+ input_signal_windows = torch.nn.functional.conv1d(padded_waveform, b_coeffs_flipped.unsqueeze(1), groups=n_channel)
977
+
978
+ input_signal_windows.div_(a_coeffs[:, :1])
979
+ a_coeffs_flipped.div_(a_coeffs[:, :1])
980
+
981
+ if (
982
+ input_signal_windows.device == torch.device("cpu")
983
+ and a_coeffs_flipped.device == torch.device("cpu")
984
+ and padded_output_waveform.device == torch.device("cpu")
985
+ ):
986
+ _lfilter_core_cpu_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform)
987
+ else:
988
+ _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform)
989
+
990
+ output = padded_output_waveform[:, :, n_order - 1 :]
991
+ return output
992
+
993
+
994
+ if _IS_TORCHAUDIO_EXT_AVAILABLE:
995
+ _lfilter = torch.ops.torchaudio._lfilter
996
+ else:
997
+ _lfilter = _lfilter_core
998
+
999
+
1000
+ def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor:
1001
+ r"""Perform an IIR filter by evaluating difference equation, using differentiable implementation
1002
+ developed independently by *Yu et al.* :cite:`ismir_YuF23` and *Forgione et al.* :cite:`forgione2021dynonet`.
1003
+
1004
+ .. devices:: CPU CUDA
1005
+
1006
+ .. properties:: Autograd TorchScript
1007
+
1008
+ Note:
1009
+ To avoid numerical problems, small filter order is preferred.
1010
+ Using double precision could also minimize numerical precision errors.
1011
+
1012
+ Args:
1013
+ waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1.
1014
+ a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
1015
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
1016
+ Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``.
1017
+ Must be same size as b_coeffs (pad with 0's as necessary).
1018
+ b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
1019
+ 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
1020
+ Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``.
1021
+ Must be same size as a_coeffs (pad with 0's as necessary).
1022
+ clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
1023
+ batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at
1024
+ least 2D, and the size of second axis from last should equals to ``num_filters``.
1025
+ The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :],
1026
+ a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``)
1027
+
1028
+ Returns:
1029
+ Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
1030
+ are 2D Tensors, or `(..., time)` otherwise.
1031
+ """
1032
+ if a_coeffs.size() != b_coeffs.size():
1033
+ raise ValueError(
1034
+ "Expected coeffs to be the same size."
1035
+ f"Found: a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
1036
+ )
1037
+ if a_coeffs.ndim > 2:
1038
+ raise ValueError(f"Expected coeffs to have greater than 1 dimension. Found: {a_coeffs.ndim}")
1039
+
1040
+ if a_coeffs.ndim > 1:
1041
+ if batching:
1042
+ if waveform.ndim <= 0:
1043
+ raise ValueError("Expected waveform to have a positive number of dimensions." f"Found: {waveform.ndim}")
1044
+ if waveform.shape[-2] != a_coeffs.shape[0]:
1045
+ raise ValueError(
1046
+ "Expected number of batches in waveform and coeffs to be the same."
1047
+ f"Found: coeffs batches: {a_coeffs.shape[0]}, waveform batches: {waveform.shape[-2]}"
1048
+ )
1049
+ else:
1050
+ waveform = torch.stack([waveform] * a_coeffs.shape[0], -2)
1051
+ else:
1052
+ a_coeffs = a_coeffs.unsqueeze(0)
1053
+ b_coeffs = b_coeffs.unsqueeze(0)
1054
+
1055
+ # pack batch
1056
+ shape = waveform.size()
1057
+ waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1])
1058
+ output = _lfilter(waveform, a_coeffs, b_coeffs)
1059
+
1060
+ if clamp:
1061
+ output = torch.clamp(output, min=-1.0, max=1.0)
1062
+
1063
+ # unpack batch
1064
+ output = output.reshape(shape[:-1] + output.shape[-1:])
1065
+
1066
+ return output
1067
+
1068
+
1069
+ def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
1070
+ r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
1071
+
1072
+ .. devices:: CPU CUDA
1073
+
1074
+ .. properties:: Autograd TorchScript
1075
+
1076
+ Args:
1077
+ waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
1078
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
1079
+ cutoff_freq (float or torch.Tensor): filter cutoff frequency
1080
+ Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
1081
+
1082
+ Returns:
1083
+ Tensor: Waveform of dimension of `(..., time)`
1084
+ """
1085
+ dtype = waveform.dtype
1086
+ device = waveform.device
1087
+ cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device)
1088
+ Q = torch.as_tensor(Q, dtype=dtype, device=device)
1089
+
1090
+ w0 = 2 * math.pi * cutoff_freq / sample_rate
1091
+ alpha = torch.sin(w0) / 2 / Q
1092
+
1093
+ b0 = (1 - torch.cos(w0)) / 2
1094
+ b1 = 1 - torch.cos(w0)
1095
+ b2 = b0
1096
+ a0 = 1 + alpha
1097
+ a1 = -2 * torch.cos(w0)
1098
+ a2 = 1 - alpha
1099
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
1100
+
1101
+
1102
+ def _overdrive_core_loop_generic(
1103
+ waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor
1104
+ ):
1105
+ for i in range(waveform.shape[-1]):
1106
+ last_out = temp[:, i] - last_in + 0.995 * last_out
1107
+ last_in = temp[:, i]
1108
+ output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
1109
+
1110
+
1111
+ if _IS_TORCHAUDIO_EXT_AVAILABLE:
1112
+ _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
1113
+ else:
1114
+ _overdrive_core_loop_cpu = _overdrive_core_loop_generic
1115
+
1116
+
1117
+ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
1118
+ r"""Apply a overdrive effect to the audio. Similar to SoX implementation.
1119
+
1120
+ .. devices:: CPU CUDA
1121
+
1122
+ .. properties:: Autograd TorchScript
1123
+
1124
+ This effect applies a non linear distortion to the audio signal.
1125
+
1126
+ Args:
1127
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
1128
+ gain (float, optional): desired gain at the boost (or attenuation) in dB
1129
+ Allowed range of values are 0 to 100
1130
+ colour (float, optional): controls the amount of even harmonic content in the over-driven output
1131
+ Allowed range of values are 0 to 100
1132
+
1133
+ Returns:
1134
+ Tensor: Waveform of dimension of `(..., time)`
1135
+
1136
+ Reference:
1137
+ - http://sox.sourceforge.net/sox.html
1138
+ """
1139
+ actual_shape = waveform.shape
1140
+ device, dtype = waveform.device, waveform.dtype
1141
+
1142
+ # convert to 2D (..,time)
1143
+ waveform = waveform.view(-1, actual_shape[-1])
1144
+
1145
+ gain = _dB2Linear(gain)
1146
+ colour = colour / 200
1147
+ last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)
1148
+ last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)
1149
+
1150
+ temp = waveform * gain + colour
1151
+
1152
+ mask1 = temp < -1
1153
+ temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device)
1154
+ # Wrapping the constant with Tensor is required for Torchscript
1155
+
1156
+ mask2 = temp > 1
1157
+ temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device)
1158
+
1159
+ mask3 = ~mask1 & ~mask2
1160
+ temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3)
1161
+
1162
+ output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
1163
+
1164
+ # Uses CPU optimized loop function if available for CPU device
1165
+ if device == torch.device("cpu"):
1166
+ _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
1167
+ else:
1168
+ _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform)
1169
+
1170
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
1171
+
1172
+
1173
+ def phaser(
1174
+ waveform: Tensor,
1175
+ sample_rate: int,
1176
+ gain_in: float = 0.4,
1177
+ gain_out: float = 0.74,
1178
+ delay_ms: float = 3.0,
1179
+ decay: float = 0.4,
1180
+ mod_speed: float = 0.5,
1181
+ sinusoidal: bool = True,
1182
+ ) -> Tensor:
1183
+ r"""Apply a phasing effect to the audio. Similar to SoX implementation.
1184
+
1185
+ .. devices:: CPU CUDA
1186
+
1187
+ .. properties:: Autograd TorchScript
1188
+
1189
+ Args:
1190
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
1191
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
1192
+ gain_in (float, optional): desired input gain at the boost (or attenuation) in dB
1193
+ Allowed range of values are 0 to 1
1194
+ gain_out (float, optional): desired output gain at the boost (or attenuation) in dB
1195
+ Allowed range of values are 0 to 1e9
1196
+ delay_ms (float, optional): desired delay in milliseconds
1197
+ Allowed range of values are 0 to 5.0
1198
+ decay (float, optional): desired decay relative to gain-in
1199
+ Allowed range of values are 0 to 0.99
1200
+ mod_speed (float, optional): modulation speed in Hz
1201
+ Allowed range of values are 0.1 to 2
1202
+ sinusoidal (bool, optional): If ``True``, uses sinusoidal modulation (preferable for multiple instruments)
1203
+ If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect)
1204
+ (Default: ``True``)
1205
+
1206
+ Returns:
1207
+ Tensor: Waveform of dimension of `(..., time)`
1208
+
1209
+ Reference:
1210
+ - http://sox.sourceforge.net/sox.html
1211
+ - Scott Lehman, `Effects Explained`_.
1212
+
1213
+ .. _Effects Explained:
1214
+ https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
1215
+ """
1216
+ actual_shape = waveform.shape
1217
+ device, dtype = waveform.device, waveform.dtype
1218
+
1219
+ # convert to 2D (channels,time)
1220
+ waveform = waveform.view(-1, actual_shape[-1])
1221
+
1222
+ delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5)
1223
+ delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device)
1224
+
1225
+ mod_buf_len = int(sample_rate / mod_speed + 0.5)
1226
+
1227
+ if sinusoidal:
1228
+ wave_type = "SINE"
1229
+ else:
1230
+ wave_type = "TRIANGLE"
1231
+
1232
+ mod_buf = _generate_wave_table(
1233
+ wave_type=wave_type,
1234
+ data_type="INT",
1235
+ table_size=mod_buf_len,
1236
+ min=1.0,
1237
+ max=float(delay_buf_len),
1238
+ phase=math.pi / 2,
1239
+ device=device,
1240
+ )
1241
+
1242
+ delay_pos = 0
1243
+ mod_pos = 0
1244
+
1245
+ output_waveform_pre_gain_list = []
1246
+ waveform = waveform * gain_in
1247
+ delay_buf = delay_buf * decay
1248
+ waveform_list = [waveform[:, i] for i in range(waveform.size(1))]
1249
+ delay_buf_list = [delay_buf[:, i] for i in range(delay_buf.size(1))]
1250
+ mod_buf_list = [mod_buf[i] for i in range(mod_buf.size(0))]
1251
+
1252
+ for i in range(waveform.shape[-1]):
1253
+ idx = int((delay_pos + mod_buf_list[mod_pos]) % delay_buf_len)
1254
+ mod_pos = (mod_pos + 1) % mod_buf_len
1255
+ delay_pos = (delay_pos + 1) % delay_buf_len
1256
+ temp = (waveform_list[i]) + (delay_buf_list[idx])
1257
+ delay_buf_list[delay_pos] = temp * decay
1258
+ output_waveform_pre_gain_list.append(temp)
1259
+
1260
+ output_waveform = torch.stack(output_waveform_pre_gain_list, dim=1).to(dtype=dtype, device=device)
1261
+ output_waveform.mul_(gain_out)
1262
+
1263
+ return output_waveform.clamp(min=-1, max=1).view(actual_shape)
1264
+
1265
+
1266
+ def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
1267
+ r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation.
1268
+
1269
+ .. devices:: CPU CUDA
1270
+
1271
+ .. properties:: Autograd TorchScript
1272
+
1273
+ Args:
1274
+ waveform (Tensor): audio waveform of dimension of `(..., time)`
1275
+ sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz).
1276
+ Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000``
1277
+
1278
+ Returns:
1279
+ Tensor: Waveform of dimension of `(..., time)`
1280
+
1281
+ Reference:
1282
+ - http://sox.sourceforge.net/sox.html
1283
+ - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
1284
+ """
1285
+
1286
+ if sample_rate == 44100:
1287
+ zeros = [-0.2014898, 0.9233820]
1288
+ poles = [0.7083149, 0.9924091]
1289
+
1290
+ elif sample_rate == 48000:
1291
+ zeros = [-0.1766069, 0.9321590]
1292
+ poles = [0.7396325, 0.9931330]
1293
+
1294
+ elif sample_rate == 88200:
1295
+ zeros = [-0.1168735, 0.9648312]
1296
+ poles = [0.8590646, 0.9964002]
1297
+
1298
+ elif sample_rate == 96000:
1299
+ zeros = [-0.1141486, 0.9676817]
1300
+ poles = [0.8699137, 0.9966946]
1301
+
1302
+ else:
1303
+ raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k")
1304
+
1305
+ # polynomial coefficients with roots zeros[0] and zeros[1]
1306
+ b0 = 1.0
1307
+ b1 = -(zeros[0] + zeros[1])
1308
+ b2 = zeros[0] * zeros[1]
1309
+
1310
+ # polynomial coefficients with roots poles[0] and poles[1]
1311
+ a0 = 1.0
1312
+ a1 = -(poles[0] + poles[1])
1313
+ a2 = poles[0] * poles[1]
1314
+
1315
+ # Normalize to 0dB at 1kHz
1316
+ y = 2 * math.pi * 1000 / sample_rate
1317
+ b_re = b0 + b1 * math.cos(-y) + b2 * math.cos(-2 * y)
1318
+ a_re = a0 + a1 * math.cos(-y) + a2 * math.cos(-2 * y)
1319
+ b_im = b1 * math.sin(-y) + b2 * math.sin(-2 * y)
1320
+ a_im = a1 * math.sin(-y) + a2 * math.sin(-2 * y)
1321
+ g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2))
1322
+
1323
+ b0 *= g
1324
+ b1 *= g
1325
+ b2 *= g
1326
+
1327
+ return biquad(waveform, b0, b1, b2, a0, a1, a2)
1328
+
1329
+
1330
def treble_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 3000,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a treble tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    # Promote the scalar parameters to tensors on the input's device/dtype so
    # the coefficient arithmetic below stays tensor-valued (autograd-friendly).
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    # Normalized angular frequency and shelf amplitude (A = 10 ** (gain / 40)).
    w0 = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(w0) / 2 / Q
    A = torch.exp(gain / 40 * math.log(10))

    # Shared sub-expressions of the cookbook high-shelf formulas.
    cos_w0 = torch.cos(w0)
    sqrt_term = 2 * torch.sqrt(A) * alpha
    shelf_minus = (A - 1) * cos_w0
    shelf_plus = (A + 1) * cos_w0

    # Biquad numerator (b*) and denominator (a*) coefficients.
    b0 = A * ((A + 1) + shelf_minus + sqrt_term)
    b1 = -2 * A * ((A - 1) + shelf_plus)
    b2 = A * ((A + 1) + shelf_minus - sqrt_term)
    a0 = (A + 1) - shelf_minus + sqrt_term
    a1 = 2 * ((A - 1) - shelf_plus)
    a2 = (A + 1) - shelf_minus - sqrt_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
1379
+
1380
+
1381
def _measure(
    measure_len_ws: int,
    samples: Tensor,
    spectrum: Tensor,
    noise_spectrum: Tensor,
    spectrum_window: Tensor,
    spectrum_start: int,
    spectrum_end: int,
    cepstrum_window: Tensor,
    cepstrum_start: int,
    cepstrum_end: int,
    noise_reduction_amount: float,
    measure_smooth_time_mult: float,
    noise_up_time_mult: Tensor,
    noise_down_time_mult: Tensor,
    boot_count: int,
) -> float:
    """Compute one cepstral power measurement for a single measurement window.

    Port of the measurement step of SoX's ``vad`` effect. NOTE: this function
    mutates ``spectrum`` and ``noise_spectrum`` in place (running smoothed
    estimates carried across successive calls).

    Args:
        measure_len_ws (int): number of samples in the measurement window.
        samples (Tensor): 1D window of audio samples of length ``measure_len_ws``.
        spectrum (Tensor): running smoothed magnitude spectrum; updated in place.
        noise_spectrum (Tensor): running noise-power estimate; updated in place.
        spectrum_window (Tensor): analysis window applied to ``samples``.
        spectrum_start (int): first spectrum bin used (high-pass bound).
        spectrum_end (int): one past the last spectrum bin used (low-pass bound).
        cepstrum_window (Tensor): window applied before the second transform.
        cepstrum_start (int): first cepstrum bin summed.
        cepstrum_end (int): one past the last cepstrum bin summed.
        noise_reduction_amount (float): multiple of the noise estimate subtracted
            before the cepstral transform.
        measure_smooth_time_mult (float): smoothing coefficient used once booted.
        noise_up_time_mult (Tensor): smoothing coefficient when noise is rising.
        noise_down_time_mult (Tensor): smoothing coefficient when noise is falling.
        boot_count (int): >= 0 while still in the boot (initial noise estimation)
            phase; -1 afterwards.

    Returns:
        float: non-negative measurement value (0 when no cepstral power).
    """
    device = samples.device

    if spectrum.size(-1) != noise_spectrum.size(-1):
        raise ValueError(
            "Expected spectrum size to match noise spectrum size in final dimension."
            f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}"
        )

    dft_len_ws = spectrum.size()[-1]

    # Zero-padded buffer of DFT length; only the first measure_len_ws entries
    # carry the windowed samples.
    dftBuf = torch.zeros(dft_len_ws, device=device)

    dftBuf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws]

    # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf);
    _dftBuf = torch.fft.rfft(dftBuf)

    # During boot, weight history progressively (count/(count+1)); afterwards
    # use the fixed smoothing constant.
    mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult

    # Exponentially smooth the magnitude spectrum in place, then square it.
    _d = _dftBuf[spectrum_start:spectrum_end].abs()
    spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult))
    _d = spectrum[spectrum_start:spectrum_end] ** 2

    # Noise-estimate smoothing: zero multiplier during boot (estimate tracks
    # the power directly); otherwise pick per-bin up/down time constants.
    _zeros = torch.zeros(spectrum_end - spectrum_start, device=device)
    _mult = (
        _zeros
        if boot_count >= 0
        else torch.where(
            _d > noise_spectrum[spectrum_start:spectrum_end],
            noise_up_time_mult,  # if
            noise_down_time_mult,  # else,
        )
    )

    noise_spectrum[spectrum_start:spectrum_end].mul_(_mult).add_(_d * (1 - _mult))
    # Spectral subtraction, clamped at zero before the square root.
    _d = torch.sqrt(
        torch.max(
            _zeros,
            _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end],
        ),
    )

    # Second (half-length) transform of the liftered spectrum -> cepstrum.
    _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device)
    _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window
    _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_()

    # lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf);
    _cepstrum_Buf = torch.fft.rfft(_cepstrum_Buf)

    # Mean log cepstral power over the lifter band, offset by 21 and floored
    # at 0 (the constant comes from the SoX implementation).
    result: float = float(torch.sum(_cepstrum_Buf[cepstrum_start:cepstrum_end].abs().pow(2)))
    result = math.log(result / (cepstrum_end - cepstrum_start)) if result > 0 else -math.inf
    return max(0, 21 + result)
1450
+
1451
+
1452
def vad(
    waveform: Tensor,
    sample_rate: int,
    trigger_level: float = 7.0,
    trigger_time: float = 0.25,
    search_time: float = 1.0,
    allowed_gap: float = 0.25,
    pre_trigger_time: float = 0.0,
    # Fine-tuning parameters
    boot_time: float = 0.35,
    noise_up_time: float = 0.1,
    noise_down_time: float = 0.01,
    noise_reduction_amount: float = 1.35,
    measure_freq: float = 20.0,
    measure_duration: Optional[float] = None,
    measure_smooth_time: float = 0.4,
    hp_filter_freq: float = 50.0,
    lp_filter_freq: float = 6000.0,
    hp_lifter_freq: float = 150.0,
    lp_lifter_freq: float = 2000.0,
) -> Tensor:
    r"""Voice Activity Detector. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
    The algorithm currently uses a simple cepstral power measurement to detect voice,
    so may be fooled by other things, especially music.

    The effect can trim only from the front of the audio,
    so in order to trim from the back, the reverse effect must also be used.

    Args:
        waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`
            Tensor of shape `(channels, time)` is treated as a multi-channel recording
            of the same event and the resulting output will be trimmed to the earliest
            voice activity in any channel.
        sample_rate (int): Sample rate of audio signal.
        trigger_level (float, optional): The measurement level used to trigger activity detection.
            This may need to be changed depending on the noise level, signal level,
            and other characteristics of the input audio. (Default: 7.0)
        trigger_time (float, optional): The time constant (in seconds)
            used to help ignore short bursts of sound. (Default: 0.25)
        search_time (float, optional): The amount of audio (in seconds)
            to search for quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 1.0)
        allowed_gap (float, optional): The allowed gap (in seconds) between
            quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 0.25)
        pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
            before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
        boot_time (float, optional): The algorithm (internally) uses adaptive noise
            estimation/reduction in order to detect the start of the wanted audio.
            This option sets the time for the initial noise estimate. (Default: 0.35)
        noise_up_time (float, optional): Time constant used by the adaptive noise estimator
            for when the noise level is increasing. (Default: 0.1)
        noise_down_time (float, optional): Time constant used by the adaptive noise estimator
            for when the noise level is decreasing. (Default: 0.01)
        noise_reduction_amount (float, optional): Amount of noise reduction to use in
            the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
        measure_freq (float, optional): Frequency of the algorithm's
            processing/measurements. (Default: 20.0)
        measure_duration (float, optional): Measurement duration.
            (Default: Twice the measurement period; i.e. with overlap.)
        measure_smooth_time (float, optional): Time constant used to smooth
            spectral measurements. (Default: 0.4)
        hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied
            at the input to the detector algorithm. (Default: 50.0)
        lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied
            at the input to the detector algorithm. (Default: 6000.0)
        hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used
            in the detector algorithm. (Default: 150.0)
        lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used
            in the detector algorithm. (Default: 2000.0)

    Returns:
        Tensor: Tensor of audio of dimension `(..., time)`.

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    device = waveform.device

    if waveform.ndim > 2:
        warnings.warn(
            "Expected input tensor dimension of 1 for single channel"
            f" or 2 for multi-channel. Got {waveform.ndim} instead. "
            "Batch semantics is not supported. "
            "Please refer to https://github.com/pytorch/audio/issues/1348"
            " and https://github.com/pytorch/audio/issues/1468."
        )

    # Default measurement duration is twice the measurement period (50% overlap).
    measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration

    # Suffixes mirror the SoX source: _ws = window samples, _ns = number of samples.
    measure_len_ws = int(sample_rate * measure_duration + 0.5)
    measure_len_ns = measure_len_ws
    # Smallest power of two >= measure_len_ws, starting at 16.
    # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1);
    dft_len_ws = 16
    while dft_len_ws < measure_len_ws:
        dft_len_ws *= 2

    measure_period_ns = int(sample_rate / measure_freq + 0.5)
    measures_len = math.ceil(search_time * measure_freq)
    search_pre_trigger_len_ns = measures_len * measure_period_ns
    gap_len = int(allowed_gap * measure_freq + 0.5)

    fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5)
    samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns

    # Constant-amplitude window scaled by 2/sqrt(N), then shaped by a Hann window.
    spectrum_window = torch.zeros(measure_len_ws, device=device)
    for i in range(measure_len_ws):
        # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32)
        spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws))
    # lsx_apply_hann(spectrum_window, (int)measure_len_ws);
    spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float)

    # Spectrum bin range implementing the brick-wall HP/LP input filters.
    spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5)
    spectrum_start: int = max(spectrum_start, 1)
    spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5)
    spectrum_end: int = min(spectrum_end, dft_len_ws // 2)

    # Cepstrum window built the same way over the retained spectrum band.
    cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device)
    for i in range(spectrum_end - spectrum_start):
        cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start)
    # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start));
    cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float)

    # Cepstrum bin range implementing the brick-wall HP/LP lifters.
    cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq)
    cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq)
    cepstrum_end = min(cepstrum_end, dft_len_ws // 4)

    if cepstrum_end <= cepstrum_start:
        raise ValueError(
            "Expected cepstrum_start to be smaller than cepstrum_end."
            f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}."
        )

    # Exponential smoothing coefficients derived from the time constants.
    noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device)
    noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device)
    measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq))
    trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq))

    boot_count_max = int(boot_time * measure_freq - 0.5)
    boot_count = measures_index = flushedLen_ns = 0

    # pack batch
    shape = waveform.size()
    waveform = waveform.view(-1, shape[-1])

    n_channels, ilen = waveform.size()

    # Per-channel running state: smoothed measurement, smoothed spectrum,
    # noise estimate, and a ring buffer of the last measures_len measurements.
    mean_meas = torch.zeros(n_channels, device=device)
    spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
    noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
    measures = torch.zeros(n_channels, measures_len, device=device)

    has_triggered: bool = False
    num_measures_to_flush: int = 0

    # Slide a measurement window across the recording; stop at the first trigger.
    pos = 0
    for pos in range(measure_len_ns, ilen, measure_period_ns):
        for i in range(n_channels):
            meas: float = _measure(
                measure_len_ws=measure_len_ws,
                samples=waveform[i, pos - measure_len_ws : pos],
                spectrum=spectrum[i],
                noise_spectrum=noise_spectrum[i],
                spectrum_window=spectrum_window,
                spectrum_start=spectrum_start,
                spectrum_end=spectrum_end,
                cepstrum_window=cepstrum_window,
                cepstrum_start=cepstrum_start,
                cepstrum_end=cepstrum_end,
                noise_reduction_amount=noise_reduction_amount,
                measure_smooth_time_mult=measure_smooth_time_mult,
                noise_up_time_mult=noise_up_time_mult,
                noise_down_time_mult=noise_down_time_mult,
                boot_count=boot_count,
            )
            measures[i, measures_index] = meas
            # Exponentially smoothed measurement used for the trigger decision.
            mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult)

            has_triggered = has_triggered or (mean_meas[i] >= trigger_level)
            if has_triggered:
                # Walk the ring buffer backwards from the newest entry to find
                # how many past measurements (within allowed gaps) to keep.
                n: int = measures_len
                k: int = measures_index
                jTrigger: int = n
                jZero: int = n
                j: int = 0

                for j in range(n):
                    if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len):
                        jZero = jTrigger = j
                    elif (measures[i, k] == 0) and (jTrigger >= jZero):
                        jZero = j
                    k = (k + n - 1) % n
                j = min(j, jZero)
                # num_measures_to_flush = range_limit(j, num_measures_to_flush, n);
                num_measures_to_flush = min(max(num_measures_to_flush, j), n)
            # end if has_triggered
        # end for channel
        measures_index += 1
        measures_index = measures_index % measures_len
        # boot_count counts up to boot_count_max, then stays at -1 (booted).
        if boot_count >= 0:
            boot_count = -1 if boot_count == boot_count_max else boot_count + 1

        if has_triggered:
            flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns
            break
    # end for window
    # No voice activity found anywhere: return an empty (zero-length) result.
    if not has_triggered:
        return waveform[..., :0].view(shape[:-1] + torch.Size([0]))

    # Keep audio from the (adjusted) trigger point onward.
    res = waveform[:, pos - samplesLen_ns + flushedLen_ns :]
    # unpack batch
    return res.view(shape[:-1] + res.shape[-1:])