Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py +30 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py +28 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py +12 -0
- .venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py +14 -0
- .venv/lib/python3.11/site-packages/torchaudio/__init__.py +53 -0
- .venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py +61 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py +53 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/common.py +52 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py +334 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py +54 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py +457 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py +91 -0
- .venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py +317 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py +8 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py +25 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/common.py +13 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py +14 -0
- .venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py +14 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py +127 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc +3 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py +128 -0
- .venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py +1669 -0
.gitattributes
CHANGED
|
@@ -295,3 +295,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
|
|
| 295 |
.venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text
|
| 296 |
.venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 297 |
.venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 295 |
.venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text
|
| 296 |
.venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 297 |
.venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 298 |
+
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (279 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc
ADDED
|
Binary file (2.18 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc
ADDED
|
Binary file (2.15 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc
ADDED
|
Binary file (1.02 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc
ADDED
|
Binary file (719 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc
ADDED
|
Binary file (2.74 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc
ADDED
|
Binary file (2.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc
ADDED
|
Binary file (1.79 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc
ADDED
|
Binary file (4.04 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc
ADDED
|
Binary file (2.36 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc
ADDED
|
Binary file (629 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A benchmark for comparing equivalent validation of `const` and `enum`.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from pyperf import Runner
|
| 6 |
+
|
| 7 |
+
from jsonschema import Draft202012Validator
|
| 8 |
+
|
| 9 |
+
value = [37] * 100
|
| 10 |
+
const_schema = {"const": list(value)}
|
| 11 |
+
enum_schema = {"enum": [list(value)]}
|
| 12 |
+
|
| 13 |
+
valid = list(value)
|
| 14 |
+
invalid = [*valid, 73]
|
| 15 |
+
|
| 16 |
+
const = Draft202012Validator(const_schema)
|
| 17 |
+
enum = Draft202012Validator(enum_schema)
|
| 18 |
+
|
| 19 |
+
assert const.is_valid(valid)
|
| 20 |
+
assert enum.is_valid(valid)
|
| 21 |
+
assert not const.is_valid(invalid)
|
| 22 |
+
assert not enum.is_valid(invalid)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
|
| 26 |
+
runner = Runner()
|
| 27 |
+
runner.bench_func("const valid", lambda: const.is_valid(valid))
|
| 28 |
+
runner.bench_func("const invalid", lambda: const.is_valid(invalid))
|
| 29 |
+
runner.bench_func("enum valid", lambda: enum.is_valid(valid))
|
| 30 |
+
runner.bench_func("enum invalid", lambda: enum.is_valid(invalid))
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A benchmark for validation of the `contains` keyword.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from pyperf import Runner
|
| 6 |
+
|
| 7 |
+
from jsonschema import Draft202012Validator
|
| 8 |
+
|
| 9 |
+
schema = {
|
| 10 |
+
"type": "array",
|
| 11 |
+
"contains": {"const": 37},
|
| 12 |
+
}
|
| 13 |
+
validator = Draft202012Validator(schema)
|
| 14 |
+
|
| 15 |
+
size = 1000
|
| 16 |
+
beginning = [37] + [0] * (size - 1)
|
| 17 |
+
middle = [0] * (size // 2) + [37] + [0] * (size // 2)
|
| 18 |
+
end = [0] * (size - 1) + [37]
|
| 19 |
+
invalid = [0] * size
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
|
| 23 |
+
runner = Runner()
|
| 24 |
+
runner.bench_func("baseline", lambda: validator.is_valid([]))
|
| 25 |
+
runner.bench_func("beginning", lambda: validator.is_valid(beginning))
|
| 26 |
+
runner.bench_func("middle", lambda: validator.is_valid(middle))
|
| 27 |
+
runner.bench_func("end", lambda: validator.is_valid(end))
|
| 28 |
+
runner.bench_func("invalid", lambda: validator.is_valid(invalid))
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A performance benchmark using the official test suite.
|
| 3 |
+
|
| 4 |
+
This benchmarks jsonschema using every valid example in the
|
| 5 |
+
JSON-Schema-Test-Suite. It will take some time to complete.
|
| 6 |
+
"""
|
| 7 |
+
from pyperf import Runner
|
| 8 |
+
|
| 9 |
+
from jsonschema.tests._suite import Suite
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
|
| 12 |
+
Suite().benchmark(runner=Runner())
|
.venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pyperf import Runner
|
| 2 |
+
|
| 3 |
+
from jsonschema import Draft202012Validator
|
| 4 |
+
|
| 5 |
+
schema = {
|
| 6 |
+
"type": "array",
|
| 7 |
+
"minLength": 1,
|
| 8 |
+
"maxLength": 1,
|
| 9 |
+
"items": {"type": "integer"},
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
if __name__ == "__main__":
|
| 14 |
+
Runner().bench_func("validator creation", Draft202012Validator, schema)
|
.venv/lib/python3.11/site-packages/torchaudio/__init__.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Initialize extension and backend first
|
| 2 |
+
from . import _extension # noqa # usort: skip
|
| 3 |
+
from ._backend import ( # noqa # usort: skip
|
| 4 |
+
AudioMetaData,
|
| 5 |
+
get_audio_backend,
|
| 6 |
+
info,
|
| 7 |
+
list_audio_backends,
|
| 8 |
+
load,
|
| 9 |
+
save,
|
| 10 |
+
set_audio_backend,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
from . import ( # noqa: F401
|
| 14 |
+
compliance,
|
| 15 |
+
datasets,
|
| 16 |
+
functional,
|
| 17 |
+
io,
|
| 18 |
+
kaldi_io,
|
| 19 |
+
models,
|
| 20 |
+
pipelines,
|
| 21 |
+
sox_effects,
|
| 22 |
+
transforms,
|
| 23 |
+
utils,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# For BC
|
| 27 |
+
from . import backend # noqa # usort: skip
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
from .version import __version__, git_version # noqa: F401
|
| 31 |
+
except ImportError:
|
| 32 |
+
pass
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
__all__ = [
|
| 36 |
+
"AudioMetaData",
|
| 37 |
+
"load",
|
| 38 |
+
"info",
|
| 39 |
+
"save",
|
| 40 |
+
"io",
|
| 41 |
+
"compliance",
|
| 42 |
+
"datasets",
|
| 43 |
+
"functional",
|
| 44 |
+
"models",
|
| 45 |
+
"pipelines",
|
| 46 |
+
"kaldi_io",
|
| 47 |
+
"utils",
|
| 48 |
+
"sox_effects",
|
| 49 |
+
"transforms",
|
| 50 |
+
"list_audio_backends",
|
| 51 |
+
"get_audio_backend",
|
| 52 |
+
"set_audio_backend",
|
| 53 |
+
]
|
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.17 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc
ADDED
|
Binary file (5.84 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc
ADDED
|
Binary file (272 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional
|
| 2 |
+
|
| 3 |
+
from torchaudio._internal.module_utils import deprecated
|
| 4 |
+
|
| 5 |
+
from . import utils
|
| 6 |
+
from .common import AudioMetaData
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"AudioMetaData",
|
| 10 |
+
"load",
|
| 11 |
+
"info",
|
| 12 |
+
"save",
|
| 13 |
+
"list_audio_backends",
|
| 14 |
+
"get_audio_backend",
|
| 15 |
+
"set_audio_backend",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
info = utils.get_info_func()
|
| 20 |
+
load = utils.get_load_func()
|
| 21 |
+
save = utils.get_save_func()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def list_audio_backends() -> List[str]:
|
| 25 |
+
"""List available backends
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
list of str: The list of available backends.
|
| 29 |
+
|
| 30 |
+
The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
return list(utils.get_available_backends().keys())
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Temporary until global backend is removed
|
| 37 |
+
@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
|
| 38 |
+
def get_audio_backend() -> Optional[str]:
|
| 39 |
+
"""Get the name of the current global backend
|
| 40 |
+
|
| 41 |
+
Returns:
|
| 42 |
+
str or None:
|
| 43 |
+
If dispatcher mode is enabled, returns ``None`` otherwise,
|
| 44 |
+
the name of current backend or ``None`` (no backend is set).
|
| 45 |
+
"""
|
| 46 |
+
return None
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Temporary until global backend is removed
|
| 50 |
+
@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.")
|
| 51 |
+
def set_audio_backend(backend: Optional[str]): # noqa
|
| 52 |
+
"""Set the global backend.
|
| 53 |
+
|
| 54 |
+
This is a no-op when dispatcher mode is enabled.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
backend (str or None): Name of the backend.
|
| 58 |
+
One of ``"sox_io"`` or ``"soundfile"`` based on availability
|
| 59 |
+
of the system. If ``None`` is provided the current backend is unassigned.
|
| 60 |
+
"""
|
| 61 |
+
pass
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (2.33 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc
ADDED
|
Binary file (3.08 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc
ADDED
|
Binary file (2.35 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc
ADDED
|
Binary file (3.21 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc
ADDED
|
Binary file (17.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc
ADDED
|
Binary file (4.97 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (16.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from abc import ABC, abstractmethod
|
| 3 |
+
from typing import BinaryIO, Optional, Tuple, Union
|
| 4 |
+
|
| 5 |
+
from torch import Tensor
|
| 6 |
+
from torchaudio.io import CodecConfig
|
| 7 |
+
|
| 8 |
+
from .common import AudioMetaData
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Backend(ABC):
|
| 12 |
+
@staticmethod
|
| 13 |
+
@abstractmethod
|
| 14 |
+
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
| 15 |
+
raise NotImplementedError
|
| 16 |
+
|
| 17 |
+
@staticmethod
|
| 18 |
+
@abstractmethod
|
| 19 |
+
def load(
|
| 20 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 21 |
+
frame_offset: int = 0,
|
| 22 |
+
num_frames: int = -1,
|
| 23 |
+
normalize: bool = True,
|
| 24 |
+
channels_first: bool = True,
|
| 25 |
+
format: Optional[str] = None,
|
| 26 |
+
buffer_size: int = 4096,
|
| 27 |
+
) -> Tuple[Tensor, int]:
|
| 28 |
+
raise NotImplementedError
|
| 29 |
+
|
| 30 |
+
@staticmethod
|
| 31 |
+
@abstractmethod
|
| 32 |
+
def save(
|
| 33 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 34 |
+
src: Tensor,
|
| 35 |
+
sample_rate: int,
|
| 36 |
+
channels_first: bool = True,
|
| 37 |
+
format: Optional[str] = None,
|
| 38 |
+
encoding: Optional[str] = None,
|
| 39 |
+
bits_per_sample: Optional[int] = None,
|
| 40 |
+
buffer_size: int = 4096,
|
| 41 |
+
compression: Optional[Union[CodecConfig, float, int]] = None,
|
| 42 |
+
) -> None:
|
| 43 |
+
raise NotImplementedError
|
| 44 |
+
|
| 45 |
+
@staticmethod
|
| 46 |
+
@abstractmethod
|
| 47 |
+
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
| 48 |
+
raise NotImplementedError
|
| 49 |
+
|
| 50 |
+
@staticmethod
|
| 51 |
+
@abstractmethod
|
| 52 |
+
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
| 53 |
+
raise NotImplementedError
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/common.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class AudioMetaData:
|
| 2 |
+
"""AudioMetaData()
|
| 3 |
+
|
| 4 |
+
Return type of ``torchaudio.info`` function.
|
| 5 |
+
|
| 6 |
+
:ivar int sample_rate: Sample rate
|
| 7 |
+
:ivar int num_frames: The number of frames
|
| 8 |
+
:ivar int num_channels: The number of channels
|
| 9 |
+
:ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
|
| 10 |
+
or when it cannot be accurately inferred.
|
| 11 |
+
:ivar str encoding: Audio encoding
|
| 12 |
+
The values encoding can take are one of the following:
|
| 13 |
+
|
| 14 |
+
* ``PCM_S``: Signed integer linear PCM
|
| 15 |
+
* ``PCM_U``: Unsigned integer linear PCM
|
| 16 |
+
* ``PCM_F``: Floating point linear PCM
|
| 17 |
+
* ``FLAC``: Flac, Free Lossless Audio Codec
|
| 18 |
+
* ``ULAW``: Mu-law
|
| 19 |
+
* ``ALAW``: A-law
|
| 20 |
+
* ``MP3`` : MP3, MPEG-1 Audio Layer III
|
| 21 |
+
* ``VORBIS``: OGG Vorbis
|
| 22 |
+
* ``AMR_WB``: Adaptive Multi-Rate Wideband
|
| 23 |
+
* ``AMR_NB``: Adaptive Multi-Rate Narrowband
|
| 24 |
+
* ``OPUS``: Opus
|
| 25 |
+
* ``HTK``: Single channel 16-bit PCM
|
| 26 |
+
* ``UNKNOWN`` : None of above
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
def __init__(
|
| 30 |
+
self,
|
| 31 |
+
sample_rate: int,
|
| 32 |
+
num_frames: int,
|
| 33 |
+
num_channels: int,
|
| 34 |
+
bits_per_sample: int,
|
| 35 |
+
encoding: str,
|
| 36 |
+
):
|
| 37 |
+
self.sample_rate = sample_rate
|
| 38 |
+
self.num_frames = num_frames
|
| 39 |
+
self.num_channels = num_channels
|
| 40 |
+
self.bits_per_sample = bits_per_sample
|
| 41 |
+
self.encoding = encoding
|
| 42 |
+
|
| 43 |
+
def __str__(self):
|
| 44 |
+
return (
|
| 45 |
+
f"AudioMetaData("
|
| 46 |
+
f"sample_rate={self.sample_rate}, "
|
| 47 |
+
f"num_frames={self.num_frames}, "
|
| 48 |
+
f"num_channels={self.num_channels}, "
|
| 49 |
+
f"bits_per_sample={self.bits_per_sample}, "
|
| 50 |
+
f"encoding={self.encoding}"
|
| 51 |
+
f")"
|
| 52 |
+
)
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import sys
|
| 4 |
+
from typing import BinaryIO, Optional, Tuple, Union
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torchaudio
|
| 8 |
+
|
| 9 |
+
from .backend import Backend
|
| 10 |
+
from .common import AudioMetaData
|
| 11 |
+
|
| 12 |
+
InputType = Union[BinaryIO, str, os.PathLike]
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def info_audio(
|
| 16 |
+
src: InputType,
|
| 17 |
+
format: Optional[str],
|
| 18 |
+
buffer_size: int = 4096,
|
| 19 |
+
) -> AudioMetaData:
|
| 20 |
+
s = torchaudio.io.StreamReader(src, format, None, buffer_size)
|
| 21 |
+
sinfo = s.get_src_stream_info(s.default_audio_stream)
|
| 22 |
+
if sinfo.num_frames == 0:
|
| 23 |
+
waveform = _load_audio(s)
|
| 24 |
+
num_frames = waveform.size(1)
|
| 25 |
+
else:
|
| 26 |
+
num_frames = sinfo.num_frames
|
| 27 |
+
return AudioMetaData(
|
| 28 |
+
int(sinfo.sample_rate),
|
| 29 |
+
num_frames,
|
| 30 |
+
sinfo.num_channels,
|
| 31 |
+
sinfo.bits_per_sample,
|
| 32 |
+
sinfo.codec.upper(),
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _get_load_filter(
|
| 37 |
+
frame_offset: int = 0,
|
| 38 |
+
num_frames: int = -1,
|
| 39 |
+
convert: bool = True,
|
| 40 |
+
) -> Optional[str]:
|
| 41 |
+
if frame_offset < 0:
|
| 42 |
+
raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset))
|
| 43 |
+
if num_frames == 0 or num_frames < -1:
|
| 44 |
+
raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames))
|
| 45 |
+
|
| 46 |
+
# All default values -> no filter
|
| 47 |
+
if frame_offset == 0 and num_frames == -1 and not convert:
|
| 48 |
+
return None
|
| 49 |
+
# Only convert
|
| 50 |
+
aformat = "aformat=sample_fmts=fltp"
|
| 51 |
+
if frame_offset == 0 and num_frames == -1 and convert:
|
| 52 |
+
return aformat
|
| 53 |
+
# At least one of frame_offset or num_frames has non-default value
|
| 54 |
+
if num_frames > 0:
|
| 55 |
+
atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames)
|
| 56 |
+
else:
|
| 57 |
+
atrim = "atrim=start_sample={}".format(frame_offset)
|
| 58 |
+
if not convert:
|
| 59 |
+
return atrim
|
| 60 |
+
return "{},{}".format(atrim, aformat)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _load_audio(
|
| 64 |
+
s: "torchaudio.io.StreamReader",
|
| 65 |
+
filter: Optional[str] = None,
|
| 66 |
+
channels_first: bool = True,
|
| 67 |
+
) -> torch.Tensor:
|
| 68 |
+
s.add_audio_stream(-1, -1, filter_desc=filter)
|
| 69 |
+
s.process_all_packets()
|
| 70 |
+
chunk = s.pop_chunks()[0]
|
| 71 |
+
if chunk is None:
|
| 72 |
+
raise RuntimeError("Failed to decode audio.")
|
| 73 |
+
waveform = chunk._elem
|
| 74 |
+
return waveform.T if channels_first else waveform
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def load_audio(
|
| 78 |
+
src: InputType,
|
| 79 |
+
frame_offset: int = 0,
|
| 80 |
+
num_frames: int = -1,
|
| 81 |
+
convert: bool = True,
|
| 82 |
+
channels_first: bool = True,
|
| 83 |
+
format: Optional[str] = None,
|
| 84 |
+
buffer_size: int = 4096,
|
| 85 |
+
) -> Tuple[torch.Tensor, int]:
|
| 86 |
+
if hasattr(src, "read") and format == "vorbis":
|
| 87 |
+
format = "ogg"
|
| 88 |
+
s = torchaudio.io.StreamReader(src, format, None, buffer_size)
|
| 89 |
+
sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate)
|
| 90 |
+
filter = _get_load_filter(frame_offset, num_frames, convert)
|
| 91 |
+
waveform = _load_audio(s, filter, channels_first)
|
| 92 |
+
return waveform, sample_rate
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _get_sample_format(dtype: torch.dtype) -> str:
|
| 96 |
+
dtype_to_format = {
|
| 97 |
+
torch.uint8: "u8",
|
| 98 |
+
torch.int16: "s16",
|
| 99 |
+
torch.int32: "s32",
|
| 100 |
+
torch.int64: "s64",
|
| 101 |
+
torch.float32: "flt",
|
| 102 |
+
torch.float64: "dbl",
|
| 103 |
+
}
|
| 104 |
+
format = dtype_to_format.get(dtype)
|
| 105 |
+
if format is None:
|
| 106 |
+
raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.")
|
| 107 |
+
return format
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _native_endianness() -> str:
|
| 111 |
+
if sys.byteorder == "little":
|
| 112 |
+
return "le"
|
| 113 |
+
else:
|
| 114 |
+
return "be"
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str:
|
| 118 |
+
if bits_per_sample not in {None, 8, 16, 24, 32, 64}:
|
| 119 |
+
raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.")
|
| 120 |
+
endianness = _native_endianness()
|
| 121 |
+
if not encoding:
|
| 122 |
+
if not bits_per_sample:
|
| 123 |
+
# default to PCM S16
|
| 124 |
+
return f"pcm_s16{endianness}"
|
| 125 |
+
if bits_per_sample == 8:
|
| 126 |
+
return "pcm_u8"
|
| 127 |
+
return f"pcm_s{bits_per_sample}{endianness}"
|
| 128 |
+
if encoding == "PCM_S":
|
| 129 |
+
if not bits_per_sample:
|
| 130 |
+
bits_per_sample = 16
|
| 131 |
+
if bits_per_sample == 8:
|
| 132 |
+
raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.")
|
| 133 |
+
return f"pcm_s{bits_per_sample}{endianness}"
|
| 134 |
+
if encoding == "PCM_U":
|
| 135 |
+
if bits_per_sample in (None, 8):
|
| 136 |
+
return "pcm_u8"
|
| 137 |
+
raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.")
|
| 138 |
+
if encoding == "PCM_F":
|
| 139 |
+
if not bits_per_sample:
|
| 140 |
+
bits_per_sample = 32
|
| 141 |
+
if bits_per_sample in (32, 64):
|
| 142 |
+
return f"pcm_f{bits_per_sample}{endianness}"
|
| 143 |
+
raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.")
|
| 144 |
+
if encoding == "ULAW":
|
| 145 |
+
if bits_per_sample in (None, 8):
|
| 146 |
+
return "pcm_mulaw"
|
| 147 |
+
raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.")
|
| 148 |
+
if encoding == "ALAW":
|
| 149 |
+
if bits_per_sample in (None, 8):
|
| 150 |
+
return "pcm_alaw"
|
| 151 |
+
raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.")
|
| 152 |
+
raise ValueError(f"WAV encoding {encoding} is not supported.")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _get_flac_sample_fmt(bps):
|
| 156 |
+
if bps is None or bps == 16:
|
| 157 |
+
return "s16"
|
| 158 |
+
if bps == 24:
|
| 159 |
+
return "s32"
|
| 160 |
+
raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).")
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def _parse_save_args(
    ext: Optional[str],
    format: Optional[str],
    encoding: Optional[str],
    bps: Optional[int],
):
    """Translate torchaudio ``save`` arguments into FFmpeg muxer/encoder/sample format.

    Args:
        ext: Lower-cased file extension inferred from the destination path, or ``None``.
        format: Explicit format override from the caller (takes precedence over ``ext``).
        encoding: torchaudio encoding name such as ``"PCM_S"`` or ``"ULAW"``, or ``None``.
        bps: Requested bits per sample, or ``None``.

    Returns:
        Tuple of ``(muxer, encoder, encoder_sample_format)``; each element may be ``None``.
    """
    # torchaudio's save function accepts the following, which do not 1to1 map
    # to FFmpeg.
    #
    # - format: audio format
    # - bits_per_sample: encoder sample format
    # - encoding: such as PCM_U8.
    #
    # In FFmpeg, format is specified with the following three (and more)
    #
    # - muxer: could be audio format or container format.
    #     the one we passed to the constructor of StreamWriter
    # - encoder: the audio encoder used to encode audio
    # - encoder sample format: the format used by encoder to encode audio.
    #
    # If encoder sample format is different from source sample format, StreamWriter
    # will insert a filter automatically.
    #
    def _type(spec):
        # either format is exactly the specified one
        # or extension matches to the spec AND there is no format override.
        return format == spec or (format is None and ext == spec)

    if _type("wav") or _type("amb"):
        # wav is special because it supports different encoding through encoders
        # each encoder only supports one encoder format
        #
        # amb format is a special case originated from libsox.
        # It is basically a WAV format, with slight modification.
        # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795
        # It is a format so that decoders will recognize it as ambisonic.
        # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/
        # FFmpeg does not recognize amb because it is basically a WAV format.
        muxer = "wav"
        encoder = _get_encoder_for_wav(encoding, bps)
        sample_fmt = None
    elif _type("vorbis"):
        # FFmpeg does not recognize vorbis extension, while libsox used to do.
        # For the sake of backward compatibility, (and the simplicity),
        # we support the case where users want to do save("foo.vorbis")
        muxer = "ogg"
        encoder = "vorbis"
        sample_fmt = None
    else:
        muxer = format
        encoder = None
        sample_fmt = None
        if _type("flac"):
            sample_fmt = _get_flac_sample_fmt(bps)
        if _type("ogg"):
            # NOTE(review): reuses the FLAC helper for ogg — presumably the
            # desired s16/s32 mapping coincides; confirm this is intended.
            sample_fmt = _get_flac_sample_fmt(bps)
    return muxer, encoder, sample_fmt
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def save_audio(
    uri: InputType,
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
    buffer_size: int = 4096,
    compression: Optional[torchaudio.io.CodecConfig] = None,
) -> None:
    """Encode ``src`` and write it to ``uri`` using ``torchaudio.io.StreamWriter``."""
    ext = None
    if not hasattr(uri, "write"):
        # Path-like destination: derive the extension hint from the file name.
        uri = os.path.normpath(uri)
        tokens = str(uri).split(".")[1:]
        if tokens:
            ext = tokens[-1].lower()
    elif format is None:
        # File-like destinations carry no extension, so a format is mandatory.
        raise RuntimeError("'format' is required when saving to file object.")

    muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample)

    # The writer consumes [time, channel]; transpose channel-first input.
    data = src.T if channels_first else src

    writer = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)
    writer.add_audio_stream(
        sample_rate,
        num_channels=data.size(-1),
        format=_get_sample_format(data.dtype),
        encoder=encoder,
        encoder_format=enc_fmt,
        codec_config=compression,
    )
    with writer.open():
        writer.write_audio_chunk(0, data)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def _map_encoding(encoding: str) -> str:
|
| 261 |
+
for dst in ["PCM_S", "PCM_U", "PCM_F"]:
|
| 262 |
+
if dst in encoding:
|
| 263 |
+
return dst
|
| 264 |
+
if encoding == "PCM_MULAW":
|
| 265 |
+
return "ULAW"
|
| 266 |
+
elif encoding == "PCM_ALAW":
|
| 267 |
+
return "ALAW"
|
| 268 |
+
return encoding
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str:
|
| 272 |
+
if m := re.search(r"PCM_\w(\d+)\w*", encoding):
|
| 273 |
+
return int(m.group(1))
|
| 274 |
+
elif encoding in ["PCM_ALAW", "PCM_MULAW"]:
|
| 275 |
+
return 8
|
| 276 |
+
return bits_per_sample
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
class FFmpegBackend(Backend):
    """Audio I/O backend backed by the FFmpeg-based helpers in this module."""

    @staticmethod
    def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
        """Return metadata for ``uri``, normalizing encoding and bit depth."""
        metadata = info_audio(uri, format, buffer_size)
        # Derive bits_per_sample from the raw encoding name BEFORE collapsing
        # the name to its family — the order of these two lines matters.
        metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample)
        metadata.encoding = _map_encoding(metadata.encoding)
        return metadata

    @staticmethod
    def load(
        uri: InputType,
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
    ) -> Tuple[torch.Tensor, int]:
        """Load audio via ``load_audio``.

        NOTE(review): ``buffer_size`` is accepted but not forwarded to
        ``load_audio`` — confirm whether that is intentional.
        """
        return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format)

    @staticmethod
    def save(
        uri: InputType,
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
    ) -> None:
        """Save ``src`` via ``save_audio``.

        Raises:
            ValueError: if ``compression`` is given but is not a
                ``torchaudio.io.CodecConfig``.
        """
        if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))):
            # Single concatenated message: the previous code passed two
            # comma-separated arguments to ValueError, which rendered the
            # error text as a tuple.
            raise ValueError(
                "FFmpeg backend expects non-`None` value for argument `compression` to be of "
                f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}"
            )
        save_audio(
            uri,
            src,
            sample_rate,
            channels_first,
            format,
            encoding,
            bits_per_sample,
            buffer_size,
            compression,
        )

    @staticmethod
    def can_decode(uri: InputType, format: Optional[str]) -> bool:
        """FFmpeg is assumed capable of decoding anything; always ``True``."""
        return True

    @staticmethod
    def can_encode(uri: InputType, format: Optional[str]) -> bool:
        """FFmpeg is assumed capable of encoding anything; always ``True``."""
        return True
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import BinaryIO, Optional, Tuple, Union
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from torchaudio.io import CodecConfig
|
| 6 |
+
|
| 7 |
+
from . import soundfile_backend
|
| 8 |
+
from .backend import Backend
|
| 9 |
+
from .common import AudioMetaData
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SoundfileBackend(Backend):
    """Backend that forwards every operation to ``soundfile_backend``."""

    @staticmethod
    def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
        """Fetch metadata; ``buffer_size`` exists for interface parity only."""
        return soundfile_backend.info(uri, format)

    @staticmethod
    def load(
        uri: Union[BinaryIO, str, os.PathLike],
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
        buffer_size: int = 4096,
    ) -> Tuple[torch.Tensor, int]:
        """Delegate loading to ``soundfile_backend.load``."""
        return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)

    @staticmethod
    def save(
        uri: Union[BinaryIO, str, os.PathLike],
        src: torch.Tensor,
        sample_rate: int,
        channels_first: bool = True,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
        buffer_size: int = 4096,
        compression: Optional[Union[CodecConfig, float, int]] = None,
    ) -> None:
        """Delegate saving to ``soundfile_backend.save``; ``compression`` is rejected."""
        if compression:
            raise ValueError("soundfile backend does not support argument `compression`.")
        soundfile_backend.save(
            uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample
        )

    @staticmethod
    def can_decode(uri, format) -> bool:
        """Optimistically report that the input can be decoded."""
        return True

    @staticmethod
    def can_encode(uri, format) -> bool:
        """Optimistically report that the output can be encoded."""
        return True
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""The new soundfile backend which will become default in 0.8.0 onward"""
import warnings
from typing import Optional, Tuple

import torch
from torchaudio._internal import module_utils as _mod_utils

from .common import AudioMetaData


# Whether the `soundfile` package imported successfully.
_IS_SOUNDFILE_AVAILABLE = False

# TODO: import soundfile only when it is used.
if _mod_utils.is_module_available("soundfile"):
    try:
        import soundfile

        # Import succeeded: the decorator becomes a no-op and decorated
        # functions run as-is.
        _requires_soundfile = _mod_utils.no_op
        _IS_SOUNDFILE_AVAILABLE = True
    except Exception:
        # The module is installed but importing it failed; decorated
        # functions raise with this message when called.
        _requires_soundfile = _mod_utils.fail_with_message(
            "requires soundfile, but we failed to import it. Please check the installation of soundfile."
        )
else:
    # soundfile is not installed at all.
    _requires_soundfile = _mod_utils.fail_with_message(
        "requires soundfile, but it is not installed. Please install soundfile."
    )
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Mapping from soundfile subtype to number of bits per sample.
|
| 31 |
+
# This is mostly heuristical and the value is set to 0 when it is irrelevant
|
| 32 |
+
# (lossy formats) or when it can't be inferred.
|
| 33 |
+
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
|
| 34 |
+
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
|
| 35 |
+
# the default seems to be 8 bits but it can be compressed further to 4 bits.
|
| 36 |
+
# The dict is inspired from
|
| 37 |
+
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
|
| 38 |
+
_SUBTYPE_TO_BITS_PER_SAMPLE = {
|
| 39 |
+
"PCM_S8": 8, # Signed 8 bit data
|
| 40 |
+
"PCM_16": 16, # Signed 16 bit data
|
| 41 |
+
"PCM_24": 24, # Signed 24 bit data
|
| 42 |
+
"PCM_32": 32, # Signed 32 bit data
|
| 43 |
+
"PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only)
|
| 44 |
+
"FLOAT": 32, # 32 bit float data
|
| 45 |
+
"DOUBLE": 64, # 64 bit float data
|
| 46 |
+
"ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
|
| 47 |
+
"ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
|
| 48 |
+
"IMA_ADPCM": 0, # IMA ADPCM.
|
| 49 |
+
"MS_ADPCM": 0, # Microsoft ADPCM.
|
| 50 |
+
"GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
|
| 51 |
+
"VOX_ADPCM": 0, # OKI / Dialogix ADPCM
|
| 52 |
+
"G721_32": 0, # 32kbs G721 ADPCM encoding.
|
| 53 |
+
"G723_24": 0, # 24kbs G723 ADPCM encoding.
|
| 54 |
+
"G723_40": 0, # 40kbs G723 ADPCM encoding.
|
| 55 |
+
"DWVW_12": 12, # 12 bit Delta Width Variable Word encoding.
|
| 56 |
+
"DWVW_16": 16, # 16 bit Delta Width Variable Word encoding.
|
| 57 |
+
"DWVW_24": 24, # 24 bit Delta Width Variable Word encoding.
|
| 58 |
+
"DWVW_N": 0, # N bit Delta Width Variable Word encoding.
|
| 59 |
+
"DPCM_8": 8, # 8 bit differential PCM (XI only)
|
| 60 |
+
"DPCM_16": 16, # 16 bit differential PCM (XI only)
|
| 61 |
+
"VORBIS": 0, # Xiph Vorbis encoding. (lossy)
|
| 62 |
+
"ALAC_16": 16, # Apple Lossless Audio Codec (16 bit).
|
| 63 |
+
"ALAC_20": 20, # Apple Lossless Audio Codec (20 bit).
|
| 64 |
+
"ALAC_24": 24, # Apple Lossless Audio Codec (24 bit).
|
| 65 |
+
"ALAC_32": 32, # Apple Lossless Audio Codec (32 bit).
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _get_bit_depth(subtype):
|
| 70 |
+
if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
|
| 71 |
+
warnings.warn(
|
| 72 |
+
f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample "
|
| 73 |
+
"attribute will be set to 0. If you are seeing this warning, please "
|
| 74 |
+
"report by opening an issue on github (after checking for existing/closed ones). "
|
| 75 |
+
"You may otherwise ignore this warning."
|
| 76 |
+
)
|
| 77 |
+
return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
_SUBTYPE_TO_ENCODING = {
|
| 81 |
+
"PCM_S8": "PCM_S",
|
| 82 |
+
"PCM_16": "PCM_S",
|
| 83 |
+
"PCM_24": "PCM_S",
|
| 84 |
+
"PCM_32": "PCM_S",
|
| 85 |
+
"PCM_U8": "PCM_U",
|
| 86 |
+
"FLOAT": "PCM_F",
|
| 87 |
+
"DOUBLE": "PCM_F",
|
| 88 |
+
"ULAW": "ULAW",
|
| 89 |
+
"ALAW": "ALAW",
|
| 90 |
+
"VORBIS": "VORBIS",
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _get_encoding(format: str, subtype: str):
|
| 95 |
+
if format == "FLAC":
|
| 96 |
+
return "FLAC"
|
| 97 |
+
return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@_requires_soundfile
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
    """Get signal information of an audio file.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioMetaData: meta data of the given audio.

    """
    sinfo = soundfile.info(filepath)
    return AudioMetaData(
        sinfo.samplerate,
        sinfo.frames,
        sinfo.channels,
        bits_per_sample=_get_bit_depth(sinfo.subtype),
        encoding=_get_encoding(sinfo.format, sinfo.subtype),
    )
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# Integer WAV subtypes that can be returned un-normalized, and their dtypes.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}


@_requires_soundfile
def load(
    filepath: str,
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
    ``float32`` dtype, and the shape of `[channel, time]`.

    .. warning::

       ``normalize`` argument does not perform volume normalization.
       It only converts the sample type to `torch.float32` from the native sample
       type.

       When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
       signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
       this function can return integer Tensor, where the samples are expressed within the whole range
       of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
       ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
       support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.

       ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
       ``flac`` and ``mp3``.

       For these formats, this function always returns ``float32`` Tensor.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
            This function may return fewer frames if there are not enough
            frames in the given file.
        normalize (bool, optional):
            When ``True``, this function converts the native sample type to ``float32``.
            Default: ``True``.

            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type.
            This argument has no effect for formats other than integer WAV type.

        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (torch.Tensor, int): Resulting Tensor and sample rate.
            If the input file has integer wav format and normalization is off, then it has
            integer type, else ``float32`` type. If ``channels_first=True``, it has
            `[channel, time]` else `[time, channel]`.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        # Only un-normalized integer WAV keeps its native dtype; everything
        # else is decoded as float32.
        if file_.format != "WAV" or normalize:
            dtype = "float32"
        elif file_.subtype not in _SUBTYPE2DTYPE:
            raise ValueError(f"Unsupported subtype: {file_.subtype}")
        else:
            dtype = _SUBTYPE2DTYPE[file_.subtype]

        # NOTE(review): relies on soundfile's private ``_prepare_read`` to seek
        # to ``frame_offset`` and resolve the number of frames to read —
        # verify against the installed soundfile version.
        frames = file_._prepare_read(frame_offset, None, num_frames)
        waveform = file_.read(frames, dtype, always_2d=True)
        sample_rate = file_.samplerate

    waveform = torch.from_numpy(waveform)
    if channels_first:
        waveform = waveform.t()
    return waveform, sample_rate
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int):
|
| 240 |
+
if not encoding:
|
| 241 |
+
if not bits_per_sample:
|
| 242 |
+
subtype = {
|
| 243 |
+
torch.uint8: "PCM_U8",
|
| 244 |
+
torch.int16: "PCM_16",
|
| 245 |
+
torch.int32: "PCM_32",
|
| 246 |
+
torch.float32: "FLOAT",
|
| 247 |
+
torch.float64: "DOUBLE",
|
| 248 |
+
}.get(dtype)
|
| 249 |
+
if not subtype:
|
| 250 |
+
raise ValueError(f"Unsupported dtype for wav: {dtype}")
|
| 251 |
+
return subtype
|
| 252 |
+
if bits_per_sample == 8:
|
| 253 |
+
return "PCM_U8"
|
| 254 |
+
return f"PCM_{bits_per_sample}"
|
| 255 |
+
if encoding == "PCM_S":
|
| 256 |
+
if not bits_per_sample:
|
| 257 |
+
return "PCM_32"
|
| 258 |
+
if bits_per_sample == 8:
|
| 259 |
+
raise ValueError("wav does not support 8-bit signed PCM encoding.")
|
| 260 |
+
return f"PCM_{bits_per_sample}"
|
| 261 |
+
if encoding == "PCM_U":
|
| 262 |
+
if bits_per_sample in (None, 8):
|
| 263 |
+
return "PCM_U8"
|
| 264 |
+
raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
|
| 265 |
+
if encoding == "PCM_F":
|
| 266 |
+
if bits_per_sample in (None, 32):
|
| 267 |
+
return "FLOAT"
|
| 268 |
+
if bits_per_sample == 64:
|
| 269 |
+
return "DOUBLE"
|
| 270 |
+
raise ValueError("wav only supports 32/64-bit float PCM encoding.")
|
| 271 |
+
if encoding == "ULAW":
|
| 272 |
+
if bits_per_sample in (None, 8):
|
| 273 |
+
return "ULAW"
|
| 274 |
+
raise ValueError("wav only supports 8-bit mu-law encoding.")
|
| 275 |
+
if encoding == "ALAW":
|
| 276 |
+
if bits_per_sample in (None, 8):
|
| 277 |
+
return "ALAW"
|
| 278 |
+
raise ValueError("wav only supports 8-bit a-law encoding.")
|
| 279 |
+
raise ValueError(f"wav does not support {encoding}.")
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
|
| 283 |
+
if encoding in (None, "PCM_S"):
|
| 284 |
+
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
|
| 285 |
+
if encoding in ("PCM_U", "PCM_F"):
|
| 286 |
+
raise ValueError(f"sph does not support {encoding} encoding.")
|
| 287 |
+
if encoding == "ULAW":
|
| 288 |
+
if bits_per_sample in (None, 8):
|
| 289 |
+
return "ULAW"
|
| 290 |
+
raise ValueError("sph only supports 8-bit for mu-law encoding.")
|
| 291 |
+
if encoding == "ALAW":
|
| 292 |
+
return "ALAW"
|
| 293 |
+
raise ValueError(f"sph does not support {encoding}.")
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int):
|
| 297 |
+
if format == "wav":
|
| 298 |
+
return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
|
| 299 |
+
if format == "flac":
|
| 300 |
+
if encoding:
|
| 301 |
+
raise ValueError("flac does not support encoding.")
|
| 302 |
+
if not bits_per_sample:
|
| 303 |
+
return "PCM_16"
|
| 304 |
+
if bits_per_sample > 24:
|
| 305 |
+
raise ValueError("flac does not support bits_per_sample > 24.")
|
| 306 |
+
return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
|
| 307 |
+
if format in ("ogg", "vorbis"):
|
| 308 |
+
if bits_per_sample:
|
| 309 |
+
raise ValueError("ogg/vorbis does not support bits_per_sample.")
|
| 310 |
+
if encoding is None or encoding == "vorbis":
|
| 311 |
+
return "VORBIS"
|
| 312 |
+
if encoding == "opus":
|
| 313 |
+
return "OPUS"
|
| 314 |
+
raise ValueError(f"Unexpected encoding: {encoding}")
|
| 315 |
+
if format == "mp3":
|
| 316 |
+
return "MPEG_LAYER_III"
|
| 317 |
+
if format == "sph":
|
| 318 |
+
return _get_subtype_for_sphere(encoding, bits_per_sample)
|
| 319 |
+
if format in ("nis", "nist"):
|
| 320 |
+
return "PCM_16"
|
| 321 |
+
raise ValueError(f"Unsupported format: {format}")
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
@_requires_soundfile
def save(
    filepath: str,
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
        which has a restriction on type annotation due to TorchScript compiler compatibility.

    Args:
        filepath (str or pathlib.Path): Path to audio file.
        src (torch.Tensor): Audio data to save. must be 2D tensor.
        sample_rate (int): sampling rate
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reason with "sox_io" backend.
        format (str or None, optional): Override the audio format.
            When ``filepath`` argument is path-like object, audio format is
            inferred from file extension. If the file extension is missing or
            different, you can specify the correct format with this argument.

            When ``filepath`` argument is file-like object,
            this argument is required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

        ``"wav"``
            - 32-bit floating-point PCM
            - 32-bit signed integer PCM
            - 24-bit signed integer PCM
            - 16-bit signed integer PCM
            - 8-bit unsigned integer PCM
            - 8-bit mu-law
            - 8-bit a-law

            Note:
                Default encoding/bit depth is determined by the dtype of
                the input Tensor.

        ``"flac"``
            - 8-bit
            - 16-bit (default)
            - 24-bit

        ``"ogg"``, ``"vorbis"``
            - Doesn't accept changing configuration.

        ``"sph"``
            - 8-bit signed integer PCM
            - 16-bit signed integer PCM
            - 24-bit signed integer PCM
            - 32-bit signed integer PCM (default)
            - 8-bit mu-law
            - 8-bit a-law
            - 16-bit a-law
            - 24-bit a-law
            - 32-bit a-law

    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )
    if hasattr(filepath, "write"):
        # File-like destination: the format must be supplied explicitly.
        if format is None:
            raise RuntimeError("`format` is required when saving to file object.")
        ext = format.lower()
    else:
        ext = str(filepath).split(".")[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is an extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here
    if ext in ["nis", "nist", "sph"] and format is None:
        format = "NIST"

    if channels_first:
        # soundfile consumes [time, channel]; transpose channel-first input.
        src = src.t()

    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import BinaryIO, Optional, Tuple, Union
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
|
| 7 |
+
from .backend import Backend
|
| 8 |
+
from .common import AudioMetaData
|
| 9 |
+
|
| 10 |
+
sox_ext = torchaudio._extension.lazy_import_sox_ext()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SoXBackend(Backend):
|
| 14 |
+
@staticmethod
|
| 15 |
+
def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData:
|
| 16 |
+
if hasattr(uri, "read"):
|
| 17 |
+
raise ValueError(
|
| 18 |
+
"SoX backend does not support reading from file-like objects. ",
|
| 19 |
+
"Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.",
|
| 20 |
+
)
|
| 21 |
+
else:
|
| 22 |
+
sinfo = sox_ext.get_info(uri, format)
|
| 23 |
+
if sinfo:
|
| 24 |
+
return AudioMetaData(*sinfo)
|
| 25 |
+
else:
|
| 26 |
+
raise RuntimeError(f"Failed to fetch metadata for {uri}.")
|
| 27 |
+
|
| 28 |
+
@staticmethod
|
| 29 |
+
def load(
|
| 30 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 31 |
+
frame_offset: int = 0,
|
| 32 |
+
num_frames: int = -1,
|
| 33 |
+
normalize: bool = True,
|
| 34 |
+
channels_first: bool = True,
|
| 35 |
+
format: Optional[str] = None,
|
| 36 |
+
buffer_size: int = 4096,
|
| 37 |
+
) -> Tuple[torch.Tensor, int]:
|
| 38 |
+
if hasattr(uri, "read"):
|
| 39 |
+
raise ValueError(
|
| 40 |
+
"SoX backend does not support loading from file-like objects. ",
|
| 41 |
+
"Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.",
|
| 42 |
+
)
|
| 43 |
+
else:
|
| 44 |
+
ret = sox_ext.load_audio_file(uri, frame_offset, num_frames, normalize, channels_first, format)
|
| 45 |
+
if not ret:
|
| 46 |
+
raise RuntimeError(f"Failed to load audio from {uri}.")
|
| 47 |
+
return ret
|
| 48 |
+
|
| 49 |
+
@staticmethod
|
| 50 |
+
def save(
|
| 51 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 52 |
+
src: torch.Tensor,
|
| 53 |
+
sample_rate: int,
|
| 54 |
+
channels_first: bool = True,
|
| 55 |
+
format: Optional[str] = None,
|
| 56 |
+
encoding: Optional[str] = None,
|
| 57 |
+
bits_per_sample: Optional[int] = None,
|
| 58 |
+
buffer_size: int = 4096,
|
| 59 |
+
compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None,
|
| 60 |
+
) -> None:
|
| 61 |
+
if not isinstance(compression, (float, int, type(None))):
|
| 62 |
+
raise ValueError(
|
| 63 |
+
"SoX backend expects non-`None` value for argument `compression` to be of ",
|
| 64 |
+
f"type `float` or `int`, but received value of type {type(compression)}",
|
| 65 |
+
)
|
| 66 |
+
if hasattr(uri, "write"):
|
| 67 |
+
raise ValueError(
|
| 68 |
+
"SoX backend does not support writing to file-like objects. ",
|
| 69 |
+
"Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.",
|
| 70 |
+
)
|
| 71 |
+
else:
|
| 72 |
+
sox_ext.save_audio_file(
|
| 73 |
+
uri,
|
| 74 |
+
src,
|
| 75 |
+
sample_rate,
|
| 76 |
+
channels_first,
|
| 77 |
+
compression,
|
| 78 |
+
format,
|
| 79 |
+
encoding,
|
| 80 |
+
bits_per_sample,
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
@staticmethod
|
| 84 |
+
def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
| 85 |
+
# i.e. not a file-like object.
|
| 86 |
+
return not hasattr(uri, "read")
|
| 87 |
+
|
| 88 |
+
@staticmethod
|
| 89 |
+
def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool:
|
| 90 |
+
# i.e. not a file-like object.
|
| 91 |
+
return not hasattr(uri, "write")
|
.venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
from typing import BinaryIO, Dict, Optional, Tuple, Type, Union
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
from torchaudio._extension import lazy_import_sox_ext
|
| 8 |
+
from torchaudio.io import CodecConfig
|
| 9 |
+
from torio._extension import lazy_import_ffmpeg_ext
|
| 10 |
+
|
| 11 |
+
from . import soundfile_backend
|
| 12 |
+
|
| 13 |
+
from .backend import Backend
|
| 14 |
+
from .common import AudioMetaData
|
| 15 |
+
from .ffmpeg import FFmpegBackend
|
| 16 |
+
from .soundfile import SoundfileBackend
|
| 17 |
+
from .sox import SoXBackend
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@lru_cache(None)
|
| 21 |
+
def get_available_backends() -> Dict[str, Type[Backend]]:
|
| 22 |
+
backend_specs: Dict[str, Type[Backend]] = {}
|
| 23 |
+
if lazy_import_ffmpeg_ext().is_available():
|
| 24 |
+
backend_specs["ffmpeg"] = FFmpegBackend
|
| 25 |
+
if lazy_import_sox_ext().is_available():
|
| 26 |
+
backend_specs["sox"] = SoXBackend
|
| 27 |
+
if soundfile_backend._IS_SOUNDFILE_AVAILABLE:
|
| 28 |
+
backend_specs["soundfile"] = SoundfileBackend
|
| 29 |
+
return backend_specs
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_backend(backend_name, backends) -> Backend:
|
| 33 |
+
if backend := backends.get(backend_name):
|
| 34 |
+
return backend
|
| 35 |
+
else:
|
| 36 |
+
raise ValueError(
|
| 37 |
+
f"Unsupported backend '{backend_name}' specified; ",
|
| 38 |
+
f"please select one of {list(backends.keys())} instead.",
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_info_func():
|
| 43 |
+
backends = get_available_backends()
|
| 44 |
+
|
| 45 |
+
def dispatcher(
|
| 46 |
+
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
|
| 47 |
+
) -> Backend:
|
| 48 |
+
if backend_name is not None:
|
| 49 |
+
return get_backend(backend_name, backends)
|
| 50 |
+
|
| 51 |
+
for backend in backends.values():
|
| 52 |
+
if backend.can_decode(uri, format):
|
| 53 |
+
return backend
|
| 54 |
+
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
|
| 55 |
+
|
| 56 |
+
def info(
|
| 57 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 58 |
+
format: Optional[str] = None,
|
| 59 |
+
buffer_size: int = 4096,
|
| 60 |
+
backend: Optional[str] = None,
|
| 61 |
+
) -> AudioMetaData:
|
| 62 |
+
"""Get signal information of an audio file.
|
| 63 |
+
|
| 64 |
+
Note:
|
| 65 |
+
When the input type is file-like object, this function cannot
|
| 66 |
+
get the correct length (``num_samples``) for certain formats,
|
| 67 |
+
such as ``vorbis``.
|
| 68 |
+
In this case, the value of ``num_samples`` is ``0``.
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
uri (path-like object or file-like object):
|
| 72 |
+
Source of audio data. The following types are accepted:
|
| 73 |
+
|
| 74 |
+
* ``path-like``: File path or URL.
|
| 75 |
+
* ``file-like``: Object with ``read(size: int) -> bytes`` method,
|
| 76 |
+
which returns byte string of at most ``size`` length.
|
| 77 |
+
|
| 78 |
+
format (str or None, optional):
|
| 79 |
+
If not ``None``, interpreted as hint that may allow backend to override the detected format.
|
| 80 |
+
(Default: ``None``)
|
| 81 |
+
|
| 82 |
+
buffer_size (int, optional):
|
| 83 |
+
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
|
| 84 |
+
|
| 85 |
+
backend (str or None, optional):
|
| 86 |
+
I/O backend to use.
|
| 87 |
+
If ``None``, function selects backend given input and available backends.
|
| 88 |
+
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
|
| 89 |
+
with the corresponding backend available.
|
| 90 |
+
(Default: ``None``)
|
| 91 |
+
|
| 92 |
+
.. seealso::
|
| 93 |
+
:ref:`backend`
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
AudioMetaData
|
| 97 |
+
"""
|
| 98 |
+
backend = dispatcher(uri, format, backend)
|
| 99 |
+
return backend.info(uri, format, buffer_size)
|
| 100 |
+
|
| 101 |
+
return info
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def get_load_func():
|
| 105 |
+
backends = get_available_backends()
|
| 106 |
+
|
| 107 |
+
def dispatcher(
|
| 108 |
+
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
|
| 109 |
+
) -> Backend:
|
| 110 |
+
if backend_name is not None:
|
| 111 |
+
return get_backend(backend_name, backends)
|
| 112 |
+
|
| 113 |
+
for backend in backends.values():
|
| 114 |
+
if backend.can_decode(uri, format):
|
| 115 |
+
return backend
|
| 116 |
+
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
|
| 117 |
+
|
| 118 |
+
def load(
|
| 119 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 120 |
+
frame_offset: int = 0,
|
| 121 |
+
num_frames: int = -1,
|
| 122 |
+
normalize: bool = True,
|
| 123 |
+
channels_first: bool = True,
|
| 124 |
+
format: Optional[str] = None,
|
| 125 |
+
buffer_size: int = 4096,
|
| 126 |
+
backend: Optional[str] = None,
|
| 127 |
+
) -> Tuple[torch.Tensor, int]:
|
| 128 |
+
"""Load audio data from source.
|
| 129 |
+
|
| 130 |
+
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
|
| 131 |
+
``float32`` dtype, and the shape of `[channel, time]`.
|
| 132 |
+
|
| 133 |
+
Note:
|
| 134 |
+
The formats this function can handle depend on the availability of backends.
|
| 135 |
+
Please use the following functions to fetch the supported formats.
|
| 136 |
+
|
| 137 |
+
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders`
|
| 138 |
+
- Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats`
|
| 139 |
+
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
|
| 140 |
+
|
| 141 |
+
.. warning::
|
| 142 |
+
|
| 143 |
+
``normalize`` argument does not perform volume normalization.
|
| 144 |
+
It only converts the sample type to `torch.float32` from the native sample
|
| 145 |
+
type.
|
| 146 |
+
|
| 147 |
+
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
|
| 148 |
+
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``,
|
| 149 |
+
this function can return integer Tensor, where the samples are expressed within the whole range
|
| 150 |
+
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM,
|
| 151 |
+
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not
|
| 152 |
+
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors.
|
| 153 |
+
|
| 154 |
+
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as
|
| 155 |
+
``flac`` and ``mp3``.
|
| 156 |
+
|
| 157 |
+
For these formats, this function always returns ``float32`` Tensor with values.
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
Args:
|
| 161 |
+
uri (path-like object or file-like object):
|
| 162 |
+
Source of audio data.
|
| 163 |
+
frame_offset (int, optional):
|
| 164 |
+
Number of frames to skip before start reading data.
|
| 165 |
+
num_frames (int, optional):
|
| 166 |
+
Maximum number of frames to read. ``-1`` reads all the remaining samples,
|
| 167 |
+
starting from ``frame_offset``.
|
| 168 |
+
This function may return the less number of frames if there is not enough
|
| 169 |
+
frames in the given file.
|
| 170 |
+
normalize (bool, optional):
|
| 171 |
+
When ``True``, this function converts the native sample type to ``float32``.
|
| 172 |
+
Default: ``True``.
|
| 173 |
+
|
| 174 |
+
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
|
| 175 |
+
integer type.
|
| 176 |
+
This argument has no effect for formats other than integer WAV type.
|
| 177 |
+
|
| 178 |
+
channels_first (bool, optional):
|
| 179 |
+
When True, the returned Tensor has dimension `[channel, time]`.
|
| 180 |
+
Otherwise, the returned Tensor's dimension is `[time, channel]`.
|
| 181 |
+
|
| 182 |
+
format (str or None, optional):
|
| 183 |
+
If not ``None``, interpreted as hint that may allow backend to override the detected format.
|
| 184 |
+
(Default: ``None``)
|
| 185 |
+
|
| 186 |
+
buffer_size (int, optional):
|
| 187 |
+
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
|
| 188 |
+
|
| 189 |
+
backend (str or None, optional):
|
| 190 |
+
I/O backend to use.
|
| 191 |
+
If ``None``, function selects backend given input and available backends.
|
| 192 |
+
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
|
| 193 |
+
with the corresponding backend being available. (Default: ``None``)
|
| 194 |
+
|
| 195 |
+
.. seealso::
|
| 196 |
+
:ref:`backend`
|
| 197 |
+
|
| 198 |
+
Returns:
|
| 199 |
+
(torch.Tensor, int): Resulting Tensor and sample rate.
|
| 200 |
+
If the input file has integer wav format and normalization is off, then it has
|
| 201 |
+
integer type, else ``float32`` type. If ``channels_first=True``, it has
|
| 202 |
+
`[channel, time]` else `[time, channel]`.
|
| 203 |
+
"""
|
| 204 |
+
backend = dispatcher(uri, format, backend)
|
| 205 |
+
return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
|
| 206 |
+
|
| 207 |
+
return load
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def get_save_func():
|
| 211 |
+
backends = get_available_backends()
|
| 212 |
+
|
| 213 |
+
def dispatcher(
|
| 214 |
+
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str]
|
| 215 |
+
) -> Backend:
|
| 216 |
+
if backend_name is not None:
|
| 217 |
+
return get_backend(backend_name, backends)
|
| 218 |
+
|
| 219 |
+
for backend in backends.values():
|
| 220 |
+
if backend.can_encode(uri, format):
|
| 221 |
+
return backend
|
| 222 |
+
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.")
|
| 223 |
+
|
| 224 |
+
def save(
|
| 225 |
+
uri: Union[BinaryIO, str, os.PathLike],
|
| 226 |
+
src: torch.Tensor,
|
| 227 |
+
sample_rate: int,
|
| 228 |
+
channels_first: bool = True,
|
| 229 |
+
format: Optional[str] = None,
|
| 230 |
+
encoding: Optional[str] = None,
|
| 231 |
+
bits_per_sample: Optional[int] = None,
|
| 232 |
+
buffer_size: int = 4096,
|
| 233 |
+
backend: Optional[str] = None,
|
| 234 |
+
compression: Optional[Union[CodecConfig, float, int]] = None,
|
| 235 |
+
):
|
| 236 |
+
"""Save audio data to file.
|
| 237 |
+
|
| 238 |
+
Note:
|
| 239 |
+
The formats this function can handle depend on the availability of backends.
|
| 240 |
+
Please use the following functions to fetch the supported formats.
|
| 241 |
+
|
| 242 |
+
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders`
|
| 243 |
+
- Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats`
|
| 244 |
+
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__.
|
| 245 |
+
|
| 246 |
+
Args:
|
| 247 |
+
uri (str or pathlib.Path): Path to audio file.
|
| 248 |
+
src (torch.Tensor): Audio data to save. must be 2D tensor.
|
| 249 |
+
sample_rate (int): sampling rate
|
| 250 |
+
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
|
| 251 |
+
otherwise `[time, channel]`.
|
| 252 |
+
format (str or None, optional): Override the audio format.
|
| 253 |
+
When ``uri`` argument is path-like object, audio format is
|
| 254 |
+
inferred from file extension. If the file extension is missing or
|
| 255 |
+
different, you can specify the correct format with this argument.
|
| 256 |
+
|
| 257 |
+
When ``uri`` argument is file-like object,
|
| 258 |
+
this argument is required.
|
| 259 |
+
|
| 260 |
+
Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``.
|
| 261 |
+
encoding (str or None, optional): Changes the encoding for supported formats.
|
| 262 |
+
This argument is effective only for supported formats, i.e.
|
| 263 |
+
``"wav"`` and ``""flac"```. Valid values are
|
| 264 |
+
|
| 265 |
+
- ``"PCM_S"`` (signed integer Linear PCM)
|
| 266 |
+
- ``"PCM_U"`` (unsigned integer Linear PCM)
|
| 267 |
+
- ``"PCM_F"`` (floating point PCM)
|
| 268 |
+
- ``"ULAW"`` (mu-law)
|
| 269 |
+
- ``"ALAW"`` (a-law)
|
| 270 |
+
|
| 271 |
+
bits_per_sample (int or None, optional): Changes the bit depth for the
|
| 272 |
+
supported formats.
|
| 273 |
+
When ``format`` is one of ``"wav"`` and ``"flac"``,
|
| 274 |
+
you can change the bit depth.
|
| 275 |
+
Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
|
| 276 |
+
|
| 277 |
+
buffer_size (int, optional):
|
| 278 |
+
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``)
|
| 279 |
+
|
| 280 |
+
backend (str or None, optional):
|
| 281 |
+
I/O backend to use.
|
| 282 |
+
If ``None``, function selects backend given input and available backends.
|
| 283 |
+
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``],
|
| 284 |
+
with the corresponding backend being available.
|
| 285 |
+
(Default: ``None``)
|
| 286 |
+
|
| 287 |
+
.. seealso::
|
| 288 |
+
:ref:`backend`
|
| 289 |
+
|
| 290 |
+
compression (CodecConfig, float, int, or None, optional):
|
| 291 |
+
Compression configuration to apply.
|
| 292 |
+
|
| 293 |
+
If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided.
|
| 294 |
+
|
| 295 |
+
Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the
|
| 296 |
+
``sox`` command line interface must be provided. For instance:
|
| 297 |
+
|
| 298 |
+
``"mp3"``
|
| 299 |
+
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or
|
| 300 |
+
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``.
|
| 301 |
+
|
| 302 |
+
``"flac"``
|
| 303 |
+
Whole number from ``0`` to ``8``. ``8`` is default and highest compression.
|
| 304 |
+
|
| 305 |
+
``"ogg"``, ``"vorbis"``
|
| 306 |
+
Number from ``-1`` to ``10``; ``-1`` is the highest compression
|
| 307 |
+
and lowest quality. Default: ``3``.
|
| 308 |
+
|
| 309 |
+
Refer to http://sox.sourceforge.net/soxformat.html for more details.
|
| 310 |
+
|
| 311 |
+
"""
|
| 312 |
+
backend = dispatcher(uri, format, backend)
|
| 313 |
+
return backend.save(
|
| 314 |
+
uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
return save
|
.venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NOTE:
|
| 2 |
+
# The entire `torchaudio.backend` module is deprecated.
|
| 3 |
+
# New things should be added to `torchaudio._backend`.
|
| 4 |
+
# Only things related to backward compatibility should be placed here.
|
| 5 |
+
|
| 6 |
+
from . import common, no_backend, soundfile_backend, sox_io_backend # noqa
|
| 7 |
+
|
| 8 |
+
__all__ = []
|
.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc
ADDED
|
Binary file (12.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc
ADDED
|
Binary file (889 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Callable, Optional, Tuple, Union
|
| 3 |
+
|
| 4 |
+
from torch import Tensor
|
| 5 |
+
from torchaudio import AudioMetaData
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def load(
|
| 9 |
+
filepath: Union[str, Path],
|
| 10 |
+
out: Optional[Tensor] = None,
|
| 11 |
+
normalization: Union[bool, float, Callable] = True,
|
| 12 |
+
channels_first: bool = True,
|
| 13 |
+
num_frames: int = 0,
|
| 14 |
+
offset: int = 0,
|
| 15 |
+
filetype: Optional[str] = None,
|
| 16 |
+
) -> Tuple[Tensor, int]:
|
| 17 |
+
raise RuntimeError("No audio I/O backend is available.")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None:
|
| 21 |
+
raise RuntimeError("No audio I/O backend is available.")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def info(filepath: str) -> AudioMetaData:
|
| 25 |
+
raise RuntimeError("No audio I/O backend is available.")
|
.venv/lib/python3.11/site-packages/torchaudio/backend/common.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def __getattr__(name: str):
|
| 2 |
+
if name == "AudioMetaData":
|
| 3 |
+
import warnings
|
| 4 |
+
|
| 5 |
+
warnings.warn(
|
| 6 |
+
"`torchaudio.backend.common.AudioMetaData` has been moved to "
|
| 7 |
+
"`torchaudio.AudioMetaData`. Please update the import path.",
|
| 8 |
+
stacklevel=2,
|
| 9 |
+
)
|
| 10 |
+
from torchaudio import AudioMetaData
|
| 11 |
+
|
| 12 |
+
return AudioMetaData
|
| 13 |
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
.venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def __getattr__(name: str):
|
| 2 |
+
import warnings
|
| 3 |
+
|
| 4 |
+
warnings.warn(
|
| 5 |
+
"Torchaudio's I/O functions now support par-call bakcend dispatch. "
|
| 6 |
+
"Importing backend implementation directly is no longer guaranteed to work. "
|
| 7 |
+
"Please use `backend` keyword with load/save/info function, instead of "
|
| 8 |
+
"calling the udnerlying implementation directly.",
|
| 9 |
+
stacklevel=2,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
from torchaudio._backend import soundfile_backend
|
| 13 |
+
|
| 14 |
+
return getattr(soundfile_backend, name)
|
.venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def __getattr__(name: str):
|
| 2 |
+
import warnings
|
| 3 |
+
|
| 4 |
+
warnings.warn(
|
| 5 |
+
"Torchaudio's I/O functions now support par-call bakcend dispatch. "
|
| 6 |
+
"Importing backend implementation directly is no longer guaranteed to work. "
|
| 7 |
+
"Please use `backend` keyword with load/save/info function, instead of "
|
| 8 |
+
"calling the udnerlying implementation directly.",
|
| 9 |
+
stacklevel=2,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
from . import _sox_io_backend
|
| 13 |
+
|
| 14 |
+
return getattr(_sox_io_backend, name)
|
.venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ._alignment import forced_align, merge_tokens, TokenSpan
|
| 2 |
+
from .filtering import (
|
| 3 |
+
allpass_biquad,
|
| 4 |
+
band_biquad,
|
| 5 |
+
bandpass_biquad,
|
| 6 |
+
bandreject_biquad,
|
| 7 |
+
bass_biquad,
|
| 8 |
+
biquad,
|
| 9 |
+
contrast,
|
| 10 |
+
dcshift,
|
| 11 |
+
deemph_biquad,
|
| 12 |
+
dither,
|
| 13 |
+
equalizer_biquad,
|
| 14 |
+
filtfilt,
|
| 15 |
+
flanger,
|
| 16 |
+
gain,
|
| 17 |
+
highpass_biquad,
|
| 18 |
+
lfilter,
|
| 19 |
+
lowpass_biquad,
|
| 20 |
+
overdrive,
|
| 21 |
+
phaser,
|
| 22 |
+
riaa_biquad,
|
| 23 |
+
treble_biquad,
|
| 24 |
+
vad,
|
| 25 |
+
)
|
| 26 |
+
from .functional import (
|
| 27 |
+
add_noise,
|
| 28 |
+
amplitude_to_DB,
|
| 29 |
+
apply_beamforming,
|
| 30 |
+
apply_codec,
|
| 31 |
+
compute_deltas,
|
| 32 |
+
convolve,
|
| 33 |
+
create_dct,
|
| 34 |
+
DB_to_amplitude,
|
| 35 |
+
deemphasis,
|
| 36 |
+
detect_pitch_frequency,
|
| 37 |
+
edit_distance,
|
| 38 |
+
fftconvolve,
|
| 39 |
+
frechet_distance,
|
| 40 |
+
griffinlim,
|
| 41 |
+
inverse_spectrogram,
|
| 42 |
+
linear_fbanks,
|
| 43 |
+
loudness,
|
| 44 |
+
mask_along_axis,
|
| 45 |
+
mask_along_axis_iid,
|
| 46 |
+
melscale_fbanks,
|
| 47 |
+
mu_law_decoding,
|
| 48 |
+
mu_law_encoding,
|
| 49 |
+
mvdr_weights_rtf,
|
| 50 |
+
mvdr_weights_souden,
|
| 51 |
+
phase_vocoder,
|
| 52 |
+
pitch_shift,
|
| 53 |
+
preemphasis,
|
| 54 |
+
psd,
|
| 55 |
+
resample,
|
| 56 |
+
rnnt_loss,
|
| 57 |
+
rtf_evd,
|
| 58 |
+
rtf_power,
|
| 59 |
+
sliding_window_cmn,
|
| 60 |
+
spectral_centroid,
|
| 61 |
+
spectrogram,
|
| 62 |
+
speed,
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
__all__ = [
|
| 66 |
+
"amplitude_to_DB",
|
| 67 |
+
"compute_deltas",
|
| 68 |
+
"create_dct",
|
| 69 |
+
"melscale_fbanks",
|
| 70 |
+
"linear_fbanks",
|
| 71 |
+
"DB_to_amplitude",
|
| 72 |
+
"loudness",
|
| 73 |
+
"detect_pitch_frequency",
|
| 74 |
+
"griffinlim",
|
| 75 |
+
"mask_along_axis",
|
| 76 |
+
"mask_along_axis_iid",
|
| 77 |
+
"mu_law_encoding",
|
| 78 |
+
"mu_law_decoding",
|
| 79 |
+
"phase_vocoder",
|
| 80 |
+
"sliding_window_cmn",
|
| 81 |
+
"spectrogram",
|
| 82 |
+
"inverse_spectrogram",
|
| 83 |
+
"spectral_centroid",
|
| 84 |
+
"allpass_biquad",
|
| 85 |
+
"band_biquad",
|
| 86 |
+
"bandpass_biquad",
|
| 87 |
+
"bandreject_biquad",
|
| 88 |
+
"bass_biquad",
|
| 89 |
+
"biquad",
|
| 90 |
+
"contrast",
|
| 91 |
+
"dither",
|
| 92 |
+
"dcshift",
|
| 93 |
+
"deemph_biquad",
|
| 94 |
+
"equalizer_biquad",
|
| 95 |
+
"filtfilt",
|
| 96 |
+
"flanger",
|
| 97 |
+
"forced_align",
|
| 98 |
+
"merge_tokens",
|
| 99 |
+
"TokenSpan",
|
| 100 |
+
"gain",
|
| 101 |
+
"highpass_biquad",
|
| 102 |
+
"lfilter",
|
| 103 |
+
"lowpass_biquad",
|
| 104 |
+
"overdrive",
|
| 105 |
+
"phaser",
|
| 106 |
+
"riaa_biquad",
|
| 107 |
+
"treble_biquad",
|
| 108 |
+
"vad",
|
| 109 |
+
"apply_codec",
|
| 110 |
+
"resample",
|
| 111 |
+
"edit_distance",
|
| 112 |
+
"pitch_shift",
|
| 113 |
+
"rnnt_loss",
|
| 114 |
+
"psd",
|
| 115 |
+
"mvdr_weights_souden",
|
| 116 |
+
"mvdr_weights_rtf",
|
| 117 |
+
"rtf_evd",
|
| 118 |
+
"rtf_power",
|
| 119 |
+
"apply_beamforming",
|
| 120 |
+
"fftconvolve",
|
| 121 |
+
"convolve",
|
| 122 |
+
"add_noise",
|
| 123 |
+
"speed",
|
| 124 |
+
"preemphasis",
|
| 125 |
+
"deemphasis",
|
| 126 |
+
"frechet_distance",
|
| 127 |
+
]
|
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (2.63 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc
ADDED
|
Binary file (6.77 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc
ADDED
|
Binary file (74 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71e5719c3daaa09433b5ece2431df353ef399f7678bc6bee1f1ebff9b16f9c13
|
| 3 |
+
size 115834
|
.venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import List, Optional, Tuple
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from torch import Tensor
|
| 6 |
+
from torchaudio._extension import fail_if_no_align
|
| 7 |
+
|
| 8 |
+
__all__ = []
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@fail_if_no_align
|
| 12 |
+
def forced_align(
|
| 13 |
+
log_probs: Tensor,
|
| 14 |
+
targets: Tensor,
|
| 15 |
+
input_lengths: Optional[Tensor] = None,
|
| 16 |
+
target_lengths: Optional[Tensor] = None,
|
| 17 |
+
blank: int = 0,
|
| 18 |
+
) -> Tuple[Tensor, Tensor]:
|
| 19 |
+
r"""Align a CTC label sequence to an emission.
|
| 20 |
+
|
| 21 |
+
.. devices:: CPU CUDA
|
| 22 |
+
|
| 23 |
+
.. properties:: TorchScript
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
log_probs (Tensor): log probability of CTC emission output.
|
| 27 |
+
Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length,
|
| 28 |
+
`C` is the number of characters in alphabet including blank.
|
| 29 |
+
targets (Tensor): Target sequence. Tensor of shape `(B, L)`,
|
| 30 |
+
where `L` is the target length.
|
| 31 |
+
input_lengths (Tensor or None, optional):
|
| 32 |
+
Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`.
|
| 33 |
+
target_lengths (Tensor or None, optional):
|
| 34 |
+
Lengths of the targets. 1-D Tensor of shape `(B,)`.
|
| 35 |
+
blank_id (int, optional): The index of blank symbol in CTC emission. (Default: 0)
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
Tuple(Tensor, Tensor):
|
| 39 |
+
Tensor: Label for each time step in the alignment path computed using forced alignment.
|
| 40 |
+
|
| 41 |
+
Tensor: Log probability scores of the labels for each time step.
|
| 42 |
+
|
| 43 |
+
Note:
|
| 44 |
+
The sequence length of `log_probs` must satisfy:
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
.. math::
|
| 48 |
+
L_{\text{log\_probs}} \ge L_{\text{label}} + N_{\text{repeat}}
|
| 49 |
+
|
| 50 |
+
where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens.
|
| 51 |
+
For example, in str `"aabbc"`, the number of repeats are `2`.
|
| 52 |
+
|
| 53 |
+
Note:
|
| 54 |
+
The current version only supports ``batch_size==1``.
|
| 55 |
+
"""
|
| 56 |
+
if blank in targets:
|
| 57 |
+
raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")
|
| 58 |
+
if torch.max(targets) >= log_probs.shape[-1]:
|
| 59 |
+
raise ValueError("targets values must be less than the CTC dimension")
|
| 60 |
+
|
| 61 |
+
if input_lengths is None:
|
| 62 |
+
batch_size, length = log_probs.size(0), log_probs.size(1)
|
| 63 |
+
input_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=log_probs.device)
|
| 64 |
+
if target_lengths is None:
|
| 65 |
+
batch_size, length = targets.size(0), targets.size(1)
|
| 66 |
+
target_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=targets.device)
|
| 67 |
+
|
| 68 |
+
# For TorchScript compatibility
|
| 69 |
+
assert input_lengths is not None
|
| 70 |
+
assert target_lengths is not None
|
| 71 |
+
|
| 72 |
+
paths, scores = torch.ops.torchaudio.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
|
| 73 |
+
return paths, scores
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@dataclass
class TokenSpan:
    """TokenSpan()

    A single token together with its aligned time span and score.
    Produced by :py:func:`merge_tokens`.
    """

    token: int
    """The token"""
    start: int
    """The start time (inclusive) in emission time axis."""
    end: int
    """The end time (exclusive) in emission time axis."""
    score: float
    """The score of this token."""

    def __len__(self) -> int:
        """Length of the span on the emission time axis."""
        duration = self.end - self.start
        return duration
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def merge_tokens(tokens: Tensor, scores: Tensor, blank: int = 0) -> List[TokenSpan]:
    """Removes repeated tokens and blank tokens from the given CTC token sequence.

    Args:
        tokens (Tensor): Alignment tokens (unbatched) returned from :py:func:`forced_align`.
            Shape: `(time, )`.
        scores (Tensor): Alignment scores (unbatched) returned from :py:func:`forced_align`.
            Shape: `(time, )`. When computing the token-size score, the given score is averaged
            across the corresponding time span.

    Returns:
        list of TokenSpan

    Example:
        >>> aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths)
        >>> token_spans = merge_tokens(aligned_tokens[0], scores[0])
    """
    if tokens.ndim != 1 or scores.ndim != 1:
        raise ValueError("`tokens` and `scores` must be 1D Tensor.")
    if len(tokens) != len(scores):
        raise ValueError("`tokens` and `scores` must be the same length.")

    # Pad both ends with a sentinel (-1) so the very first and very last tokens
    # also register as change points.
    sentinel = torch.tensor([-1], device=tokens.device)
    boundary = torch.diff(tokens, prepend=sentinel, append=sentinel)
    change_points = torch.nonzero(boundary != 0).squeeze().tolist()
    token_list = tokens.tolist()

    spans = []
    # Consecutive change points delimit one run of identical tokens each.
    for begin, finish in zip(change_points[:-1], change_points[1:]):
        tok = token_list[begin]
        if tok == blank:
            continue
        mean_score = scores[begin:finish].mean().item()
        spans.append(TokenSpan(token=tok, start=begin, end=finish, score=mean_score))
    return spans
|
.venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py
ADDED
|
@@ -0,0 +1,1669 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import warnings
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from torch import Tensor
|
| 7 |
+
|
| 8 |
+
from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _dB2Linear(x: float) -> float:
|
| 12 |
+
return math.exp(x * math.log(10) / 20.0)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _generate_wave_table(
    wave_type: str,
    data_type: str,
    table_size: int,
    min: float,
    max: float,
    phase: float,
    device: torch.device,
) -> Tensor:
    r"""A helper function for phaser. Generates a table with given parameters.

    Args:
        wave_type (str): ``"SINE"`` or ``"TRIANGLE"`` (the code matches the literal
            string ``"TRIANGLE"``; any other value leaves the pre-scale table all zeros)
        data_type (str): desired data_type ( `INT` or `FLOAT` )
        table_size (int): desired table size
        min (float): desired min value (shadows the builtin; kept for API compatibility)
        max (float): desired max value (shadows the builtin; kept for API compatibility)
        phase (float): desired phase, in radians
        device (torch.device): Torch device on which table must be generated

    Returns:
        Tensor: A 1D tensor with wave table values
    """

    # Convert the phase (radians, one period == 2*pi) into an integer table
    # offset, rounding to the nearest entry.
    phase_offset = int(phase / math.pi / 2 * table_size + 0.5)

    t = torch.arange(table_size, device=device, dtype=torch.int32)

    # Table index shifted by the phase offset, wrapped around the table.
    point = (t + phase_offset) % table_size

    # Accumulator in [0, 1] before rescaling; stays zero for unknown wave types.
    d = torch.zeros_like(point, device=device, dtype=torch.float64)

    if wave_type == "SINE":
        # Sine wave mapped from [-1, 1] into [0, 1].
        d = (torch.sin(point.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2
    elif wave_type == "TRIANGLE":
        # Triangle wave assembled from four quarter-period linear segments.
        d = point.to(torch.float64) * 2 / table_size
        # `value` selects which quarter of the period each entry falls in (0..3).
        value = torch.div(4 * point, table_size, rounding_mode="floor")
        d[value == 0] = d[value == 0] + 0.5
        d[value == 1] = 1.5 - d[value == 1]
        d[value == 2] = 1.5 - d[value == 2]
        d[value == 3] = d[value == 3] - 1.5

    # Rescale from [0, 1] to [min, max].
    d = d * (max - min) + min

    if data_type == "INT":
        # Round half away from zero before truncating to int32.
        mask = d < 0
        d[mask] = d[mask] - 0.5
        d[~mask] = d[~mask] + 0.5
        d = d.to(torch.int32)
    elif data_type == "FLOAT":
        d = d.to(torch.float32)

    return d
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design two-pole all-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform(torch.Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    # Normalized angular frequency and the cookbook "alpha" bandwidth term.
    w0 = 2 * math.pi * central_freq / sample_rate
    cos_w0 = torch.cos(w0)
    alpha = torch.sin(w0) / 2 / Q

    # Audio-EQ-cookbook all-pass coefficients.
    b0 = 1 - alpha
    b1 = -2 * cos_w0
    b2 = 1 + alpha
    a0 = 1 + alpha
    a1 = -2 * cos_w0
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def band_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    noise: bool = False,
) -> Tensor:
    r"""Design two-pole band filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).
        noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion).
            If ``False``, uses mode oriented to pitched audio, i.e. voice, singing,
            or instrumental music (Default: ``False``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    w0 = 2 * math.pi * central_freq / sample_rate
    # Filter bandwidth in Hz, derived from the quality factor.
    bw_Hz = central_freq / Q

    # Numerator is a pure gain: no b1/b2 terms in this SoX design.
    b1 = 0.0
    b2 = 0.0

    a0 = 1.0
    a2 = torch.exp(-2 * math.pi * bw_Hz / sample_rate)
    a1 = -4 * a2 / (1 + a2) * torch.cos(w0)

    b0 = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2)

    if noise:
        # Alternate (un-pitched) mode rescales the numerator gain.
        scale = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / b0
        b0 = scale * b0

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def bandpass_biquad(
    waveform: Tensor,
    sample_rate: int,
    central_freq: float,
    Q: float = 0.707,
    const_skirt_gain: bool = False,
) -> Tensor:
    r"""Design two-pole band-pass filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
        const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q).
            If ``False``, uses a constant 0dB peak gain. (Default: ``False``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    w0 = 2 * math.pi * central_freq / sample_rate
    sin_w0 = torch.sin(w0)
    alpha = sin_w0 / 2 / Q

    # Constant-skirt-gain variant scales the numerator by sin(w0)/2 instead of alpha.
    if const_skirt_gain:
        peak = sin_w0 / 2
    else:
        peak = alpha

    cos_w0 = torch.cos(w0)
    b0 = peak
    b1 = 0.0
    b2 = -peak
    a0 = 1 + alpha
    a1 = -2 * cos_w0
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design two-pole band-reject filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        central_freq (float or torch.Tensor): central frequency (in Hz)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    # Normalized angular frequency and the cookbook "alpha" bandwidth term.
    w0 = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(w0) / 2 / Q
    cos_w0 = torch.cos(w0)

    # Audio-EQ-cookbook notch coefficients (unity numerator gain).
    b0 = 1.0
    b1 = -2 * cos_w0
    b2 = 1.0
    a0 = 1 + alpha
    a1 = -2 * cos_w0
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def bass_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 100,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a bass tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype = waveform.dtype
    device = waveform.device
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    w0 = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(w0) / 2 / Q
    # Shelf amplitude: exp(gain/40 * ln 10) == 10^(gain/40).
    A = torch.exp(gain / 40 * math.log(10))

    cos_w0 = torch.cos(w0)
    sqrt_gain_term = 2 * torch.sqrt(A) * alpha
    shelf_minus = (A - 1) * cos_w0
    shelf_plus = (A + 1) * cos_w0

    # Low-shelf coefficients from the audio-EQ cookbook.
    b0 = A * ((A + 1) - shelf_minus + sqrt_gain_term)
    b1 = 2 * A * ((A - 1) - shelf_plus)
    b2 = A * ((A + 1) - shelf_minus - sqrt_gain_term)
    a0 = (A + 1) + shelf_minus + sqrt_gain_term
    a1 = -2 * ((A - 1) + shelf_plus)
    a2 = (A + 1) + shelf_minus - sqrt_gain_term

    # Normalize every coefficient by a0 so the realized filter has a0 == 1.
    return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor:
    r"""Perform a biquad filter of input tensor. Initial conditions set to 0.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        b0 (float or torch.Tensor): numerator coefficient of current input, x[n]
        b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1]
        b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2]
        a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1
        a1 (float or torch.Tensor): denominator coefficient of current output y[n-1]
        a2 (float or torch.Tensor): denominator coefficient of current output y[n-2]

    Returns:
        Tensor: Waveform with dimension of `(..., time)`

    Reference:
        - https://en.wikipedia.org/wiki/Digital_biquad_filter
    """

    device = waveform.device
    dtype = waveform.dtype

    # Coerce each scalar/tensor coefficient to a 1-element tensor on the
    # waveform's device and dtype, then assemble the two coefficient vectors.
    a_coeffs = torch.cat(
        [
            torch.as_tensor(a0, dtype=dtype, device=device).view(1),
            torch.as_tensor(a1, dtype=dtype, device=device).view(1),
            torch.as_tensor(a2, dtype=dtype, device=device).view(1),
        ]
    )
    b_coeffs = torch.cat(
        [
            torch.as_tensor(b0, dtype=dtype, device=device).view(1),
            torch.as_tensor(b1, dtype=dtype, device=device).view(1),
            torch.as_tensor(b2, dtype=dtype, device=device).view(1),
        ]
    )

    return lfilter(waveform, a_coeffs, b_coeffs)
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor:
    r"""Apply contrast effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Comparable with compression, this effect modifies an audio signal to make it sound louder

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        enhancement_amount (float, optional): controls the amount of the enhancement
            Allowed range of values for enhancement_amount : 0-100
            Note that enhancement_amount = 0 still gives a significant contrast enhancement

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """

    if not 0 <= enhancement_amount <= 100:
        raise ValueError("Allowed range of values for enhancement_amount : 0-100")

    # SoX maps the 0-100 user range onto a small modulation depth.
    depth = enhancement_amount / 750.0

    scaled = waveform * (math.pi / 2)
    modulation = depth * torch.sin(scaled * 4)
    return torch.sin(scaled + modulation)
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor:
    r"""Apply a DC shift to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    This can be useful to remove a DC offset
    (caused perhaps by a hardware problem in the recording chain) from the audio

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        shift (float): indicates the amount to shift the audio
            Allowed range of values for shift : -2.0 to +2.0
        limiter_gain (float of None, optional): It is used only on peaks to prevent clipping
            It should have a value much less than 1 (e.g. 0.05 or 0.02)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    # Work on a copy: the masked index assignments below would otherwise write
    # through to the caller's tensor (the previous code aliased the input).
    output_waveform = waveform.clone()
    limiter_threshold = 0.0

    if limiter_gain is not None:
        limiter_threshold = 1.0 - (abs(shift) - limiter_gain)

    # Note:
    # the following index-based update breaks auto-grad support
    if limiter_gain is not None and shift > 0:
        # Positive shift: compress samples above the threshold before shifting.
        mask = waveform > limiter_threshold
        temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold)
        output_waveform[mask] = (temp + limiter_threshold + shift).clamp(max=limiter_threshold)
        output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1)
    elif limiter_gain is not None and shift < 0:
        # Negative shift: compress samples below the negative threshold before shifting.
        mask = waveform < -limiter_threshold
        temp = (waveform[mask] + limiter_threshold) * limiter_gain / (1 - limiter_threshold)
        output_waveform[mask] = (temp - limiter_threshold + shift).clamp(min=-limiter_threshold)
        output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1)
    else:
        # No limiter (or zero shift): plain shift with clipping to [-1, 1].
        output_waveform = (waveform + shift).clamp(min=-1, max=1)

    return output_waveform
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
    r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000``

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """

    # Fixed shelf parameters per supported sample rate (from SoX).
    if sample_rate == 44100:
        central_freq = 5283
        width_slope = 0.4845
        gain = -9.477
    elif sample_rate == 48000:
        central_freq = 5356
        width_slope = 0.479
        gain = -9.62
    else:
        raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)")

    w0 = 2 * math.pi * central_freq / sample_rate
    A = math.exp(gain / 40.0 * math.log(10))
    alpha = math.sin(w0) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2)

    cos_w0 = math.cos(w0)
    sqrt_gain_term = 2 * math.sqrt(A) * alpha
    shelf_minus = (A - 1) * cos_w0
    shelf_plus = (A + 1) * cos_w0

    # High-shelf coefficients from the audio-EQ cookbook.
    b0 = A * ((A + 1) + shelf_minus + sqrt_gain_term)
    b1 = -2 * A * ((A - 1) + shelf_plus)
    b2 = A * ((A + 1) + shelf_minus - sqrt_gain_term)
    a0 = (A + 1) - shelf_minus + sqrt_gain_term
    a1 = 2 * ((A - 1) - shelf_plus)
    a2 = (A + 1) - shelf_minus - sqrt_gain_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor:
|
| 465 |
+
r"""Noise shaping is calculated by error:
|
| 466 |
+
error[n] = dithered[n] - original[n]
|
| 467 |
+
noise_shaped_waveform[n] = dithered[n] + error[n-1]
|
| 468 |
+
"""
|
| 469 |
+
wf_shape = waveform.size()
|
| 470 |
+
waveform = waveform.reshape(-1, wf_shape[-1])
|
| 471 |
+
|
| 472 |
+
dithered_shape = dithered_waveform.size()
|
| 473 |
+
dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1])
|
| 474 |
+
|
| 475 |
+
error = dithered_waveform - waveform
|
| 476 |
+
|
| 477 |
+
# add error[n-1] to dithered_waveform[n], so offset the error by 1 index
|
| 478 |
+
zeros = torch.zeros(1, dtype=error.dtype, device=error.device)
|
| 479 |
+
for index in range(error.size()[0]):
|
| 480 |
+
err = error[index]
|
| 481 |
+
error_offset = torch.cat((zeros, err))
|
| 482 |
+
error[index] = error_offset[: waveform.size()[1]]
|
| 483 |
+
|
| 484 |
+
noise_shaped = dithered_waveform + error
|
| 485 |
+
return noise_shaped.reshape(dithered_shape[:-1] + noise_shaped.shape[-1:])
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor:
    r"""Apply a probability distribution function on a waveform.

    Triangular probability density function (TPDF) dither noise has a
    triangular distribution; values in the center of the range have a higher
    probability of occurring.

    Rectangular probability density function (RPDF) dither noise has a
    uniform distribution; any value in the specified range has the same
    probability of occurring.

    Gaussian probability density function (GPDF) has a normal distribution.
    The relationship of probabilities of results follows a bell-shaped,
    or Gaussian curve, typical of dither generated by analog sources.

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time)
        density_function (str, optional): The density function of a
            continuous random variable (Default: ``"TPDF"``)
            Options: Triangular Probability Density Function - `TPDF`
                     Rectangular Probability Density Function - `RPDF`
                     Gaussian Probability Density Function - `GPDF`
    Returns:
        Tensor: waveform dithered with TPDF
    """

    # pack batch
    shape = waveform.size()
    waveform = waveform.reshape(-1, shape[-1])

    # Exclusive upper bounds for the random index draws below.
    channel_size = waveform.size()[0] - 1
    time_size = waveform.size()[-1] - 1

    # One random (channel, time) coordinate, consumed by the RPDF/GPDF
    # branches. The `> 0` guards avoid randint with a zero upper bound.
    random_channel = (
        int(
            torch.randint(
                channel_size,
                [
                    1,
                ],
            ).item()
        )
        if channel_size > 0
        else 0
    )
    random_time = (
        int(
            torch.randint(
                time_size,
                [
                    1,
                ],
            ).item()
        )
        if time_size > 0
        else 0
    )

    # 16-bit-style quantization grid: scale up, round, scale back down.
    number_of_bits = 16
    up_scaling = 2 ** (number_of_bits - 1) - 2
    signal_scaled = waveform * up_scaling
    down_scaling = 2 ** (number_of_bits - 1)

    # Initial value; overwritten in every branch below.
    signal_scaled_dis = waveform
    if density_function == "RPDF":
        # NOTE(review): the "random" offset is a sample taken from the waveform
        # itself (shifted by 0.5), not an independently drawn uniform variable —
        # confirm this is intended.
        RPDF = waveform[random_channel][random_time] - 0.5

        signal_scaled_dis = signal_scaled + RPDF
    elif density_function == "GPDF":
        # TODO Replace by distribution code once
        # https://github.com/pytorch/pytorch/issues/29843 is resolved
        # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample()

        # Approximate a Gaussian offset by summing several waveform samples.
        num_rand_variables = 6

        # NOTE(review): `gaussian` is a 0-dim view into `waveform`, so the
        # in-place `+=` below also mutates that element of the packed
        # waveform — verify this side effect is intended.
        gaussian = waveform[random_channel][random_time]
        for ws in num_rand_variables * [time_size]:
            rand_chan = int(
                torch.randint(
                    channel_size,
                    [
                        1,
                    ],
                ).item()
            )
            gaussian += waveform[rand_chan][
                int(
                    torch.randint(
                        ws,
                        [
                            1,
                        ],
                    ).item()
                )
            ]

        signal_scaled_dis = signal_scaled + gaussian
    else:
        # TPDF (default): add a triangular (Bartlett) window per channel.
        # dtype needed for https://github.com/pytorch/pytorch/issues/32358
        TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device)
        TPDF = TPDF.repeat((channel_size + 1), 1)
        signal_scaled_dis = signal_scaled + TPDF

    # Quantize on the scaled grid, then map back to the [-1, 1]-style range.
    quantised_signal_scaled = torch.round(signal_scaled_dis)
    quantised_signal = quantised_signal_scaled / down_scaling

    # unpack batch
    return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:])
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor:
    r"""Apply dither

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Dither increases the perceived dynamic range of audio stored at a
    particular bit-depth by eliminating nonlinear truncation distortion
    (i.e. adding minimally perceived noise to mask distortion caused by quantization).

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time)
        density_function (str, optional):
            The density function of a continuous random variable. One of
            ``"TPDF"`` (Triangular Probability Density Function),
            ``"RPDF"`` (Rectangular Probability Density Function) or
            ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``).
        noise_shaping (bool, optional): a filtering process that shapes the spectral
            energy of quantisation error (Default: ``False``)

    Returns:
        Tensor: waveform dithered
    """
    # Quantize with the requested dither noise first.
    quantized = _apply_probability_distribution(waveform, density_function=density_function)
    # Optionally feed the quantization error back into the signal.
    if not noise_shaping:
        return quantized
    return _add_noise_shaping(quantized, waveform)
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
def equalizer_biquad(
    waveform: Tensor,
    sample_rate: int,
    center_freq: float,
    gain: float,
    Q: float = 0.707,
) -> Tensor:
    r"""Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        center_freq (float): filter's central frequency
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`
    """
    # Move the design parameters onto the waveform's dtype/device so the
    # coefficient math is device-consistent and differentiable.
    dtype = waveform.dtype
    device = waveform.device
    center_freq = torch.as_tensor(center_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    # Audio-EQ-cookbook peaking-EQ design.
    omega = 2 * math.pi * center_freq / sample_rate
    amp = torch.exp(gain / 40.0 * math.log(10))
    alpha = torch.sin(omega) / 2 / Q
    cos_omega = torch.cos(omega)

    b0 = 1 + alpha * amp
    b1 = -2 * cos_omega
    b2 = 1 - alpha * amp
    a0 = 1 + alpha / amp
    a1 = -2 * cos_omega
    a2 = 1 - alpha / amp
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 669 |
+
|
| 670 |
+
|
| 671 |
+
def filtfilt(
    waveform: Tensor,
    a_coeffs: Tensor,
    b_coeffs: Tensor,
    clamp: bool = True,
) -> Tensor:
    r"""Apply an IIR filter forward and backward to a waveform.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1.
        a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
            1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
            Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``.
            Must be same size as b_coeffs (pad with 0's as necessary).
        b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
            1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
            Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``.
            Must be same size as a_coeffs (pad with 0's as necessary).
        clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)

    Returns:
        Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
        are 2D Tensors, or `(..., time)` otherwise.
    """
    # Forward pass: do not clamp yet so the backward pass sees raw values.
    forward = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True)
    # Backward pass: reverse in time, filter again, flip back; clamping (if
    # requested) happens inside this second pass.
    reversed_forward = forward.flip(-1)
    backward = lfilter(reversed_forward, a_coeffs, b_coeffs, clamp=clamp, batching=True)
    return backward.flip(-1)
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
def flanger(
    waveform: Tensor,
    sample_rate: int,
    delay: float = 0.0,
    depth: float = 2.0,
    regen: float = 0.0,
    width: float = 71.0,
    speed: float = 0.5,
    phase: float = 25.0,
    modulation: str = "sinusoidal",
    interpolation: str = "linear",
) -> Tensor:
    r"""Apply a flanger effect to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., channel, time)` .
            Max 4 channels allowed
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        delay (float, optional): desired delay in milliseconds(ms)
            Allowed range of values are 0 to 30
        depth (float, optional): desired delay depth in milliseconds(ms)
            Allowed range of values are 0 to 10
        regen (float, optional): desired regen(feedback gain) in dB
            Allowed range of values are -95 to 95
        width (float, optional): desired width(delay gain) in dB
            Allowed range of values are 0 to 100
        speed (float, optional): modulation speed in Hz
            Allowed range of values are 0.1 to 10
        phase (float, optional): percentage phase-shift for multi-channel
            Allowed range of values are 0 to 100
        modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``)
        interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation.
            (Default: ``linear``)

    Returns:
        Tensor: Waveform of dimension of `(..., channel, time)`

    Reference:
        - http://sox.sourceforge.net/sox.html

        - Scott Lehman, `Effects Explained`_,

    .. _Effects Explained:
        https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
    """

    if modulation not in ("sinusoidal", "triangular"):
        raise ValueError('Only "sinusoidal" or "triangular" modulation allowed')

    if interpolation not in ("linear", "quadratic"):
        raise ValueError('Only "linear" or "quadratic" interpolation allowed')

    actual_shape = waveform.shape
    device, dtype = waveform.device, waveform.dtype

    if actual_shape[-2] > 4:
        raise ValueError("Max 4 channels allowed")

    # convert to 3D (batch, channels, time)
    waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1])

    # Scaling: percentages to fractions, milliseconds to seconds.
    feedback_gain = regen / 100
    delay_gain = width / 100
    channel_phase = phase / 100
    delay_min = delay / 1000
    delay_depth = depth / 1000

    n_channels = waveform.shape[-2]

    if modulation == "sinusoidal":
        wave_type = "SINE"
    else:
        wave_type = "TRIANGLE"

    # Balance output:
    in_gain = 1.0 / (1 + delay_gain)
    delay_gain = delay_gain / (1 + delay_gain)

    # Balance feedback loop:
    delay_gain = delay_gain * (1 - abs(feedback_gain))

    # Ring-buffer long enough for the maximum modulated delay (+2 slack
    # samples read by the interpolation below).
    delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5)
    delay_buf_length = delay_buf_length + 2

    delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device)
    delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device)

    # One LFO period in samples.
    lfo_length = int(sample_rate / speed)

    table_min = math.floor(delay_min * sample_rate + 0.5)
    table_max = delay_buf_length - 2.0

    # LFO lookup table mapping phase position -> delay (in samples).
    lfo = _generate_wave_table(
        wave_type=wave_type,
        data_type="FLOAT",
        table_size=lfo_length,
        min=float(table_min),
        max=float(table_max),
        phase=3 * math.pi / 2,
        device=device,
    )

    output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)

    delay_buf_pos = 0
    lfo_pos = 0
    channel_idxs = torch.arange(0, n_channels, device=device)

    # Sequential per-sample loop: each output sample feeds the delay buffer
    # used by later samples, so iterations cannot be vectorized over time.
    for i in range(waveform.shape[-1]):

        # Move the ring-buffer write head one slot backwards (wrapping).
        delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length

        # Per-channel LFO offset, then split the delay into integer and
        # fractional parts for interpolation.
        cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64)
        delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length]
        frac_delay = torch.frac(delay_tensor)
        delay_tensor = torch.floor(delay_tensor)

        int_delay = delay_tensor.to(torch.int64)

        temp = waveform[:, :, i]

        # Write the current input (plus feedback) into the delay line.
        delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain

        delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]

        int_delay = int_delay + 1

        delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]

        int_delay = int_delay + 1

        if interpolation == "linear":
            # Linear interpolation between the two neighboring delayed samples.
            delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay
        else:
            # Quadratic interpolation using a third delayed sample.
            delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length]

            int_delay = int_delay + 1

            delayed_2 = delayed_2 - delayed_0
            delayed_1 = delayed_1 - delayed_0
            a = delayed_2 * 0.5 - delayed_1
            b = delayed_1 * 2 - delayed_2 * 0.5

            delayed = delayed_0 + (a * frac_delay + b) * frac_delay

        delay_last = delayed
        # Mix dry input with the delayed (wet) signal.
        output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain

        lfo_pos = (lfo_pos + 1) % lfo_length

    return output_waveform.clamp(min=-1, max=1).view(actual_shape)
|
| 868 |
+
|
| 869 |
+
|
| 870 |
+
def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor:
    r"""Apply amplification or attenuation to the whole waveform.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time).
        gain_db (float, optional): Gain adjustment in decibels (dB) (Default: ``1.0``).

    Returns:
        Tensor: the whole waveform amplified by gain_db.
    """
    # 0 dB is the identity; return the input untouched.
    if gain_db == 0:
        return waveform

    # Convert the dB gain to a linear amplitude ratio and apply it.
    amplitude_ratio = 10 ** (gain_db / 20)
    return waveform * amplitude_ratio
|
| 890 |
+
|
| 891 |
+
|
| 892 |
+
def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        cutoff_freq (float or torch.Tensor): filter cutoff frequency
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform dimension of `(..., time)`
    """
    # Move the design parameters onto the waveform's dtype/device.
    dtype = waveform.dtype
    device = waveform.device
    cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    # Audio-EQ-cookbook high-pass design.
    omega = 2 * math.pi * cutoff_freq / sample_rate
    alpha = torch.sin(omega) / 2.0 / Q
    cos_omega = torch.cos(omega)

    b0 = (1 + cos_omega) / 2
    b1 = -1 - cos_omega
    b2 = b0
    a0 = 1 + alpha
    a1 = -2 * cos_omega
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 923 |
+
|
| 924 |
+
|
| 925 |
+
def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor):
|
| 926 |
+
n_order = a_coeffs_flipped.size(1)
|
| 927 |
+
a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2)
|
| 928 |
+
for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)):
|
| 929 |
+
windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order]
|
| 930 |
+
o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t()
|
| 931 |
+
padded_output_waveform[:, :, i_sample + n_order - 1] = o0
|
| 932 |
+
|
| 933 |
+
|
| 934 |
+
# Use the compiled C++ loop for the CPU path when the torchaudio extension is
# available; otherwise fall back to the pure-Python implementation above.
if _IS_TORCHAUDIO_EXT_AVAILABLE:
    _lfilter_core_cpu_loop = torch.ops.torchaudio._lfilter_core_loop
else:
    _lfilter_core_cpu_loop = _lfilter_core_generic_loop
|
| 938 |
+
|
| 939 |
+
|
| 940 |
+
def _lfilter_core(
    waveform: Tensor,
    a_coeffs: Tensor,
    b_coeffs: Tensor,
) -> Tensor:
    """Evaluate the IIR difference equation on a 3D `(batch, channel, time)` waveform.

    Pure-Python fallback behind :func:`lfilter`. Coefficients are 2D with
    lowest-delay coefficients first; both must be the same size.
    """

    if a_coeffs.size() != b_coeffs.size():
        raise ValueError(
            "Expected coeffs to be the same size."
            f"Found a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
        )
    if waveform.ndim != 3:
        raise ValueError(f"Expected waveform to be 3 dimensional. Found: {waveform.ndim}")
    if not (waveform.device == a_coeffs.device == b_coeffs.device):
        raise ValueError(
            "Expected waveform and coeffs to be on the same device."
            f"Found: waveform device:{waveform.device}, a_coeffs device: {a_coeffs.device}, "
            f"b_coeffs device: {b_coeffs.device}"
        )

    n_batch, n_channel, n_sample = waveform.size()
    n_order = a_coeffs.size(1)
    if n_order <= 0:
        raise ValueError(f"Expected n_order to be positive. Found: {n_order}")

    # Pad the input and create output
    # (the n_order - 1 leading zeros act as the filter's initial state)
    padded_waveform = torch.nn.functional.pad(waveform, [n_order - 1, 0])
    padded_output_waveform = torch.zeros_like(padded_waveform)

    # Set up the coefficients matrix
    # Flip coefficients' order
    a_coeffs_flipped = a_coeffs.flip(1)
    b_coeffs_flipped = b_coeffs.flip(1)

    # calculate windowed_input_signal in parallel using convolution
    input_signal_windows = torch.nn.functional.conv1d(padded_waveform, b_coeffs_flipped.unsqueeze(1), groups=n_channel)

    # Normalize by a0 in place. `flip` returned copies above, so the caller's
    # coefficient tensors are not modified by these in-place divisions.
    input_signal_windows.div_(a_coeffs[:, :1])
    a_coeffs_flipped.div_(a_coeffs[:, :1])

    # The recursive part must run sequentially; use the compiled CPU loop only
    # when every tensor involved lives on the CPU.
    if (
        input_signal_windows.device == torch.device("cpu")
        and a_coeffs_flipped.device == torch.device("cpu")
        and padded_output_waveform.device == torch.device("cpu")
    ):
        _lfilter_core_cpu_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform)
    else:
        _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform)

    # Drop the synthetic leading state samples.
    output = padded_output_waveform[:, :, n_order - 1 :]
    return output
|
| 992 |
+
|
| 993 |
+
|
| 994 |
+
# Prefer the fused C++ lfilter op when the torchaudio extension is present;
# otherwise use the Python reference implementation defined above.
if _IS_TORCHAUDIO_EXT_AVAILABLE:
    _lfilter = torch.ops.torchaudio._lfilter
else:
    _lfilter = _lfilter_core
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor:
    r"""Perform an IIR filter by evaluating difference equation, using differentiable implementation
    developed independently by *Yu et al.* :cite:`ismir_YuF23` and *Forgione et al.* :cite:`forgione2021dynonet`.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Note:
        To avoid numerical problems, small filter order is preferred.
        Using double precision could also minimize numerical precision errors.

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1.
        a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either
            1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
            Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``.
            Must be same size as b_coeffs (pad with 0's as necessary).
        b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either
            1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`.
            Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``.
            Must be same size as a_coeffs (pad with 0's as necessary).
        clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
        batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at
            least 2D, and the size of second axis from last should equals to ``num_filters``.
            The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :],
            a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``)

    Returns:
        Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs``
        are 2D Tensors, or `(..., time)` otherwise.

    Raises:
        ValueError: If the coefficient tensors differ in size, have more than
            2 dimensions, or (with ``batching=True``) the waveform's filter
            axis does not match ``num_filters``.
    """
    if a_coeffs.size() != b_coeffs.size():
        raise ValueError(
            "Expected coeffs to be the same size. "
            f"Found: a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}"
        )
    # Bug fix: the original message claimed "greater than 1 dimension" was
    # expected, the opposite of what this check enforces (at most 2D).
    if a_coeffs.ndim > 2:
        raise ValueError(f"Expected coeffs to be 1D or 2D. Found: {a_coeffs.ndim}")

    if a_coeffs.ndim > 1:
        if batching:
            if waveform.ndim <= 0:
                raise ValueError(f"Expected waveform to have a positive number of dimensions. Found: {waveform.ndim}")
            if waveform.shape[-2] != a_coeffs.shape[0]:
                raise ValueError(
                    "Expected number of batches in waveform and coeffs to be the same. "
                    f"Found: coeffs batches: {a_coeffs.shape[0]}, waveform batches: {waveform.shape[-2]}"
                )
        else:
            # Broadcast the same waveform across every filter in the bank.
            waveform = torch.stack([waveform] * a_coeffs.shape[0], -2)
    else:
        # Promote 1D coefficients to a single-filter bank.
        a_coeffs = a_coeffs.unsqueeze(0)
        b_coeffs = b_coeffs.unsqueeze(0)

    # pack batch: collapse leading dims to (batch, num_filters, time)
    shape = waveform.size()
    waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1])
    output = _lfilter(waveform, a_coeffs, b_coeffs)

    if clamp:
        output = torch.clamp(output, min=-1.0, max=1.0)

    # unpack batch
    output = output.reshape(shape[:-1] + output.shape[-1:])

    return output
|
| 1067 |
+
|
| 1068 |
+
|
| 1069 |
+
def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor:
    r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        cutoff_freq (float or torch.Tensor): filter cutoff frequency
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`
    """
    # Move the design parameters onto the waveform's dtype/device.
    dtype = waveform.dtype
    device = waveform.device
    cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)

    # Audio-EQ-cookbook low-pass design.
    omega = 2 * math.pi * cutoff_freq / sample_rate
    alpha = torch.sin(omega) / 2 / Q
    cos_omega = torch.cos(omega)

    b0 = (1 - cos_omega) / 2
    b1 = 1 - cos_omega
    b2 = b0
    a0 = 1 + alpha
    a1 = -2 * cos_omega
    a2 = 1 - alpha
    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 1100 |
+
|
| 1101 |
+
|
| 1102 |
+
def _overdrive_core_loop_generic(
|
| 1103 |
+
waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor
|
| 1104 |
+
):
|
| 1105 |
+
for i in range(waveform.shape[-1]):
|
| 1106 |
+
last_out = temp[:, i] - last_in + 0.995 * last_out
|
| 1107 |
+
last_in = temp[:, i]
|
| 1108 |
+
output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
|
| 1109 |
+
|
| 1110 |
+
|
| 1111 |
+
# Use the compiled C++ overdrive loop for the CPU path when the torchaudio
# extension is available; otherwise fall back to the Python loop above.
if _IS_TORCHAUDIO_EXT_AVAILABLE:
    _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
else:
    _overdrive_core_loop_cpu = _overdrive_core_loop_generic
|
| 1115 |
+
|
| 1116 |
+
|
| 1117 |
+
def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
    r"""Apply a overdrive effect to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This effect applies a non linear distortion to the audio signal.

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        gain (float, optional): desired gain at the boost (or attenuation) in dB
            Allowed range of values are 0 to 100
        colour (float, optional): controls the amount of even harmonic content in the over-driven output
            Allowed range of values are 0 to 100
        Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    actual_shape = waveform.shape
    device, dtype = waveform.device, waveform.dtype

    # convert to 2D (..,time)
    waveform = waveform.view(-1, actual_shape[-1])

    # presumably converts the dB gain to a linear amplitude factor — helper
    # defined elsewhere in this file; verify.
    gain = _dB2Linear(gain)
    colour = colour / 200
    last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)
    last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device)

    # Pre-gain plus a DC offset derived from `colour`.
    temp = waveform * gain + colour

    # Soft-clip transfer curve: saturate values below -1 at -2/3 ...
    mask1 = temp < -1
    temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device)
    # Wrapping the constant with Tensor is required for Torchscript

    # ... values above 1 at +2/3 ...
    mask2 = temp > 1
    temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device)

    # ... and apply the cubic x - x**3/3 to everything in between.
    mask3 = ~mask1 & ~mask2
    temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3)

    output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)

    # Uses CPU optimized loop function if available for CPU device
    if device == torch.device("cpu"):
        _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
    else:
        _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform)

    return output_waveform.clamp(min=-1, max=1).view(actual_shape)
|
| 1171 |
+
|
| 1172 |
+
|
| 1173 |
+
def phaser(
    waveform: Tensor,
    sample_rate: int,
    gain_in: float = 0.4,
    gain_out: float = 0.74,
    delay_ms: float = 3.0,
    decay: float = 0.4,
    mod_speed: float = 0.5,
    sinusoidal: bool = True,
) -> Tensor:
    r"""Apply a phasing effect to the audio. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain_in (float, optional): desired input gain at the boost (or attenuation) in dB
            Allowed range of values are 0 to 1
        gain_out (float, optional): desired output gain at the boost (or attenuation) in dB
            Allowed range of values are 0 to 1e9
        delay_ms (float, optional): desired delay in milliseconds
            Allowed range of values are 0 to 5.0
        decay (float, optional): desired decay relative to gain-in
            Allowed range of values are 0 to 0.99
        mod_speed (float, optional): modulation speed in Hz
            Allowed range of values are 0.1 to 2
        sinusoidal (bool, optional): If ``True``, uses sinusoidal modulation (preferable for multiple instruments)
            If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect)
            (Default: ``True``)

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - Scott Lehman, `Effects Explained`_.

    .. _Effects Explained:
        https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html
    """
    orig_shape = waveform.shape
    device, dtype = waveform.device, waveform.dtype

    # Collapse all leading dimensions so the effect operates on a 2D
    # (channels, time) view; the original shape is restored on return.
    waveform = waveform.view(-1, orig_shape[-1])

    # Delay line length in samples (rounded to nearest).
    delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5)
    delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device)

    # One full period of the modulation wave, in samples.
    mod_buf_len = int(sample_rate / mod_speed + 0.5)

    # Wave table that modulates the read position inside the delay line.
    mod_buf = _generate_wave_table(
        wave_type="SINE" if sinusoidal else "TRIANGLE",
        data_type="INT",
        table_size=mod_buf_len,
        min=1.0,
        max=float(delay_buf_len),
        phase=math.pi / 2,
        device=device,
    )

    delay_pos = 0
    mod_pos = 0

    waveform = waveform * gain_in
    delay_buf = delay_buf * decay

    # Unbind the time axis into per-sample column tensors so the sequential
    # loop below avoids repeated 2D indexing.
    input_cols = [waveform[:, i] for i in range(waveform.size(1))]
    delay_cols = [delay_buf[:, i] for i in range(delay_buf.size(1))]
    mod_vals = [mod_buf[i] for i in range(mod_buf.size(0))]

    mixed_cols = []
    for i in range(waveform.shape[-1]):
        # Modulated tap position is computed from the *current* positions,
        # then both ring-buffer cursors advance — order matters here.
        tap = int((delay_pos + mod_vals[mod_pos]) % delay_buf_len)
        mod_pos = (mod_pos + 1) % mod_buf_len
        delay_pos = (delay_pos + 1) % delay_buf_len
        mixed = input_cols[i] + delay_cols[tap]
        # Feed the mixed sample back into the delay line, attenuated by decay.
        delay_cols[delay_pos] = mixed * decay
        mixed_cols.append(mixed)

    output_waveform = torch.stack(mixed_cols, dim=1).to(dtype=dtype, device=device)
    output_waveform.mul_(gain_out)

    return output_waveform.clamp(min=-1, max=1).view(orig_shape)
|
| 1264 |
+
|
| 1265 |
+
|
| 1266 |
+
def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor:
    r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz).
            Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000``

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    # Pre-computed (zeros, poles) of the RIAA curve for each supported rate.
    zeros_and_poles = {
        44100: ([-0.2014898, 0.9233820], [0.7083149, 0.9924091]),
        48000: ([-0.1766069, 0.9321590], [0.7396325, 0.9931330]),
        88200: ([-0.1168735, 0.9648312], [0.8590646, 0.9964002]),
        96000: ([-0.1141486, 0.9676817], [0.8699137, 0.9966946]),
    }
    if sample_rate not in zeros_and_poles:
        raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k")
    zeros, poles = zeros_and_poles[sample_rate]

    # Expand (1 - z0*z^-1)(1 - z1*z^-1) into numerator coefficients.
    b0 = 1.0
    b1 = -(zeros[0] + zeros[1])
    b2 = zeros[0] * zeros[1]

    # Same expansion for the denominator, from the pole pair.
    a0 = 1.0
    a1 = -(poles[0] + poles[1])
    a2 = poles[0] * poles[1]

    # Normalize to 0dB at 1kHz: evaluate the transfer function on the unit
    # circle at 1 kHz and scale the numerator by the inverse magnitude.
    y = 2 * math.pi * 1000 / sample_rate
    b_re = b0 + b1 * math.cos(-y) + b2 * math.cos(-2 * y)
    a_re = a0 + a1 * math.cos(-y) + a2 * math.cos(-2 * y)
    b_im = b1 * math.sin(-y) + b2 * math.sin(-2 * y)
    a_im = a1 * math.sin(-y) + a2 * math.sin(-2 * y)
    g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2))

    b0, b1, b2 = b0 * g, b1 * g, b2 * g

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 1328 |
+
|
| 1329 |
+
|
| 1330 |
+
def treble_biquad(
    waveform: Tensor,
    sample_rate: int,
    gain: float,
    central_freq: float = 3000,
    Q: float = 0.707,
) -> Tensor:
    r"""Design a treble tone-control effect. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        waveform (Tensor): audio waveform of dimension of `(..., time)`
        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
        gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB.
        central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``)
        Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``).

    Returns:
        Tensor: Waveform of dimension of `(..., time)`

    Reference:
        - http://sox.sourceforge.net/sox.html
        - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF
    """
    dtype, device = waveform.dtype, waveform.device
    # Promote scalar parameters to tensors so the coefficient math below
    # works uniformly for float and tensor inputs.
    central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device)
    Q = torch.as_tensor(Q, dtype=dtype, device=device)
    gain = torch.as_tensor(gain, dtype=dtype, device=device)

    # High-shelf coefficients per the Audio EQ Cookbook.
    w0 = 2 * math.pi * central_freq / sample_rate
    alpha = torch.sin(w0) / 2 / Q
    A = torch.exp(gain / 40 * math.log(10))  # 10^(gain/40)

    sqrt_term = 2 * torch.sqrt(A) * alpha
    cos_am1 = (A - 1) * torch.cos(w0)
    cos_ap1 = (A + 1) * torch.cos(w0)

    b0 = A * ((A + 1) + cos_am1 + sqrt_term)
    b1 = -2 * A * ((A - 1) + cos_ap1)
    b2 = A * ((A + 1) + cos_am1 - sqrt_term)
    a0 = (A + 1) - cos_am1 + sqrt_term
    a1 = 2 * ((A - 1) - cos_ap1)
    a2 = (A + 1) - cos_am1 - sqrt_term

    return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
| 1379 |
+
|
| 1380 |
+
|
| 1381 |
+
def _measure(
    measure_len_ws: int,
    samples: Tensor,
    spectrum: Tensor,
    noise_spectrum: Tensor,
    spectrum_window: Tensor,
    spectrum_start: int,
    spectrum_end: int,
    cepstrum_window: Tensor,
    cepstrum_start: int,
    cepstrum_end: int,
    noise_reduction_amount: float,
    measure_smooth_time_mult: float,
    noise_up_time_mult: Tensor,
    noise_down_time_mult: Tensor,
    boot_count: int,
) -> float:
    """Return a cepstral power measurement for one window of ``samples``.

    Also updates ``spectrum`` and ``noise_spectrum`` IN PLACE with the
    smoothed signal and adaptive-noise estimates (ported from SoX's vad).
    """
    device = samples.device

    if spectrum.size(-1) != noise_spectrum.size(-1):
        raise ValueError(
            "Expected spectrum size to match noise spectrum size in final dimension."
            f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}"
        )

    dft_len_ws = spectrum.size()[-1]

    # Window the incoming samples into a zero-padded DFT buffer.
    dft_buf = torch.zeros(dft_len_ws, device=device)
    dft_buf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws]

    # SoX: lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf);
    dft_out = torch.fft.rfft(dft_buf)

    # While booting, the smoothing factor ramps with the boot counter;
    # afterwards (boot_count < 0) the configured constant is used.
    smooth: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult

    # Smooth the magnitude spectrum over time (in place).
    band = dft_out[spectrum_start:spectrum_end].abs()
    spectrum[spectrum_start:spectrum_end].mul_(smooth).add_(band * (1 - smooth))
    band = spectrum[spectrum_start:spectrum_end] ** 2

    zero_band = torch.zeros(spectrum_end - spectrum_start, device=device)
    if boot_count >= 0:
        # During boot the noise estimate tracks the signal directly.
        noise_mult = zero_band
    else:
        # Per-bin time constant: fast when noise rises, slow when it falls.
        noise_mult = torch.where(
            band > noise_spectrum[spectrum_start:spectrum_end],
            noise_up_time_mult,  # if
            noise_down_time_mult,  # else
        )

    noise_spectrum[spectrum_start:spectrum_end].mul_(noise_mult).add_(band * (1 - noise_mult))

    # Spectral subtraction, floored at zero before the square root.
    band = torch.sqrt(
        torch.max(
            zero_band,
            band - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end],
        ),
    )

    cepstrum_buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device)
    cepstrum_buf[spectrum_start:spectrum_end] = band * cepstrum_window
    cepstrum_buf[spectrum_end : dft_len_ws >> 1].zero_()

    # SoX: lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf);
    cepstrum_out = torch.fft.rfft(cepstrum_buf)

    power: float = float(torch.sum(cepstrum_out[cepstrum_start:cepstrum_end].abs().pow(2)))
    measurement = math.log(power / (cepstrum_end - cepstrum_start)) if power > 0 else -math.inf
    return max(0, 21 + measurement)
|
| 1450 |
+
|
| 1451 |
+
|
| 1452 |
+
def vad(
    waveform: Tensor,
    sample_rate: int,
    trigger_level: float = 7.0,
    trigger_time: float = 0.25,
    search_time: float = 1.0,
    allowed_gap: float = 0.25,
    pre_trigger_time: float = 0.0,
    # Fine-tuning parameters
    boot_time: float = 0.35,
    noise_up_time: float = 0.1,
    noise_down_time: float = 0.01,
    noise_reduction_amount: float = 1.35,
    measure_freq: float = 20.0,
    measure_duration: Optional[float] = None,
    measure_smooth_time: float = 0.4,
    hp_filter_freq: float = 50.0,
    lp_filter_freq: float = 6000.0,
    hp_lifter_freq: float = 150.0,
    lp_lifter_freq: float = 2000.0,
) -> Tensor:
    r"""Voice Activity Detector. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
    The algorithm currently uses a simple cepstral power measurement to detect voice,
    so may be fooled by other things, especially music.

    The effect can trim only from the front of the audio,
    so in order to trim from the back, the reverse effect must also be used.

    Args:
        waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`
            Tensor of shape `(channels, time)` is treated as a multi-channel recording
            of the same event and the resulting output will be trimmed to the earliest
            voice activity in any channel.
        sample_rate (int): Sample rate of audio signal.
        trigger_level (float, optional): The measurement level used to trigger activity detection.
            This may need to be changed depending on the noise level, signal level,
            and other characteristics of the input audio. (Default: 7.0)
        trigger_time (float, optional): The time constant (in seconds)
            used to help ignore short bursts of sound. (Default: 0.25)
        search_time (float, optional): The amount of audio (in seconds)
            to search for quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 1.0)
        allowed_gap (float, optional): The allowed gap (in seconds) between
            quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 0.25)
        pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
            before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
        boot_time (float, optional): The algorithm (internally) uses adaptive noise
            estimation/reduction in order to detect the start of the wanted audio.
            This option sets the time for the initial noise estimate. (Default: 0.35)
        noise_up_time (float, optional): Time constant used by the adaptive noise estimator
            for when the noise level is increasing. (Default: 0.1)
        noise_down_time (float, optional): Time constant used by the adaptive noise estimator
            for when the noise level is decreasing. (Default: 0.01)
        noise_reduction_amount (float, optional): Amount of noise reduction to use in
            the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
        measure_freq (float, optional): Frequency of the algorithm's
            processing/measurements. (Default: 20.0)
        measure_duration (float, optional): Measurement duration.
            (Default: Twice the measurement period; i.e. with overlap.)
        measure_smooth_time (float, optional): Time constant used to smooth
            spectral measurements. (Default: 0.4)
        hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied
            at the input to the detector algorithm. (Default: 50.0)
        lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied
            at the input to the detector algorithm. (Default: 6000.0)
        hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used
            in the detector algorithm. (Default: 150.0)
        lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used
            in the detector algorithm. (Default: 2000.0)

    Returns:
        Tensor: Tensor of audio of dimension `(..., time)`.

    Reference:
        - http://sox.sourceforge.net/sox.html
    """
    device = waveform.device

    if waveform.ndim > 2:
        warnings.warn(
            "Expected input tensor dimension of 1 for single channel"
            f" or 2 for multi-channel. Got {waveform.ndim} instead. "
            "Batch semantics is not supported. "
            "Please refer to https://github.com/pytorch/audio/issues/1348"
            " and https://github.com/pytorch/audio/issues/1468."
        )

    # Default: each measurement window spans two measurement periods (overlap).
    measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration

    # _ws suffix: length in window samples; _ns suffix: length in signal samples.
    measure_len_ws = int(sample_rate * measure_duration + 0.5)
    measure_len_ns = measure_len_ws
    # Smallest power of two >= measure_len_ws, mirroring SoX:
    # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1);
    dft_len_ws = 16
    while dft_len_ws < measure_len_ws:
        dft_len_ws *= 2

    measure_period_ns = int(sample_rate / measure_freq + 0.5)
    # Number of past measurements kept for the back-search, and its span in samples.
    measures_len = math.ceil(search_time * measure_freq)
    search_pre_trigger_len_ns = measures_len * measure_period_ns
    # Allowed gap between bursts, expressed in measurement counts.
    gap_len = int(allowed_gap * measure_freq + 0.5)

    fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5)
    samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns

    # Analysis window: constant normalization factor shaped by a Hann window.
    spectrum_window = torch.zeros(measure_len_ws, device=device)
    for i in range(measure_len_ws):
        # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32)
        spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws))
    # lsx_apply_hann(spectrum_window, (int)measure_len_ws);
    spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float)

    # DFT bin range corresponding to [hp_filter_freq, lp_filter_freq].
    spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5)
    spectrum_start: int = max(spectrum_start, 1)
    spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5)
    spectrum_end: int = min(spectrum_end, dft_len_ws // 2)

    # Same construction for the cepstral (lifter) window over the band bins.
    cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device)
    for i in range(spectrum_end - spectrum_start):
        cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start)
    # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start));
    cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float)

    # Cepstral bin range corresponding to [hp_lifter_freq, lp_lifter_freq].
    cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq)
    cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq)
    cepstrum_end = min(cepstrum_end, dft_len_ws // 4)

    if cepstrum_end <= cepstrum_start:
        raise ValueError(
            "Expected cepstrum_start to be smaller than cepstrum_end."
            f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}."
        )

    # Exponential smoothing multipliers derived from the time constants.
    noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device)
    noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device)
    measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq))
    trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq))

    # boot_count counts up to boot_count_max, then goes to -1 (= booted).
    boot_count_max = int(boot_time * measure_freq - 0.5)
    boot_count = measures_index = flushedLen_ns = 0

    # pack batch
    shape = waveform.size()
    waveform = waveform.view(-1, shape[-1])

    n_channels, ilen = waveform.size()

    # Per-channel running state for the detector.
    mean_meas = torch.zeros(n_channels, device=device)
    spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
    noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device)
    # Circular buffer of the last `measures_len` measurements per channel.
    measures = torch.zeros(n_channels, measures_len, device=device)

    has_triggered: bool = False
    num_measures_to_flush: int = 0

    # Slide a measurement window across the signal, one period at a time.
    pos = 0
    for pos in range(measure_len_ns, ilen, measure_period_ns):
        for i in range(n_channels):
            # _measure also updates spectrum[i] / noise_spectrum[i] in place.
            meas: float = _measure(
                measure_len_ws=measure_len_ws,
                samples=waveform[i, pos - measure_len_ws : pos],
                spectrum=spectrum[i],
                noise_spectrum=noise_spectrum[i],
                spectrum_window=spectrum_window,
                spectrum_start=spectrum_start,
                spectrum_end=spectrum_end,
                cepstrum_window=cepstrum_window,
                cepstrum_start=cepstrum_start,
                cepstrum_end=cepstrum_end,
                noise_reduction_amount=noise_reduction_amount,
                measure_smooth_time_mult=measure_smooth_time_mult,
                noise_up_time_mult=noise_up_time_mult,
                noise_down_time_mult=noise_down_time_mult,
                boot_count=boot_count,
            )
            measures[i, measures_index] = meas
            # Exponentially-smoothed measurement used for the trigger test.
            mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult)

            has_triggered = has_triggered or (mean_meas[i] >= trigger_level)
            if has_triggered:
                # Walk the circular measurement buffer backwards from the
                # trigger, extending through earlier bursts separated by at
                # most gap_len quiet measurements.
                n: int = measures_len
                k: int = measures_index
                jTrigger: int = n
                jZero: int = n
                j: int = 0

                for j in range(n):
                    if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len):
                        jZero = jTrigger = j
                    elif (measures[i, k] == 0) and (jTrigger >= jZero):
                        jZero = j
                    k = (k + n - 1) % n
                j = min(j, jZero)
                # num_measures_to_flush = range_limit(j, num_measures_to_flush, n);
                num_measures_to_flush = min(max(num_measures_to_flush, j), n)
            # end if has_triggered
        # end for channel
        measures_index += 1
        measures_index = measures_index % measures_len
        if boot_count >= 0:
            boot_count = -1 if boot_count == boot_count_max else boot_count + 1

        if has_triggered:
            flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns
            break
    # end for window
    if not has_triggered:
        # No voice detected anywhere: return an empty (zero-length) signal.
        return waveform[..., :0].view(shape[:-1] + torch.Size([0]))

    # Keep everything from the computed trim point onwards.
    res = waveform[:, pos - samplesLen_ns + flushedLen_ns :]
    # unpack batch
    return res.view(shape[:-1] + res.shape[-1:])
|