diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bff3e84c92533c81f8656b0ec6e02e28c2eb2ac8
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab2b84f781a5805ca2a9617e4f28512a4e7762a2
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b0b60c89c44732b8c5df488636203517cc7e059
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3057100b71db367709ab45d20946a4b5b469b45
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30e518b4ddd4c0c5bff7c9ca7a11dcf926f8d497
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27b60e55d23a1e168aa42e5ad206cb37a8ebd241
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab641f63f9a4da8127853556ef9f76e2de0f6f74
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15381271e2e329d524bb660fdc3c7c5b45381b73
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..774ca125462d17c27e0b26513e27711223aec7a1
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..686640240c2e2ea967859cafaae1d886db7aeb67
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4c8f63a1b058cc2e6443955f40db394808d9593
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e245cfcbe7ba5a83692bbd61ab0ef1b5bde04776
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab2a3b414508d7deb429e2cc43ef35e129c7c6f1
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76ea1aa70f35faac6b84b170a70a89fbb0a3f34a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b0ee8536e03db74fef621c929e34c203db55d82
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..703a3c9b8d5ac82e3d4f0e96eff54cf01f8e0324
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ab4677275f9453ed463c14de16103fa59611637
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b22cd5ca6461eeb9d7bcacbaaa839689b0daf30a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2fc2f2b796a4a23edf837db0474ba087683a864c
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c0a4e0f8b09e3c7e003d928a918479591186392
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b25e662f2159370fdbbeef584846a6652000f338
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b39b81d99e78e62b90b957e970c20daf394668f5
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd371dd65f59c160b442ac4dbb7a7df2e9a28dbe
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f04efb03c684f1890a276cbbd46277ff2588a07a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97a6c10d0016dc7528f07d83b15b3a538b9dbb22
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20f227b581f4d62f01a8f7f48ce9c3d0bcda6c12
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..027984b0bc83ed5b36ed28d5b7d8fcd7785212f0
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..421c65dc1f589b47a6d9811f2d45ae02e2beadc4
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..39070d91afe6bcdb6fc31412369dae6964d4b8ba
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..519877130354fe2d7e9476ad24ec698492d0d3d9
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c091329dfebd8011d45c9f8c67c06b3cb1d0d512
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b87d2ac7969aaaf6ac406a3e4e81442e6befef4
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff886c81a72601acf45558dfe8b26281a534d173
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bd82e71bf7d72da7db030381c31f769a3d9736bb
--- /dev/null
+++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:400917cf30e6b664f7b0da93d7c745860d3aa9008da8b7f160d2dd12e6a318b1
+size 22845
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..67fc6c403643c5b4e0624005b7bd99ac59e856fd
--- /dev/null
+++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0244e03291702df45024dcb5cacbc4f3d4cb30d72dfa7fd371c4ac61c42b4fbf
+size 45224
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e5ca977bd8c96a361a162b772ba69d431fc711a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c09334a52431c675f071cda8cb80e9c2bc29d58
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3822dfcbe87f6193460c276007eaabb78c75af5
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8113682b5a49357301713fee035101fd6c96a4bd
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65579b4f01ba09695860717f1e6cd90d6e42b631
--- /dev/null
+++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py
@@ -0,0 +1,5 @@
+from .
import kaldi + +__all__ = [ + "kaldi", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py new file mode 100644 index 0000000000000000000000000000000000000000..12092d90d1dcbd634ece7dc5a0693b9a2aaf0c5f --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py @@ -0,0 +1,815 @@ +import math +from typing import Tuple + +import torch +import torchaudio +from torch import Tensor + +__all__ = [ + "get_mel_banks", + "inverse_mel_scale", + "inverse_mel_scale_scalar", + "mel_scale", + "mel_scale_scalar", + "spectrogram", + "fbank", + "mfcc", + "vtln_warp_freq", + "vtln_warp_mel_freq", +] + +# numeric_limits::epsilon() 1.1920928955078125e-07 +EPSILON = torch.tensor(torch.finfo(torch.float).eps) +# 1 milliseconds = 0.001 seconds +MILLISECONDS_TO_SECONDS = 0.001 + +# window types +HAMMING = "hamming" +HANNING = "hanning" +POVEY = "povey" +RECTANGULAR = "rectangular" +BLACKMAN = "blackman" +WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN] + + +def _get_epsilon(device, dtype): + return EPSILON.to(device=device, dtype=dtype) + + +def _next_power_of_2(x: int) -> int: + r"""Returns the smallest power of 2 that is greater than x""" + return 1 if x == 0 else 2 ** (x - 1).bit_length() + + +def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor: + r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) + representing how the window is shifted along the waveform. Each row is a frame. + + Args: + waveform (Tensor): Tensor of size ``num_samples`` + window_size (int): Frame length + window_shift (int): Frame shift + snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. 
+ + Returns: + Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame + """ + assert waveform.dim() == 1 + num_samples = waveform.size(0) + strides = (window_shift * waveform.stride(0), waveform.stride(0)) + + if snip_edges: + if num_samples < window_size: + return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device) + else: + m = 1 + (num_samples - window_size) // window_shift + else: + reversed_waveform = torch.flip(waveform, [0]) + m = (num_samples + (window_shift // 2)) // window_shift + pad = window_size // 2 - window_shift // 2 + pad_right = reversed_waveform + if pad > 0: + # torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect' + # but we want [2, 1, 0, 0, 1, 2] + pad_left = reversed_waveform[-pad:] + waveform = torch.cat((pad_left, waveform, pad_right), dim=0) + else: + # pad is negative so we want to trim the waveform at the front + waveform = torch.cat((waveform[-pad:], pad_right), dim=0) + + sizes = (m, window_size) + return waveform.as_strided(sizes, strides) + + +def _feature_window_function( + window_type: str, + window_size: int, + blackman_coeff: float, + device: torch.device, + dtype: int, +) -> Tensor: + r"""Returns a window function with the given type and size""" + if window_type == HANNING: + return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype) + elif window_type == HAMMING: + return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype) + elif window_type == POVEY: + # like hanning but goes to zero at edges + return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85) + elif window_type == RECTANGULAR: + return torch.ones(window_size, device=device, dtype=dtype) + elif window_type == BLACKMAN: + a = 2 * math.pi / (window_size - 1) + window_function = torch.arange(window_size, device=device, dtype=dtype) + # can't use torch.blackman_window as they use different coefficients + return ( + blackman_coeff + - 0.5 * torch.cos(a * window_function) + + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function) + ).to(device=device, dtype=dtype) + else: + raise Exception("Invalid window type " + window_type) + + +def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor: + r"""Returns the log energy of size (m) for a strided_input (m,*)""" + device, dtype = strided_input.device, strided_input.dtype + log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m) + if energy_floor == 0.0: + return log_energy + return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype)) + + +def _get_waveform_and_window_properties( + waveform: Tensor, + channel: int, + sample_frequency: float, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float, +) -> Tuple[Tensor, int, int, int]: + r"""Gets the waveform and window properties""" + channel = max(channel, 0) + assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0)) + waveform = waveform[channel, :] # size (n) + window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS) + window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS) + padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size + + assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format( + window_size, len(waveform) + ) + assert 0 < window_shift, 
"`window_shift` must be greater than 0" + assert padded_window_size % 2 == 0, ( + "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`" + ) + assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]" + assert sample_frequency > 0, "`sample_frequency` must be greater than zero" + return waveform, window_shift, window_size, padded_window_size + + +def _get_window( + waveform: Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float, +) -> Tuple[Tensor, Tensor]: + r"""Gets a window and its log energy + + Returns: + (Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m) + """ + device, dtype = waveform.device, waveform.dtype + epsilon = _get_epsilon(device, dtype) + + # size (m, window_size) + strided_input = _get_strided(waveform, window_size, window_shift, snip_edges) + + if dither != 0.0: + # Returns a random number strictly between 0 and 1 + x = torch.max(epsilon, torch.rand(strided_input.shape, device=device, dtype=dtype)) + rand_gauss = torch.sqrt(-2 * x.log()) * torch.cos(2 * math.pi * x) + strided_input = strided_input + rand_gauss * dither + + if remove_dc_offset: + # Subtract each row/frame by its mean + row_means = torch.mean(strided_input, dim=1).unsqueeze(1) # size (m, 1) + strided_input = strided_input - row_means + + if raw_energy: + # Compute the log energy of each row/frame before applying preemphasis and + # window function + signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m) + + if preemphasis_coefficient != 0.0: + # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j + offset_strided_input = torch.nn.functional.pad(strided_input.unsqueeze(0), (1, 0), mode="replicate").squeeze( + 0 + ) # size (m, window_size + 1) + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1] + + # Apply window_function to each row/frame + window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze( + 0 + ) # size (1, window_size) + strided_input = strided_input * window_function # size (m, window_size) + + # Pad columns with zero until we reach size (m, padded_window_size) + if padded_window_size != window_size: + padding_right = padded_window_size - window_size + strided_input = torch.nn.functional.pad( + strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0 + ).squeeze(0) + + # Compute energy after window function (not the raw one) + if not raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m) + + return strided_input, signal_log_energy + + +def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: + # subtracts the column mean of the tensor size (m, n) if subtract_mean=True + # it returns size (m, n) + if subtract_mean: + col_means = torch.mean(tensor, dim=0).unsqueeze(0) + tensor = tensor - col_means + return tensor + + +def spectrogram( + waveform: Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + min_duration: float = 0.0, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + 
remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + window_type: str = POVEY, +) -> Tensor: + r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's + compute-spectrogram-feats. + + Args: + waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``) + channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``) + energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits::epsilon(). (Default: ``1.0``) + frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``) + min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') + (Default: ``'povey'``) + + Returns: + Tensor: A spectrogram identical to what Kaldi would output. 
The shape is + (m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided + """ + device, dtype = waveform.device, waveform.dtype + epsilon = _get_epsilon(device, dtype) + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient + ) + + if len(waveform) < min_duration * sample_frequency: + # signal is too short + return torch.empty(0) + + strided_input, signal_log_energy = _get_window( + waveform, + padded_window_size, + window_size, + window_shift, + window_type, + blackman_coeff, + snip_edges, + raw_energy, + energy_floor, + dither, + remove_dc_offset, + preemphasis_coefficient, + ) + + # size (m, padded_window_size // 2 + 1, 2) + fft = torch.fft.rfft(strided_input) + + # Convert the FFT into a power spectrum + power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log() # size (m, padded_window_size // 2 + 1) + power_spectrum[:, 0] = signal_log_energy + + power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) + return power_spectrum + + +def inverse_mel_scale_scalar(mel_freq: float) -> float: + return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) + + +def inverse_mel_scale(mel_freq: Tensor) -> Tensor: + return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) + + +def mel_scale_scalar(freq: float) -> float: + return 1127.0 * math.log(1.0 + freq / 700.0) + + +def mel_scale(freq: Tensor) -> Tensor: + return 1127.0 * (1.0 + freq / 700.0).log() + + +def vtln_warp_freq( + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: Tensor, +) -> Tensor: + r"""This computes a VTLN warping function that is not the same as HTK's one, + but has similar inputs (this function has the advantage of never producing + empty bins). + + This function computes a warp function F(freq), defined between low_freq + and high_freq inclusive, with the following properties: + F(low_freq) == low_freq + F(high_freq) == high_freq + The function is continuous and piecewise linear with two inflection + points. + The lower inflection point (measured in terms of the unwarped + frequency) is at frequency l, determined as described below. + The higher inflection point is at a frequency h, determined as + described below. + If l <= f <= h, then F(f) = f/vtln_warp_factor. + If the higher inflection point (measured in terms of the unwarped + frequency) is at h, then max(h, F(h)) == vtln_high_cutoff. + Since (by the last point) F(h) == h/vtln_warp_factor, then + max(h, h/vtln_warp_factor) == vtln_high_cutoff, so + h = vtln_high_cutoff / max(1, 1/vtln_warp_factor). + = vtln_high_cutoff * min(1, vtln_warp_factor). 
+ If the lower inflection point (measured in terms of the unwarped + frequency) is at l, then min(l, F(l)) == vtln_low_cutoff + This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor) + = vtln_low_cutoff * max(1, vtln_warp_factor) + Args: + vtln_low_cutoff (float): Lower frequency cutoffs for VTLN + vtln_high_cutoff (float): Upper frequency cutoffs for VTLN + low_freq (float): Lower frequency cutoffs in mel computation + high_freq (float): Upper frequency cutoffs in mel computation + vtln_warp_factor (float): Vtln warp factor + freq (Tensor): given frequency in Hz + + Returns: + Tensor: Freq after vtln warp + """ + assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq" + assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]" + l = vtln_low_cutoff * max(1.0, vtln_warp_factor) + h = vtln_high_cutoff * min(1.0, vtln_warp_factor) + scale = 1.0 / vtln_warp_factor + Fl = scale * l # F(l) + Fh = scale * h # F(h) + assert l > low_freq and h < high_freq + # slope of left part of the 3-piece linear function + scale_left = (Fl - low_freq) / (l - low_freq) + # [slope of center part is just "scale"] + + # slope of right part of the 3-piece linear function + scale_right = (high_freq - Fh) / (high_freq - h) + + res = torch.empty_like(freq) + + outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq + before_l = torch.lt(freq, l) # freq < l + before_h = torch.lt(freq, h) # freq < h + after_h = torch.ge(freq, h) # freq >= h + + # order of operations matter here (since there is overlapping frequency regions) + res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) + res[before_h] = scale * freq[before_h] + res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) + res[outside_low_high_freq] = freq[outside_low_high_freq] + + return res + + +def vtln_warp_mel_freq( + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: Tensor, +) -> Tensor: + r""" + Args: + vtln_low_cutoff (float): Lower frequency cutoffs for VTLN + vtln_high_cutoff (float): Upper frequency cutoffs for VTLN + low_freq (float): Lower frequency cutoffs in mel computation + high_freq (float): Upper frequency cutoffs in mel computation + vtln_warp_factor (float): Vtln warp factor + mel_freq (Tensor): Given frequency in Mel + + Returns: + Tensor: ``mel_freq`` after vtln warp + """ + return mel_scale( + vtln_warp_freq( + vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq) + ) + ) + + +def get_mel_banks( + num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float, +) -> Tuple[Tensor, Tensor]: + """ + Returns: + (Tensor, Tensor): The tuple consists of ``bins`` (which is + melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is + center frequencies of bins of size (``num_bins``)). + """ + assert num_bins > 3, "Must have at least 3 mel bins" + assert window_length_padded % 2 == 0 + num_fft_bins = window_length_padded / 2 + nyquist = 0.5 * sample_freq + + if high_freq <= 0.0: + high_freq += nyquist + + assert ( + (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq) + ), "Bad values in options: low-freq {} and high-freq {} vs. 
nyquist {}".format(low_freq, high_freq, nyquist) + + # fft-bin width [think of it as Nyquist-freq / half-window-length] + fft_bin_width = sample_freq / window_length_padded + mel_low_freq = mel_scale_scalar(low_freq) + mel_high_freq = mel_scale_scalar(high_freq) + + # divide by num_bins+1 in next line because of end-effects where the bins + # spread out to the sides. + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + if vtln_high < 0.0: + vtln_high += nyquist + + assert vtln_warp_factor == 1.0 or ( + (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high) + ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format( + vtln_low, vtln_high, low_freq, high_freq + ) + + bin = torch.arange(num_bins).unsqueeze(1) + left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # size(num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1) + + if vtln_warp_factor != 1.0: + left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel) + center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel) + right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel) + + center_freqs = inverse_mel_scale(center_mel) # size (num_bins) + # size(1, num_fft_bins) + mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0) + + # size (num_bins, num_fft_bins) + up_slope = (mel - left_mel) / (center_mel - left_mel) + down_slope = (right_mel - mel) / (right_mel - center_mel) + + if vtln_warp_factor == 1.0: + # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values + bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope)) + else: + # warping can move the order of left_mel, center_mel, right_mel anywhere + bins = torch.zeros_like(up_slope) + up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel) # left_mel < mel <= center_mel + down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel) # center_mel < mel < right_mel + bins[up_idx] = up_slope[up_idx] + bins[down_idx] = down_slope[down_idx] + + return bins, center_freqs + + +def fbank( + waveform: Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + use_log_fbank: bool = True, + use_power: bool = True, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> Tensor: + r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's + compute-fbank-feats. + + Args: + waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. 
(Default: ``0.42``) + channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``) + energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits::epsilon(). (Default: ``1.0``) + frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``) + high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) + (Default: ``0.0``) + htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features + (need to change other parameters). (Default: ``False``) + low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``) + min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``) + preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``) + use_log_fbank (bool, optional):If true, produce log-filterbank, else produce linear. (Default: ``True``) + use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``) + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if + negative, offset from high-mel-freq (Default: ``-500.0``) + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``) + vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``) + window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') + (Default: ``'povey'``) + + Returns: + Tensor: A fbank identical to what Kaldi would output. 
The shape is (m, ``num_mel_bins + use_energy``) + where m is calculated in _get_strided + """ + device, dtype = waveform.device, waveform.dtype + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient + ) + + if len(waveform) < min_duration * sample_frequency: + # signal is too short + return torch.empty(0, device=device, dtype=dtype) + + # strided_input, size (m, padded_window_size) and signal_log_energy, size (m) + strided_input, signal_log_energy = _get_window( + waveform, + padded_window_size, + window_size, + window_shift, + window_type, + blackman_coeff, + snip_edges, + raw_energy, + energy_floor, + dither, + remove_dc_offset, + preemphasis_coefficient, + ) + + # size (m, padded_window_size // 2 + 1) + spectrum = torch.fft.rfft(strided_input).abs() + if use_power: + spectrum = spectrum.pow(2.0) + + # size (num_mel_bins, padded_window_size // 2) + mel_energies, _ = get_mel_banks( + num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp + ) + mel_energies = mel_energies.to(device=device, dtype=dtype) + + # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1) + mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0) + + # sum with mel fiterbanks over the power spectrum, size (m, num_mel_bins) + mel_energies = torch.mm(spectrum, mel_energies.T) + if use_log_fbank: + # avoid log of zero (which should be prevented anyway by dithering) + mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log() + + # if use_energy then add it as the last column for htk_compat == true else first column + if use_energy: + signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1) + # returns size (m, num_mel_bins + 1) + if htk_compat: + mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1) + else: + mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1) + + mel_energies = _subtract_column_mean(mel_energies, subtract_mean) + return mel_energies + + +def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor: + # returns a dct matrix of size (num_mel_bins, num_ceps) + # size (num_mel_bins, num_mel_bins) + dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho") + # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins) + # this would be the first column in the dct_matrix for torchaudio as it expects a + # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi + # expects a left multiply e.g. dct_matrix * vector). + dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins)) + dct_matrix = dct_matrix[:, :num_ceps] + return dct_matrix + + +def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor: + # returns size (num_ceps) + # Compute liftering coefficients (scaling on cepstral coeffs) + # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected. 
+ i = torch.arange(num_ceps) + return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter) + + +def mfcc( + waveform: Tensor, + blackman_coeff: float = 0.42, + cepstral_lifter: float = 22.0, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + num_ceps: int = 13, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> Tensor: + r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's + compute-mfcc-feats. + + Args: + waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``) + cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``) + channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``) + energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits::epsilon(). (Default: ``1.0``) + frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``) + high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) + (Default: ``0.0``) + htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible + features (need to change other parameters). (Default: ``False``) + low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``) + num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``) + min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``) + preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. 
If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``) + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if + negative, offset from high-mel-freq (Default: ``-500.0``) + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``) + vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``) + window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') + (Default: ``"povey"``) + + Returns: + Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``) + where m is calculated in _get_strided + """ + assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins) + + device, dtype = waveform.device, waveform.dtype + + # The mel_energies should not be squared (use_power=True), not have mean subtracted + # (subtract_mean=False), and use log (use_log_fbank=True). + # size (m, num_mel_bins + use_energy) + feature = fbank( + waveform=waveform, + blackman_coeff=blackman_coeff, + channel=channel, + dither=dither, + energy_floor=energy_floor, + frame_length=frame_length, + frame_shift=frame_shift, + high_freq=high_freq, + htk_compat=htk_compat, + low_freq=low_freq, + min_duration=min_duration, + num_mel_bins=num_mel_bins, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + round_to_power_of_two=round_to_power_of_two, + sample_frequency=sample_frequency, + snip_edges=snip_edges, + subtract_mean=False, + use_energy=use_energy, + use_log_fbank=True, + use_power=True, + vtln_high=vtln_high, + vtln_low=vtln_low, + vtln_warp=vtln_warp, + window_type=window_type, + ) + + if use_energy: + # size (m) + signal_log_energy = feature[:, num_mel_bins if htk_compat else 0] + # offset is 0 if htk_compat==True else 1 + mel_offset = int(not htk_compat) + feature = feature[:, mel_offset : (num_mel_bins + mel_offset)] + + # size (num_mel_bins, num_ceps) + dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device) + + # size (m, num_ceps) + feature = feature.matmul(dct_matrix) + + if cepstral_lifter != 0.0: + # size (1, num_ceps) + lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0) + feature *= lifter_coeffs.to(device=device, dtype=dtype) + + # if use_energy then replace the last column for htk_compat == true else first column + if use_energy: + feature[:, 0] = signal_log_energy + + if htk_compat: + energy = feature[:, 0].unsqueeze(1) # size (m, 1) + feature = feature[:, 1:] # size (m, num_ceps - 1) + if not use_energy: + # scale on C0 (actually removing a scale we previously added that's + # part of one common definition of the cosine transform.) 
+ energy *= math.sqrt(2) + + feature = torch.cat((feature, energy), dim=1) + + feature = _subtract_column_mean(feature, subtract_mean) + return feature diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5946e809653c916bbee7cfad330ed50cefe3447 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py @@ -0,0 +1,34 @@ +from .cmuarctic import CMUARCTIC +from .cmudict import CMUDict +from .commonvoice import COMMONVOICE +from .dr_vctk import DR_VCTK +from .gtzan import GTZAN +from .librilight_limited import LibriLightLimited +from .librimix import LibriMix +from .librispeech import LIBRISPEECH +from .libritts import LIBRITTS +from .ljspeech import LJSPEECH +from .quesst14 import QUESST14 +from .speechcommands import SPEECHCOMMANDS +from .tedlium import TEDLIUM +from .vctk import VCTK_092 +from .yesno import YESNO + + +__all__ = [ + "COMMONVOICE", + "LIBRISPEECH", + "LibriLightLimited", + "SPEECHCOMMANDS", + "VCTK_092", + "DR_VCTK", + "YESNO", + "LJSPEECH", + "GTZAN", + "CMUARCTIC", + "CMUDict", + "LibriMix", + "LIBRITTS", + "TEDLIUM", + "QUESST14", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py new file mode 100644 index 0000000000000000000000000000000000000000..6a1227b0151f28fe3cf82e54883ffef5e15a21b6 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py @@ -0,0 +1,148 @@ +import csv +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "aew" +FOLDER_IN_ARCHIVE = "ARCTIC" +_CHECKSUMS = { + "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2": "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2": "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2": "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2": "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2": "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2": "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2": "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2": "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2": "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2", # noqa: E501 + 
"http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2": "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2": "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2": "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2": "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2": "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2": "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2": "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2": "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2": "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea", # noqa: E501 +} + + +def load_cmuarctic_item(line: str, path: str, folder_audio: str, ext_audio: str) -> Tuple[Tensor, int, str, str]: + + utterance_id, transcript = line[0].strip().split(" ", 2)[1:] + + # Remove space, double quote, and single parenthesis from transcript + transcript = transcript[1:-3] + + file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio) + + # Load audio + waveform, sample_rate = torchaudio.load(file_audio) + + return (waveform, sample_rate, transcript, utterance_id.split("_")[1]) + + +class CMUARCTIC(Dataset): + """Create a Dataset for *CMU ARCTIC* [:footcite:`Kominek03cmuarctic`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): + The URL to download the dataset from or the type of the dataset to download. + (default: ``"aew"``) + Allowed type values are ``"aew"``, ``"ahw"``, ``"aup"``, ``"awb"``, ``"axb"``, ``"bdl"``, + ``"clb"``, ``"eey"``, ``"fem"``, ``"gka"``, ``"jmk"``, ``"ksp"``, ``"ljm"``, ``"lnh"``, + ``"rms"``, ``"rxr"``, ``"slp"`` or ``"slt"``. + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"ARCTIC"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ """ + + _file_text = "txt.done.data" + _folder_text = "etc" + _ext_audio = ".wav" + _folder_audio = "wav" + + def __init__( + self, root: Union[str, Path], url: str = URL, folder_in_archive: str = FOLDER_IN_ARCHIVE, download: bool = False + ) -> None: + + if url in [ + "aew", + "ahw", + "aup", + "awb", + "axb", + "bdl", + "clb", + "eey", + "fem", + "gka", + "jmk", + "ksp", + "ljm", + "lnh", + "rms", + "rxr", + "slp", + "slt", + ]: + + url = "cmu_us_" + url + "_arctic" + ext_archive = ".tar.bz2" + base_url = "http://www.festvox.org/cmu_arctic/packed/" + + url = os.path.join(base_url, url + ext_archive) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + root = os.path.join(root, folder_in_archive) + if not os.path.isdir(root): + os.mkdir(root) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + + self._path = os.path.join(root, basename) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + self._text = os.path.join(self._path, self._folder_text, self._file_text) + + with open(self._text, "r") as text: + walker = csv.reader(text, delimiter="\n") + self._walker = list(walker) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str): ``(waveform, sample_rate, transcript, utterance_id)`` + """ + line = self._walker[n] + return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py new file mode 100644 index 0000000000000000000000000000000000000000..cd17153409f813997a63eef42aea0f61165c72cc --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py @@ -0,0 +1,183 @@ +import os +import re +from pathlib import Path +from typing import Iterable, List, Tuple, Union + +from torch.hub import download_url_to_file +from torch.utils.data import Dataset + +_CHECKSUMS = { + "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4", # noqa: E501 + "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027", # noqa: E501 +} +_PUNCTUATIONS = set( + [ + "!EXCLAMATION-POINT", + '"CLOSE-QUOTE', + '"DOUBLE-QUOTE', + '"END-OF-QUOTE', + '"END-QUOTE', + '"IN-QUOTES', + '"QUOTE', + '"UNQUOTE', + "#HASH-MARK", + "#POUND-SIGN", + "#SHARP-SIGN", + "%PERCENT", + "&ERSAND", + "'END-INNER-QUOTE", + "'END-QUOTE", + "'INNER-QUOTE", + "'QUOTE", + "'SINGLE-QUOTE", + "(BEGIN-PARENS", + "(IN-PARENTHESES", + "(LEFT-PAREN", + "(OPEN-PARENTHESES", + "(PAREN", + "(PARENS", + "(PARENTHESES", + ")CLOSE-PAREN", + ")CLOSE-PARENTHESES", + ")END-PAREN", + ")END-PARENS", + ")END-PARENTHESES", + ")END-THE-PAREN", + ")PAREN", + 
")PARENS", + ")RIGHT-PAREN", + ")UN-PARENTHESES", + "+PLUS", + ",COMMA", + "--DASH", + "-DASH", + "-HYPHEN", + "...ELLIPSIS", + ".DECIMAL", + ".DOT", + ".FULL-STOP", + ".PERIOD", + ".POINT", + "/SLASH", + ":COLON", + ";SEMI-COLON", + ";SEMI-COLON(1)", + "?QUESTION-MARK", + "{BRACE", + "{LEFT-BRACE", + "{OPEN-BRACE", + "}CLOSE-BRACE", + "}RIGHT-BRACE", + ] +) + + +def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[str]: + _alt_re = re.compile(r"\([0-9]+\)") + cmudict: List[Tuple[str, List[str]]] = list() + for line in lines: + if not line or line.startswith(";;;"): # ignore comments + continue + + word, phones = line.strip().split(" ") + if word in _PUNCTUATIONS: + if exclude_punctuations: + continue + # !EXCLAMATION-POINT -> ! + # --DASH -> -- + # ...ELLIPSIS -> ... + if word.startswith("..."): + word = "..." + elif word.startswith("--"): + word = "--" + else: + word = word[0] + + # if a word have multiple pronunciations, there will be (number) appended to it + # for example, DATAPOINTS and DATAPOINTS(1), + # the regular expression `_alt_re` removes the '(1)' and change the word DATAPOINTS(1) to DATAPOINTS + word = re.sub(_alt_re, "", word) + phones = phones.split(" ") + cmudict.append((word, phones)) + + return cmudict + + +class CMUDict(Dataset): + """Create a Dataset for *CMU Pronouncing Dictionary* [:footcite:`cmudict`] (CMUDict). + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + exclude_punctuations (bool, optional): + When enabled, exclude the pronounciation of punctuations, such as + `!EXCLAMATION-POINT` and `#HASH-MARK`. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + url (str, optional): + The URL to download the dictionary from. + (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b"``) + url_symbols (str, optional): + The URL to download the list of symbols from. + (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols"``) + """ + + def __init__( + self, + root: Union[str, Path], + exclude_punctuations: bool = True, + *, + download: bool = False, + url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b", + url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols", + ) -> None: + + self.exclude_punctuations = exclude_punctuations + + self._root_path = Path(root) + if not os.path.isdir(self._root_path): + raise RuntimeError(f"The root directory does not exist; {root}") + + dict_file = self._root_path / os.path.basename(url) + symbol_file = self._root_path / os.path.basename(url_symbols) + if not os.path.exists(dict_file): + if not download: + raise RuntimeError( + "The dictionary file is not found in the following location. " + f"Set `download=True` to download it. {dict_file}" + ) + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, dict_file, checksum) + if not os.path.exists(symbol_file): + if not download: + raise RuntimeError( + "The symbol file is not found in the following location. " + f"Set `download=True` to download it. 
{symbol_file}" + ) + checksum = _CHECKSUMS.get(url_symbols, None) + download_url_to_file(url_symbols, symbol_file, checksum) + + with open(symbol_file, "r") as text: + self._symbols = [line.strip() for line in text.readlines()] + + with open(dict_file, "r", encoding="latin-1") as text: + self._dictionary = _parse_dictionary(text.readlines(), exclude_punctuations=self.exclude_punctuations) + + def __getitem__(self, n: int) -> Tuple[str, List[str]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded. + + Returns: + (str, List[str]): The corresponding word and phonemes ``(word, [phonemes])``. + + """ + return self._dictionary[n] + + def __len__(self) -> int: + return len(self._dictionary) + + @property + def symbols(self) -> List[str]: + """list[str]: A list of phonemes symbols, such as `AA`, `AE`, `AH`.""" + return self._symbols.copy() diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py new file mode 100644 index 0000000000000000000000000000000000000000..29ad5b7e8d09a110a8a957c38c04bfba180bbce4 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py @@ -0,0 +1,71 @@ +import csv +import os +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.utils.data import Dataset + + +def load_commonvoice_item( + line: List[str], header: List[str], path: str, folder_audio: str, ext_audio: str +) -> Tuple[Tensor, int, Dict[str, str]]: + # Each line as the following data: + # client_id, path, sentence, up_votes, down_votes, age, gender, accent + + assert header[1] == "path" + fileid = line[1] + filename = os.path.join(path, folder_audio, fileid) + if not filename.endswith(ext_audio): + filename += ext_audio + waveform, sample_rate = torchaudio.load(filename) + + dic = dict(zip(header, line)) + + return waveform, sample_rate, dic + + +class COMMONVOICE(Dataset): + """Create a Dataset for *CommonVoice* [:footcite:`ardila2020common`]. + + Args: + root (str or Path): Path to the directory where the dataset is located. + (Where the ``tsv`` file is present.) + tsv (str, optional): + The name of the tsv file used to construct the metadata, such as + ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``, + ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``) + """ + + _ext_txt = ".txt" + _ext_audio = ".mp3" + _folder_audio = "clips" + + def __init__(self, root: Union[str, Path], tsv: str = "train.tsv") -> None: + + # Get string representation of 'root' in case Path object is passed + self._path = os.fspath(root) + self._tsv = os.path.join(self._path, tsv) + + with open(self._tsv, "r") as tsv_: + walker = csv.reader(tsv_, delimiter="\t") + self._header = next(walker) + self._walker = list(walker) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, Dict[str, str]): ``(waveform, sample_rate, dictionary)``, where dictionary + is built from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``, + ``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``. 
+ """ + line = self._walker[n] + return load_commonvoice_item(line, self._header, self._path, self._folder_audio, self._ext_audio) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py new file mode 100644 index 0000000000000000000000000000000000000000..be865b61b2528f46aa2a014d4a9ab54672756fe3 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py @@ -0,0 +1,106 @@ +from pathlib import Path +from typing import Dict, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip" +_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769" +_SUPPORTED_SUBSETS = {"train", "test"} + + +class DR_VCTK(Dataset): + """Create a dataset for *Device Recorded VCTK (Small subset version)* [:footcite:`Sarfjoo2018DeviceRV`]. + + Args: + root (str or Path): Root directory where the dataset's top level directory is found. + subset (str): The subset to use. Can be one of ``"train"`` and ``"test"``. (default: ``"train"``). + download (bool): + Whether to download the dataset if it is not found at root path. (default: ``False``). + url (str): The URL to download the dataset from. + (default: ``"https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"``) + """ + + def __init__( + self, + root: Union[str, Path], + subset: str = "train", + *, + download: bool = False, + url: str = _URL, + ) -> None: + if subset not in _SUPPORTED_SUBSETS: + raise RuntimeError( + f"The subset '{subset}' does not match any of the supported subsets: {_SUPPORTED_SUBSETS}" + ) + + root = Path(root).expanduser() + archive = root / "DR-VCTK.zip" + + self._subset = subset + self._path = root / "DR-VCTK" / "DR-VCTK" + self._clean_audio_dir = self._path / f"clean_{self._subset}set_wav_16k" + self._noisy_audio_dir = self._path / f"device-recorded_{self._subset}set_wav_16k" + self._config_filepath = self._path / "configurations" / f"{self._subset}_ch_log.txt" + + if not self._path.is_dir(): + if not archive.is_file(): + if not download: + raise RuntimeError("Dataset not found. 
Please use `download=True` to download it.") + download_url_to_file(url, archive, hash_prefix=_CHECKSUM) + extract_archive(archive, root) + + self._config = self._load_config(self._config_filepath) + self._filename_list = sorted(self._config) + + def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]: + # Skip header + skip_rows = 2 if self._subset == "train" else 1 + + config = {} + with open(filepath) as f: + for i, line in enumerate(f): + if i < skip_rows or not line: + continue + filename, source, channel_id = line.strip().split("\t") + config[filename] = (source, int(channel_id)) + return config + + def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]: + speaker_id, utterance_id = filename.split(".")[0].split("_") + source, channel_id = self._config[filename] + file_clean_audio = self._clean_audio_dir / filename + file_noisy_audio = self._noisy_audio_dir / filename + waveform_clean, sample_rate_clean = torchaudio.load(file_clean_audio) + waveform_noisy, sample_rate_noisy = torchaudio.load(file_noisy_audio) + return ( + waveform_clean, + sample_rate_clean, + waveform_noisy, + sample_rate_noisy, + speaker_id, + utterance_id, + source, + channel_id, + ) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, Tensor, int, str, str, str, int): + ``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id,\ + utterance_id, source, channel_id)`` + """ + filename = self._filename_list[n] + return self._load_dr_vctk_item(filename) + + def __len__(self) -> int: + return len(self._filename_list) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py new file mode 100644 index 0000000000000000000000000000000000000000..6d087ea5ec401eabe80a1a9605def5a7a96bc961 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py @@ -0,0 +1,1108 @@ +import os +from pathlib import Path +from typing import Optional, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +# The following lists prefixed with `filtered_` provide a filtered split +# that: +# +# a. Mitigate a known issue with GTZAN (duplication) +# +# b. Provide a standard split for testing it against other +# methods (e.g. the one in jordipons/sklearn-audio-transfer-learning). +# +# Those are used when GTZAN is initialised with the `filtered` keyword. +# The split was taken from (github) jordipons/sklearn-audio-transfer-learning. 
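+#
+# A minimal usage sketch of the GTZAN class defined below (paths are placeholders;
+# the filtered training split is selected via the `subset` keyword):
+#
+#   dataset = GTZAN("./data", download=True, subset="training")
+#   waveform, sample_rate, genre = dataset[0]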
+ +gtzan_genres = [ + "blues", + "classical", + "country", + "disco", + "hiphop", + "jazz", + "metal", + "pop", + "reggae", + "rock", +] + +filtered_test = [ + "blues.00012", + "blues.00013", + "blues.00014", + "blues.00015", + "blues.00016", + "blues.00017", + "blues.00018", + "blues.00019", + "blues.00020", + "blues.00021", + "blues.00022", + "blues.00023", + "blues.00024", + "blues.00025", + "blues.00026", + "blues.00027", + "blues.00028", + "blues.00061", + "blues.00062", + "blues.00063", + "blues.00064", + "blues.00065", + "blues.00066", + "blues.00067", + "blues.00068", + "blues.00069", + "blues.00070", + "blues.00071", + "blues.00072", + "blues.00098", + "blues.00099", + "classical.00011", + "classical.00012", + "classical.00013", + "classical.00014", + "classical.00015", + "classical.00016", + "classical.00017", + "classical.00018", + "classical.00019", + "classical.00020", + "classical.00021", + "classical.00022", + "classical.00023", + "classical.00024", + "classical.00025", + "classical.00026", + "classical.00027", + "classical.00028", + "classical.00029", + "classical.00034", + "classical.00035", + "classical.00036", + "classical.00037", + "classical.00038", + "classical.00039", + "classical.00040", + "classical.00041", + "classical.00049", + "classical.00077", + "classical.00078", + "classical.00079", + "country.00030", + "country.00031", + "country.00032", + "country.00033", + "country.00034", + "country.00035", + "country.00036", + "country.00037", + "country.00038", + "country.00039", + "country.00040", + "country.00043", + "country.00044", + "country.00046", + "country.00047", + "country.00048", + "country.00050", + "country.00051", + "country.00053", + "country.00054", + "country.00055", + "country.00056", + "country.00057", + "country.00058", + "country.00059", + "country.00060", + "country.00061", + "country.00062", + "country.00063", + "country.00064", + "disco.00001", + "disco.00021", + "disco.00058", + "disco.00062", + "disco.00063", + "disco.00064", + "disco.00065", + "disco.00066", + "disco.00069", + "disco.00076", + "disco.00077", + "disco.00078", + "disco.00079", + "disco.00080", + "disco.00081", + "disco.00082", + "disco.00083", + "disco.00084", + "disco.00085", + "disco.00086", + "disco.00087", + "disco.00088", + "disco.00091", + "disco.00092", + "disco.00093", + "disco.00094", + "disco.00096", + "disco.00097", + "disco.00099", + "hiphop.00000", + "hiphop.00026", + "hiphop.00027", + "hiphop.00030", + "hiphop.00040", + "hiphop.00043", + "hiphop.00044", + "hiphop.00045", + "hiphop.00051", + "hiphop.00052", + "hiphop.00053", + "hiphop.00054", + "hiphop.00062", + "hiphop.00063", + "hiphop.00064", + "hiphop.00065", + "hiphop.00066", + "hiphop.00067", + "hiphop.00068", + "hiphop.00069", + "hiphop.00070", + "hiphop.00071", + "hiphop.00072", + "hiphop.00073", + "hiphop.00074", + "hiphop.00075", + "hiphop.00099", + "jazz.00073", + "jazz.00074", + "jazz.00075", + "jazz.00076", + "jazz.00077", + "jazz.00078", + "jazz.00079", + "jazz.00080", + "jazz.00081", + "jazz.00082", + "jazz.00083", + "jazz.00084", + "jazz.00085", + "jazz.00086", + "jazz.00087", + "jazz.00088", + "jazz.00089", + "jazz.00090", + "jazz.00091", + "jazz.00092", + "jazz.00093", + "jazz.00094", + "jazz.00095", + "jazz.00096", + "jazz.00097", + "jazz.00098", + "jazz.00099", + "metal.00012", + "metal.00013", + "metal.00014", + "metal.00015", + "metal.00022", + "metal.00023", + "metal.00025", + "metal.00026", + "metal.00027", + "metal.00028", + "metal.00029", + "metal.00030", + "metal.00031", + "metal.00032", 
+ "metal.00033", + "metal.00038", + "metal.00039", + "metal.00067", + "metal.00070", + "metal.00073", + "metal.00074", + "metal.00075", + "metal.00078", + "metal.00083", + "metal.00085", + "metal.00087", + "metal.00088", + "pop.00000", + "pop.00001", + "pop.00013", + "pop.00014", + "pop.00043", + "pop.00063", + "pop.00064", + "pop.00065", + "pop.00066", + "pop.00069", + "pop.00070", + "pop.00071", + "pop.00072", + "pop.00073", + "pop.00074", + "pop.00075", + "pop.00076", + "pop.00077", + "pop.00078", + "pop.00079", + "pop.00082", + "pop.00088", + "pop.00089", + "pop.00090", + "pop.00091", + "pop.00092", + "pop.00093", + "pop.00094", + "pop.00095", + "pop.00096", + "reggae.00034", + "reggae.00035", + "reggae.00036", + "reggae.00037", + "reggae.00038", + "reggae.00039", + "reggae.00040", + "reggae.00046", + "reggae.00047", + "reggae.00048", + "reggae.00052", + "reggae.00053", + "reggae.00064", + "reggae.00065", + "reggae.00066", + "reggae.00067", + "reggae.00068", + "reggae.00071", + "reggae.00079", + "reggae.00082", + "reggae.00083", + "reggae.00084", + "reggae.00087", + "reggae.00088", + "reggae.00089", + "reggae.00090", + "rock.00010", + "rock.00011", + "rock.00012", + "rock.00013", + "rock.00014", + "rock.00015", + "rock.00027", + "rock.00028", + "rock.00029", + "rock.00030", + "rock.00031", + "rock.00032", + "rock.00033", + "rock.00034", + "rock.00035", + "rock.00036", + "rock.00037", + "rock.00039", + "rock.00040", + "rock.00041", + "rock.00042", + "rock.00043", + "rock.00044", + "rock.00045", + "rock.00046", + "rock.00047", + "rock.00048", + "rock.00086", + "rock.00087", + "rock.00088", + "rock.00089", + "rock.00090", +] + +filtered_train = [ + "blues.00029", + "blues.00030", + "blues.00031", + "blues.00032", + "blues.00033", + "blues.00034", + "blues.00035", + "blues.00036", + "blues.00037", + "blues.00038", + "blues.00039", + "blues.00040", + "blues.00041", + "blues.00042", + "blues.00043", + "blues.00044", + "blues.00045", + "blues.00046", + "blues.00047", + "blues.00048", + "blues.00049", + "blues.00073", + "blues.00074", + "blues.00075", + "blues.00076", + "blues.00077", + "blues.00078", + "blues.00079", + "blues.00080", + "blues.00081", + "blues.00082", + "blues.00083", + "blues.00084", + "blues.00085", + "blues.00086", + "blues.00087", + "blues.00088", + "blues.00089", + "blues.00090", + "blues.00091", + "blues.00092", + "blues.00093", + "blues.00094", + "blues.00095", + "blues.00096", + "blues.00097", + "classical.00030", + "classical.00031", + "classical.00032", + "classical.00033", + "classical.00043", + "classical.00044", + "classical.00045", + "classical.00046", + "classical.00047", + "classical.00048", + "classical.00050", + "classical.00051", + "classical.00052", + "classical.00053", + "classical.00054", + "classical.00055", + "classical.00056", + "classical.00057", + "classical.00058", + "classical.00059", + "classical.00060", + "classical.00061", + "classical.00062", + "classical.00063", + "classical.00064", + "classical.00065", + "classical.00066", + "classical.00067", + "classical.00080", + "classical.00081", + "classical.00082", + "classical.00083", + "classical.00084", + "classical.00085", + "classical.00086", + "classical.00087", + "classical.00088", + "classical.00089", + "classical.00090", + "classical.00091", + "classical.00092", + "classical.00093", + "classical.00094", + "classical.00095", + "classical.00096", + "classical.00097", + "classical.00098", + "classical.00099", + "country.00019", + "country.00020", + "country.00021", + "country.00022", + 
"country.00023", + "country.00024", + "country.00025", + "country.00026", + "country.00028", + "country.00029", + "country.00065", + "country.00066", + "country.00067", + "country.00068", + "country.00069", + "country.00070", + "country.00071", + "country.00072", + "country.00073", + "country.00074", + "country.00075", + "country.00076", + "country.00077", + "country.00078", + "country.00079", + "country.00080", + "country.00081", + "country.00082", + "country.00083", + "country.00084", + "country.00085", + "country.00086", + "country.00087", + "country.00088", + "country.00089", + "country.00090", + "country.00091", + "country.00092", + "country.00093", + "country.00094", + "country.00095", + "country.00096", + "country.00097", + "country.00098", + "country.00099", + "disco.00005", + "disco.00015", + "disco.00016", + "disco.00017", + "disco.00018", + "disco.00019", + "disco.00020", + "disco.00022", + "disco.00023", + "disco.00024", + "disco.00025", + "disco.00026", + "disco.00027", + "disco.00028", + "disco.00029", + "disco.00030", + "disco.00031", + "disco.00032", + "disco.00033", + "disco.00034", + "disco.00035", + "disco.00036", + "disco.00037", + "disco.00039", + "disco.00040", + "disco.00041", + "disco.00042", + "disco.00043", + "disco.00044", + "disco.00045", + "disco.00047", + "disco.00049", + "disco.00053", + "disco.00054", + "disco.00056", + "disco.00057", + "disco.00059", + "disco.00061", + "disco.00070", + "disco.00073", + "disco.00074", + "disco.00089", + "hiphop.00002", + "hiphop.00003", + "hiphop.00004", + "hiphop.00005", + "hiphop.00006", + "hiphop.00007", + "hiphop.00008", + "hiphop.00009", + "hiphop.00010", + "hiphop.00011", + "hiphop.00012", + "hiphop.00013", + "hiphop.00014", + "hiphop.00015", + "hiphop.00016", + "hiphop.00017", + "hiphop.00018", + "hiphop.00019", + "hiphop.00020", + "hiphop.00021", + "hiphop.00022", + "hiphop.00023", + "hiphop.00024", + "hiphop.00025", + "hiphop.00028", + "hiphop.00029", + "hiphop.00031", + "hiphop.00032", + "hiphop.00033", + "hiphop.00034", + "hiphop.00035", + "hiphop.00036", + "hiphop.00037", + "hiphop.00038", + "hiphop.00041", + "hiphop.00042", + "hiphop.00055", + "hiphop.00056", + "hiphop.00057", + "hiphop.00058", + "hiphop.00059", + "hiphop.00060", + "hiphop.00061", + "hiphop.00077", + "hiphop.00078", + "hiphop.00079", + "hiphop.00080", + "jazz.00000", + "jazz.00001", + "jazz.00011", + "jazz.00012", + "jazz.00013", + "jazz.00014", + "jazz.00015", + "jazz.00016", + "jazz.00017", + "jazz.00018", + "jazz.00019", + "jazz.00020", + "jazz.00021", + "jazz.00022", + "jazz.00023", + "jazz.00024", + "jazz.00041", + "jazz.00047", + "jazz.00048", + "jazz.00049", + "jazz.00050", + "jazz.00051", + "jazz.00052", + "jazz.00053", + "jazz.00054", + "jazz.00055", + "jazz.00056", + "jazz.00057", + "jazz.00058", + "jazz.00059", + "jazz.00060", + "jazz.00061", + "jazz.00062", + "jazz.00063", + "jazz.00064", + "jazz.00065", + "jazz.00066", + "jazz.00067", + "jazz.00068", + "jazz.00069", + "jazz.00070", + "jazz.00071", + "jazz.00072", + "metal.00002", + "metal.00003", + "metal.00005", + "metal.00021", + "metal.00024", + "metal.00035", + "metal.00046", + "metal.00047", + "metal.00048", + "metal.00049", + "metal.00050", + "metal.00051", + "metal.00052", + "metal.00053", + "metal.00054", + "metal.00055", + "metal.00056", + "metal.00057", + "metal.00059", + "metal.00060", + "metal.00061", + "metal.00062", + "metal.00063", + "metal.00064", + "metal.00065", + "metal.00066", + "metal.00069", + "metal.00071", + "metal.00072", + "metal.00079", + "metal.00080", + 
"metal.00084", + "metal.00086", + "metal.00089", + "metal.00090", + "metal.00091", + "metal.00092", + "metal.00093", + "metal.00094", + "metal.00095", + "metal.00096", + "metal.00097", + "metal.00098", + "metal.00099", + "pop.00002", + "pop.00003", + "pop.00004", + "pop.00005", + "pop.00006", + "pop.00007", + "pop.00008", + "pop.00009", + "pop.00011", + "pop.00012", + "pop.00016", + "pop.00017", + "pop.00018", + "pop.00019", + "pop.00020", + "pop.00023", + "pop.00024", + "pop.00025", + "pop.00026", + "pop.00027", + "pop.00028", + "pop.00029", + "pop.00031", + "pop.00032", + "pop.00033", + "pop.00034", + "pop.00035", + "pop.00036", + "pop.00038", + "pop.00039", + "pop.00040", + "pop.00041", + "pop.00042", + "pop.00044", + "pop.00046", + "pop.00049", + "pop.00050", + "pop.00080", + "pop.00097", + "pop.00098", + "pop.00099", + "reggae.00000", + "reggae.00001", + "reggae.00002", + "reggae.00004", + "reggae.00006", + "reggae.00009", + "reggae.00011", + "reggae.00012", + "reggae.00014", + "reggae.00015", + "reggae.00016", + "reggae.00017", + "reggae.00018", + "reggae.00019", + "reggae.00020", + "reggae.00021", + "reggae.00022", + "reggae.00023", + "reggae.00024", + "reggae.00025", + "reggae.00026", + "reggae.00027", + "reggae.00028", + "reggae.00029", + "reggae.00030", + "reggae.00031", + "reggae.00032", + "reggae.00042", + "reggae.00043", + "reggae.00044", + "reggae.00045", + "reggae.00049", + "reggae.00050", + "reggae.00051", + "reggae.00054", + "reggae.00055", + "reggae.00056", + "reggae.00057", + "reggae.00058", + "reggae.00059", + "reggae.00060", + "reggae.00063", + "reggae.00069", + "rock.00000", + "rock.00001", + "rock.00002", + "rock.00003", + "rock.00004", + "rock.00005", + "rock.00006", + "rock.00007", + "rock.00008", + "rock.00009", + "rock.00016", + "rock.00017", + "rock.00018", + "rock.00019", + "rock.00020", + "rock.00021", + "rock.00022", + "rock.00023", + "rock.00024", + "rock.00025", + "rock.00026", + "rock.00057", + "rock.00058", + "rock.00059", + "rock.00060", + "rock.00061", + "rock.00062", + "rock.00063", + "rock.00064", + "rock.00065", + "rock.00066", + "rock.00067", + "rock.00068", + "rock.00069", + "rock.00070", + "rock.00091", + "rock.00092", + "rock.00093", + "rock.00094", + "rock.00095", + "rock.00096", + "rock.00097", + "rock.00098", + "rock.00099", +] + +filtered_valid = [ + "blues.00000", + "blues.00001", + "blues.00002", + "blues.00003", + "blues.00004", + "blues.00005", + "blues.00006", + "blues.00007", + "blues.00008", + "blues.00009", + "blues.00010", + "blues.00011", + "blues.00050", + "blues.00051", + "blues.00052", + "blues.00053", + "blues.00054", + "blues.00055", + "blues.00056", + "blues.00057", + "blues.00058", + "blues.00059", + "blues.00060", + "classical.00000", + "classical.00001", + "classical.00002", + "classical.00003", + "classical.00004", + "classical.00005", + "classical.00006", + "classical.00007", + "classical.00008", + "classical.00009", + "classical.00010", + "classical.00068", + "classical.00069", + "classical.00070", + "classical.00071", + "classical.00072", + "classical.00073", + "classical.00074", + "classical.00075", + "classical.00076", + "country.00000", + "country.00001", + "country.00002", + "country.00003", + "country.00004", + "country.00005", + "country.00006", + "country.00007", + "country.00009", + "country.00010", + "country.00011", + "country.00012", + "country.00013", + "country.00014", + "country.00015", + "country.00016", + "country.00017", + "country.00018", + "country.00027", + "country.00041", + "country.00042", + 
"country.00045", + "country.00049", + "disco.00000", + "disco.00002", + "disco.00003", + "disco.00004", + "disco.00006", + "disco.00007", + "disco.00008", + "disco.00009", + "disco.00010", + "disco.00011", + "disco.00012", + "disco.00013", + "disco.00014", + "disco.00046", + "disco.00048", + "disco.00052", + "disco.00067", + "disco.00068", + "disco.00072", + "disco.00075", + "disco.00090", + "disco.00095", + "hiphop.00081", + "hiphop.00082", + "hiphop.00083", + "hiphop.00084", + "hiphop.00085", + "hiphop.00086", + "hiphop.00087", + "hiphop.00088", + "hiphop.00089", + "hiphop.00090", + "hiphop.00091", + "hiphop.00092", + "hiphop.00093", + "hiphop.00094", + "hiphop.00095", + "hiphop.00096", + "hiphop.00097", + "hiphop.00098", + "jazz.00002", + "jazz.00003", + "jazz.00004", + "jazz.00005", + "jazz.00006", + "jazz.00007", + "jazz.00008", + "jazz.00009", + "jazz.00010", + "jazz.00025", + "jazz.00026", + "jazz.00027", + "jazz.00028", + "jazz.00029", + "jazz.00030", + "jazz.00031", + "jazz.00032", + "metal.00000", + "metal.00001", + "metal.00006", + "metal.00007", + "metal.00008", + "metal.00009", + "metal.00010", + "metal.00011", + "metal.00016", + "metal.00017", + "metal.00018", + "metal.00019", + "metal.00020", + "metal.00036", + "metal.00037", + "metal.00068", + "metal.00076", + "metal.00077", + "metal.00081", + "metal.00082", + "pop.00010", + "pop.00053", + "pop.00055", + "pop.00058", + "pop.00059", + "pop.00060", + "pop.00061", + "pop.00062", + "pop.00081", + "pop.00083", + "pop.00084", + "pop.00085", + "pop.00086", + "reggae.00061", + "reggae.00062", + "reggae.00070", + "reggae.00072", + "reggae.00074", + "reggae.00076", + "reggae.00077", + "reggae.00078", + "reggae.00085", + "reggae.00092", + "reggae.00093", + "reggae.00094", + "reggae.00095", + "reggae.00096", + "reggae.00097", + "reggae.00098", + "reggae.00099", + "rock.00038", + "rock.00049", + "rock.00050", + "rock.00051", + "rock.00052", + "rock.00053", + "rock.00054", + "rock.00055", + "rock.00056", + "rock.00071", + "rock.00072", + "rock.00073", + "rock.00074", + "rock.00075", + "rock.00076", + "rock.00077", + "rock.00078", + "rock.00079", + "rock.00080", + "rock.00081", + "rock.00082", + "rock.00083", + "rock.00084", + "rock.00085", +] + + +URL = "http://opihi.cs.uvic.ca/sound/genres.tar.gz" +FOLDER_IN_ARCHIVE = "genres" +_CHECKSUMS = { + "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "24347e0223d2ba798e0a558c4c172d9d4a19c00bb7963fe055d183dadb4ef2c6" +} + + +def load_gtzan_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, str]: + """ + Loads a file from the dataset and returns the raw waveform + as a Torch Tensor, its sample rate as an integer, and its + genre as a string. + """ + # Filenames are of the form label.id, e.g. blues.00078 + label, _ = fileid.split(".") + + # Read wav + file_audio = os.path.join(path, label, fileid + ext_audio) + waveform, sample_rate = torchaudio.load(file_audio) + + return waveform, sample_rate, label + + +class GTZAN(Dataset): + """Create a Dataset for *GTZAN* [:footcite:`tzanetakis_essl_cook_2001`]. + + Note: + Please see http://marsyas.info/downloads/datasets.html if you are planning to use + this dataset to publish results. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"http://opihi.cs.uvic.ca/sound/genres.tar.gz"``) + folder_in_archive (str, optional): The top-level directory of the dataset. 
+ download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + subset (str or None, optional): Which subset of the dataset to use. + One of ``"training"``, ``"validation"``, ``"testing"`` or ``None``. + If ``None``, the entire dataset is used. (default: ``None``). + """ + + _ext_audio = ".wav" + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + subset: Optional[str] = None, + ) -> None: + + # super(GTZAN, self).__init__() + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + self.root = root + self.url = url + self.folder_in_archive = folder_in_archive + self.download = download + self.subset = subset + + assert subset is None or subset in ["training", "validation", "testing"], ( + "When `subset` not None, it must take a value from " + "{'training', 'validation', 'testing'}." + ) + + archive = os.path.basename(url) + archive = os.path.join(root, archive) + self._path = os.path.join(root, folder_in_archive) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + + if not os.path.isdir(self._path): + raise RuntimeError("Dataset not found. Please use `download=True` to download it.") + + if self.subset is None: + # Check every subdirectory under dataset root + # which has the same name as the genres in + # GTZAN (e.g. `root_dir'/blues/, `root_dir'/rock, etc.) + # This lets users remove or move around song files, + # useful when e.g. they want to use only some of the files + # in a genre or want to label other files with a different + # genre. + self._walker = [] + + root = os.path.expanduser(self._path) + + for directory in gtzan_genres: + fulldir = os.path.join(root, directory) + + if not os.path.exists(fulldir): + continue + + songs_in_genre = os.listdir(fulldir) + songs_in_genre.sort() + for fname in songs_in_genre: + name, ext = os.path.splitext(fname) + if ext.lower() == ".wav" and "." in name: + # Check whether the file is of the form + # `gtzan_genre`.`5 digit number`.wav + genre, num = name.split(".") + if genre in gtzan_genres and len(num) == 5 and num.isdigit(): + self._walker.append(name) + else: + if self.subset == "training": + self._walker = filtered_train + elif self.subset == "validation": + self._walker = filtered_valid + elif self.subset == "testing": + self._walker = filtered_test + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str): ``(waveform, sample_rate, label)`` + """ + fileid = self._walker[n] + item = load_gtzan_item(fileid, self._path, self._ext_audio) + waveform, sample_rate, label = item + return waveform, sample_rate, label + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py new file mode 100644 index 0000000000000000000000000000000000000000..947254479ec83e2bc96525fe824a2201afe1e6ae --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py @@ -0,0 +1,91 @@ +import os +from pathlib import Path +from typing import List, Tuple, Union + +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.librispeech import load_librispeech_item +from torchaudio.datasets.utils import extract_archive + + +_ARCHIVE_NAME = "librispeech_finetuning" +_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" +_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" + + +def _get_fileids_paths(path, subset, _ext_audio) -> List[Tuple[str, str]]: + """Get the file names and the corresponding file paths without `speaker_id` + and `chapter_id` directories. + The format of path is like: + {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or + {root}/{_ARCHIVE_NAME}/9h/[clean, other] + """ + if subset == "10min": + files_paths = [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio) + ] + elif subset in ["1h", "10h"]: + files_paths = [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio) + ] + if subset == "10h": + files_paths += [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("9h/*/*/*/*" + _ext_audio) + ] + else: + raise ValueError(f"Unsupported subset value. Found {subset}.") + files_paths = sorted(files_paths, key=lambda x: x[0] + x[1]) + return files_paths + + +class LibriLightLimited(Dataset): + """Create a Dataset for LibriLightLimited, which is the supervised subset of + LibriLight dataset. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + subset (str, optional): The subset to use. Options: [``10min``, ``1h``, ``10h``] + (Default: ``10min``). + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + _ext_txt = ".trans.txt" + _ext_audio = ".flac" + + def __init__( + self, + root: Union[str, Path], + subset: str = "10min", + download: bool = False, + ) -> None: + assert subset in ["10min", "1h", "10h"], "`subset` must be one of ['10min', '1h', '10h']" + + root = os.fspath(root) + self._path = os.path.join(root, _ARCHIVE_NAME) + archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz") + if not os.path.isdir(self._path): + if not download: + raise RuntimeError("Dataset not found. 
Please use `download=True` to download") + if not os.path.isfile(archive): + download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) + extract_archive(archive) + self._fileids_paths = _get_fileids_paths(self._path, subset, self._ext_audio) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. + Args: + n (int): The index of the sample to be loaded + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`` + """ + file_path, fileid = self._fileids_paths[n] + return load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt) + + def __len__(self) -> int: + return len(self._fileids_paths) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc9496d7732be787023129a33da59623d586693 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py @@ -0,0 +1,85 @@ +from pathlib import Path +from typing import List, Tuple, Union + +import torch +import torchaudio +from torch.utils.data import Dataset + +SampleType = Tuple[int, torch.Tensor, List[torch.Tensor]] + + +class LibriMix(Dataset): + r"""Create the *LibriMix* [:footcite:`cosentino2020librimix`] dataset. + + Args: + root (str or Path): The path to the directory where the directory ``Libri2Mix`` or + ``Libri3Mix`` is stored. + subset (str, optional): The subset to use. Options: [``train-360``, ``train-100``, + ``dev``, and ``test``] (Default: ``train-360``). + num_speakers (int, optional): The number of speakers, which determines the directories + to traverse. The Dataset will traverse ``s1`` to ``sN`` directories to collect + N source audios. (Default: 2) + sample_rate (int, optional): sample rate of audio files. The ``sample_rate`` determines + which subdirectory the audio are fetched. If any of the audio has a different sample + rate, raises ``ValueError``. Options: [8000, 16000] (Default: 8000) + task (str, optional): the task of LibriMix. + Options: [``enh_single``, ``enh_both``, ``sep_clean``, ``sep_noisy``] + (Default: ``sep_clean``) + + Note: + The LibriMix dataset needs to be manually generated. Please check https://github.com/JorisCos/LibriMix + """ + + def __init__( + self, + root: Union[str, Path], + subset: str = "train-360", + num_speakers: int = 2, + sample_rate: int = 8000, + task: str = "sep_clean", + ): + self.root = Path(root) / f"Libri{num_speakers}Mix" + if sample_rate == 8000: + self.root = self.root / "wav8k/min" / subset + elif sample_rate == 16000: + self.root = self.root / "wav16k/min" / subset + else: + raise ValueError(f"Unsupported sample rate. Found {sample_rate}.") + self.sample_rate = sample_rate + self.task = task + self.mix_dir = (self.root / f"mix_{task.split('_')[1]}").resolve() + self.src_dirs = [(self.root / f"s{i+1}").resolve() for i in range(num_speakers)] + + self.files = [p.name for p in self.mix_dir.glob("*wav")] + self.files.sort() + + def _load_audio(self, path) -> torch.Tensor: + waveform, sample_rate = torchaudio.load(path) + if sample_rate != self.sample_rate: + raise ValueError( + f"The dataset contains audio file of sample rate {sample_rate}, " + f"but the requested sample rate is {self.sample_rate}." 
+ ) + return waveform + + def _load_sample(self, filename) -> SampleType: + mixed = self._load_audio(str(self.mix_dir / filename)) + srcs = [] + for i, dir_ in enumerate(self.src_dirs): + src = self._load_audio(str(dir_ / filename)) + if mixed.shape != src.shape: + raise ValueError(f"Different waveform shapes. mixed: {mixed.shape}, src[{i}]: {src.shape}") + srcs.append(src) + return self.sample_rate, mixed, srcs + + def __len__(self) -> int: + return len(self.files) + + def __getitem__(self, key: int) -> SampleType: + """Load the n-th sample from the dataset. + Args: + key (int): The index of the sample to be loaded + Returns: + (int, Tensor, List[Tensor]): ``(sample_rate, mix_waveform, list_of_source_waveforms)`` + """ + return self._load_sample(self.files[key]) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..5cd06468303bd612a45aa286ed5af83c169cff40 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py @@ -0,0 +1,135 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "train-clean-100" +FOLDER_IN_ARCHIVE = "LibriSpeech" +_DATA_SUBSETS = [ + "dev-clean", + "dev-other", + "test-clean", + "test-other", + "train-clean-100", + "train-clean-360", + "train-other-500", +] +_CHECKSUMS = { + "http://www.openslr.org/resources/12/dev-clean.tar.gz": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3", # noqa: E501 + "http://www.openslr.org/resources/12/dev-other.tar.gz": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365", # noqa: E501 + "http://www.openslr.org/resources/12/test-clean.tar.gz": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23", # noqa: E501 + "http://www.openslr.org/resources/12/test-other.tar.gz": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29", # noqa: E501 + "http://www.openslr.org/resources/12/train-clean-100.tar.gz": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2", # noqa: E501 + "http://www.openslr.org/resources/12/train-clean-360.tar.gz": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf", # noqa: E501 + "http://www.openslr.org/resources/12/train-other-500.tar.gz": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2", # noqa: E501 +} + + +def download_librispeech(root, url): + base_url = "http://www.openslr.org/resources/12/" + ext_archive = ".tar.gz" + + filename = url + ext_archive + archive = os.path.join(root, filename) + download_url = os.path.join(base_url, filename) + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(download_url, None) + download_url_to_file(download_url, archive, hash_prefix=checksum) + extract_archive(archive) + + +def load_librispeech_item( + fileid: str, path: str, ext_audio: str, ext_txt: str +) -> Tuple[Tensor, int, str, int, int, int]: + speaker_id, chapter_id, utterance_id = fileid.split("-") + + # Load audio + fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}" + file_audio = fileid_audio + ext_audio + file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) + waveform, sample_rate = 
torchaudio.load(file_audio) + + # Load text + file_text = f"{speaker_id}-{chapter_id}{ext_txt}" + file_text = os.path.join(path, speaker_id, chapter_id, file_text) + with open(file_text) as ft: + for line in ft: + fileid_text, transcript = line.strip().split(" ", 1) + if fileid_audio == fileid_text: + break + else: + # Translation not found + raise FileNotFoundError(f"Translation not found for {fileid_audio}") + + return ( + waveform, + sample_rate, + transcript, + int(speaker_id), + int(chapter_id), + int(utterance_id), + ) + + +class LIBRISPEECH(Dataset): + """Create a Dataset for *LibriSpeech* [:footcite:`7178964`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to dowload. + Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, + ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and + ``"train-other-500"``. (default: ``"train-clean-100"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"LibriSpeech"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + _ext_txt = ".trans.txt" + _ext_audio = ".flac" + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + ) -> None: + if url not in _DATA_SUBSETS: + raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.") + + root = os.fspath(root) + self._path = os.path.join(root, folder_in_archive, url) + + if not os.path.isdir(self._path): + if download: + download_librispeech(root, url) + else: + raise RuntimeError( + f"Dataset not found at {self._path}. Please set `download=True` to download the dataset." + ) + + self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio)) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`` + """ + fileid = self._walker[n] + return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/libritts.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/libritts.py new file mode 100644 index 0000000000000000000000000000000000000000..f7e10cedc4be9f91e0dc839ea446aea984060447 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/libritts.py @@ -0,0 +1,154 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "train-clean-100" +FOLDER_IN_ARCHIVE = "LibriTTS" +_CHECKSUMS = { + "http://www.openslr.org/resources/60/dev-clean.tar.gz": "da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a", # noqa: E501 + "http://www.openslr.org/resources/60/dev-other.tar.gz": "d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c", # noqa: E501 + "http://www.openslr.org/resources/60/test-clean.tar.gz": "234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5", # noqa: E501 + "http://www.openslr.org/resources/60/test-other.tar.gz": "33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d", # noqa: E501 + "http://www.openslr.org/resources/60/train-clean-100.tar.gz": "c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b", # noqa: E501 + "http://www.openslr.org/resources/60/train-clean-360.tar.gz": "ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886", # noqa: E501 + "http://www.openslr.org/resources/60/train-other-500.tar.gz": "e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df", # noqa: E501 +} + + +def load_libritts_item( + fileid: str, + path: str, + ext_audio: str, + ext_original_txt: str, + ext_normalized_txt: str, +) -> Tuple[Tensor, int, str, str, int, int, str]: + speaker_id, chapter_id, segment_id, utterance_id = fileid.split("_") + utterance_id = fileid + + normalized_text = utterance_id + ext_normalized_txt + normalized_text = os.path.join(path, speaker_id, chapter_id, normalized_text) + + original_text = utterance_id + ext_original_txt + original_text = os.path.join(path, speaker_id, chapter_id, original_text) + + file_audio = utterance_id + ext_audio + file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) + + # Load audio + waveform, sample_rate = torchaudio.load(file_audio) + + # Load original text + with open(original_text) as ft: + original_text = ft.readline() + + # Load normalized text + with open(normalized_text, "r") as ft: + normalized_text = ft.readline() + + return ( + waveform, + sample_rate, + original_text, + normalized_text, + int(speaker_id), + int(chapter_id), + utterance_id, + ) + + +class LIBRITTS(Dataset): + """Create a Dataset for *LibriTTS* [:footcite:`Zen2019LibriTTSAC`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to dowload. 
+ Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, + ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and + ``"train-other-500"``. (default: ``"train-clean-100"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"LibriTTS"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + _ext_original_txt = ".original.txt" + _ext_normalized_txt = ".normalized.txt" + _ext_audio = ".wav" + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + ) -> None: + + if url in [ + "dev-clean", + "dev-other", + "test-clean", + "test-other", + "train-clean-100", + "train-clean-360", + "train-other-500", + ]: + + ext_archive = ".tar.gz" + base_url = "http://www.openslr.org/resources/60/" + + url = os.path.join(base_url, url + ext_archive) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + folder_in_archive = os.path.join(folder_in_archive, basename) + + self._path = os.path.join(root, folder_in_archive) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio)) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str, str, int, int, str): + ``(waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id)`` + """ + fileid = self._walker[n] + return load_libritts_item( + fileid, + self._path, + self._ext_audio, + self._ext_original_txt, + self._ext_normalized_txt, + ) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/ljspeech.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/ljspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..e8421b639f913a007838c2d01fed1181d83d5c9a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/ljspeech.py @@ -0,0 +1,99 @@ +import csv +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "wavs", + "url": "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", + "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5", + } +} + + +class LJSPEECH(Dataset): + """Create a Dataset for *LJSpeech-1.1* [:footcite:`ljspeech17`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. 
+ url (str, optional): The URL to download the dataset from. + (default: ``"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"wavs"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + def __init__( + self, + root: Union[str, Path], + url: str = _RELEASE_CONFIGS["release1"]["url"], + folder_in_archive: str = _RELEASE_CONFIGS["release1"]["folder_in_archive"], + download: bool = False, + ) -> None: + + self._parse_filesystem(root, url, folder_in_archive, download) + + def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None: + root = Path(root) + + basename = os.path.basename(url) + archive = root / basename + + basename = Path(basename.split(".tar.bz2")[0]) + folder_in_archive = basename / folder_in_archive + + self._path = root / folder_in_archive + self._metadata_path = root / basename / "metadata.csv" + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _RELEASE_CONFIGS["release1"]["checksum"] + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + with open(self._metadata_path, "r", newline="") as metadata: + flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE) + self._flist = list(flist) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str): + ``(waveform, sample_rate, transcript, normalized_transcript)`` + """ + line = self._flist[n] + fileid, transcript, normalized_transcript = line + fileid_audio = self._path / (fileid + ".wav") + + # Load audio + waveform, sample_rate = torchaudio.load(fileid_audio) + + return ( + waveform, + sample_rate, + transcript, + normalized_transcript, + ) + + def __len__(self) -> int: + return len(self._flist) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/quesst14.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/quesst14.py new file mode 100644 index 0000000000000000000000000000000000000000..68ddceeaf9d770a772de87d9709472d6c0dce455 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/quesst14.py @@ -0,0 +1,109 @@ +import os +import re +from pathlib import Path +from typing import Optional, Tuple, Union + +import torch +import torchaudio +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +URL = "https://speech.fit.vutbr.cz/files/quesst14Database.tgz" +_CHECKSUM = "4f869e06bc066bbe9c5dde31dbd3909a0870d70291110ebbb38878dcbc2fc5e4" +_LANGUAGES = [ + "albanian", + "basque", + "czech", + "nnenglish", + "romanian", + "slovak", +] + + +class QUESST14(Dataset): + """Create *QUESST14* [:footcite:`Mir2015QUESST2014EQ`] Dataset + + Args: + root (str or Path): Root directory where the dataset's top level directory is found + subset (str): Subset of the dataset to use. Options: [``"docs"``, ``"dev"``, ``"eval"``]. + language (str or None, optional): Language to get dataset for. 
+ Options: [``None``, ``albanian``, ``basque``, ``czech``, ``nnenglish``, ``romanian``, ``slovak``]. + If ``None``, dataset consists of all languages. (default: ``"nnenglish"``) + download (bool, optional): Whether to download the dataset if it is not found at root path. + (default: ``False``) + """ + + def __init__( + self, + root: Union[str, Path], + subset: str, + language: Optional[str] = "nnenglish", + download: bool = False, + ) -> None: + assert subset in ["docs", "dev", "eval"], "`subset` must be one of ['docs', 'dev', 'eval']" + + assert language is None or language in _LANGUAGES, f"`language` must be None or one of {str(_LANGUAGES)}" + + # Get string representation of 'root' + root = os.fspath(root) + + basename = os.path.basename(URL) + archive = os.path.join(root, basename) + + basename = basename.rsplit(".", 2)[0] + self._path = os.path.join(root, basename) + + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + if not download: + raise RuntimeError("Dataset not found. Please use `download=True` to download") + download_url_to_file(URL, archive, hash_prefix=_CHECKSUM) + extract_archive(archive, root) + + if subset == "docs": + self.data = filter_audio_paths(self._path, language, "language_key_utterances.lst") + elif subset == "dev": + self.data = filter_audio_paths(self._path, language, "language_key_dev.lst") + elif subset == "eval": + self.data = filter_audio_paths(self._path, language, "language_key_eval.lst") + + def _load_sample(self, n: int) -> Tuple[torch.Tensor, int, str]: + audio_path = self.data[n] + wav, sample_rate = torchaudio.load(audio_path) + return wav, sample_rate, audio_path.with_suffix("").name + + def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]: + """Load the n-th sample from the dataset. 
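+
+        A usage sketch, added for illustration (the root path here is a placeholder, not part of
+        the original docstring):
+
+            >>> dataset = QUESST14("./data", subset="docs", download=True)
+            >>> waveform, sample_rate, file_name = dataset[0]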
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str): ``(waveform, sample_rate, file_name)`` + """ + return self._load_sample(n) + + def __len__(self) -> int: + return len(self.data) + + +def filter_audio_paths( + path: str, + language: str, + lst_name: str, +): + """Extract audio paths for the given language.""" + audio_paths = [] + + path = Path(path) + with open(path / "scoring" / lst_name) as f: + for line in f: + audio_path, lang = line.strip().split() + if language is not None and lang != language: + continue + audio_path = re.sub(r"^.*?\/", "", audio_path) + audio_paths.append(path / audio_path) + + return audio_paths diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/speechcommands.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/speechcommands.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9872662f9396651ff85a1438aed14afaed3c90 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/speechcommands.py @@ -0,0 +1,149 @@ +import os +from pathlib import Path +from typing import Optional, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +FOLDER_IN_ARCHIVE = "SpeechCommands" +URL = "speech_commands_v0.02" +HASH_DIVIDER = "_nohash_" +EXCEPT_FOLDER = "_background_noise_" +_CHECKSUMS = { + "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz": "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d", # noqa: E501 + "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz": "af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58", # noqa: E501 +} + + +def _load_list(root, *filenames): + output = [] + for filename in filenames: + filepath = os.path.join(root, filename) + with open(filepath) as fileobj: + output += [os.path.normpath(os.path.join(root, line.strip())) for line in fileobj] + return output + + +def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str, str, int]: + relpath = os.path.relpath(filepath, path) + label, filename = os.path.split(relpath) + # Besides the officially supported split method for datasets defined by "validation_list.txt" + # and "testing_list.txt" over "speech_commands_v0.0x.tar.gz" archives, an alternative split + # method referred to in paragraph 2-3 of Section 7.1, references 13 and 14 of the original + # paper, and the checksums file from the tensorflow_datasets package [1] is also supported. + # Some filenames in those "speech_commands_test_set_v0.0x.tar.gz" archives have the form + # "xxx.wav.wav", so file extensions twice needs to be stripped twice. + # [1] https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/url_checksums/speech_commands.txt + speaker, _ = os.path.splitext(filename) + speaker, _ = os.path.splitext(speaker) + + speaker_id, utterance_number = speaker.split(HASH_DIVIDER) + utterance_number = int(utterance_number) + + # Load audio + waveform, sample_rate = torchaudio.load(filepath) + return waveform, sample_rate, label, speaker_id, utterance_number + + +class SPEECHCOMMANDS(Dataset): + """Create a Dataset for *Speech Commands* [:footcite:`speechcommandsv2`]. 
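+
+    A minimal usage sketch, added for illustration (the root path and the choice of
+    ``subset="testing"`` are placeholders, not part of the original docstring):
+
+        >>> dataset = SPEECHCOMMANDS("./data", download=True, subset="testing")
+        >>> waveform, sample_rate, label, speaker_id, utterance_number = dataset[0]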
+ + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to dowload. + Allowed type values are ``"speech_commands_v0.01"`` and ``"speech_commands_v0.02"`` + (default: ``"speech_commands_v0.02"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"SpeechCommands"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + subset (str or None, optional): + Select a subset of the dataset [None, "training", "validation", "testing"]. None means + the whole dataset. "validation" and "testing" are defined in "validation_list.txt" and + "testing_list.txt", respectively, and "training" is the rest. Details for the files + "validation_list.txt" and "testing_list.txt" are explained in the README of the dataset + and in the introduction of Section 7 of the original paper and its reference 12. The + original paper can be found `here `_. (Default: ``None``) + """ + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + subset: Optional[str] = None, + ) -> None: + + assert subset is None or subset in ["training", "validation", "testing"], ( + "When `subset` not None, it must take a value from " + "{'training', 'validation', 'testing'}." + ) + + if url in [ + "speech_commands_v0.01", + "speech_commands_v0.02", + ]: + base_url = "https://storage.googleapis.com/download.tensorflow.org/data/" + ext_archive = ".tar.gz" + + url = os.path.join(base_url, url + ext_archive) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.rsplit(".", 2)[0] + folder_in_archive = os.path.join(folder_in_archive, basename) + + self._path = os.path.join(root, folder_in_archive) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive, self._path) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + if subset == "validation": + self._walker = _load_list(self._path, "validation_list.txt") + elif subset == "testing": + self._walker = _load_list(self._path, "testing_list.txt") + elif subset == "training": + excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt")) + walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav")) + self._walker = [ + w + for w in walker + if HASH_DIVIDER in w and EXCEPT_FOLDER not in w and os.path.normpath(w) not in excludes + ] + else: + walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav")) + self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w] + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str, int): + ``(waveform, sample_rate, label, speaker_id, utterance_number)`` + """ + fileid = self._walker[n] + return load_speechcommands_item(fileid, self._path) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/tedlium.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/tedlium.py new file mode 100644 index 0000000000000000000000000000000000000000..d7478ca7beb892c1016a5d286811111de12f436d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/tedlium.py @@ -0,0 +1,206 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "TEDLIUM_release1", + "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz", + "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27", + "data_path": "", + "subset": "train", + "supported_subsets": ["train", "test", "dev"], + "dict": "TEDLIUM.150K.dic", + }, + "release2": { + "folder_in_archive": "TEDLIUM_release2", + "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz", + "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58", + "data_path": "", + "subset": "train", + "supported_subsets": ["train", "test", "dev"], + "dict": "TEDLIUM.152k.dic", + }, + "release3": { + "folder_in_archive": "TEDLIUM_release-3", + "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz", + "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", + "data_path": "data/", + "subset": "train", + "supported_subsets": ["train", "test", "dev"], + "dict": "TEDLIUM.152k.dic", + }, +} + + +class TEDLIUM(Dataset): + """ + Create a Dataset for *Tedlium* [:footcite:`rousseau2012tedlium`]. It supports releases 1,2 and 3. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + release (str, optional): Release version. + Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``. + (default: ``"release1"``). + subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``, + and ``"test"``. Defaults to ``"train"``. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ audio_ext (str, optional): extension for audio file (default: ``".sph"``) + """ + + def __init__( + self, + root: Union[str, Path], + release: str = "release1", + subset: str = "train", + download: bool = False, + audio_ext: str = ".sph", + ) -> None: + self._ext_audio = audio_ext + if release in _RELEASE_CONFIGS.keys(): + folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] + url = _RELEASE_CONFIGS[release]["url"] + subset = subset if subset else _RELEASE_CONFIGS[release]["subset"] + else: + # Raise warning + raise RuntimeError( + "The release {} does not match any of the supported tedlium releases{} ".format( + release, + _RELEASE_CONFIGS.keys(), + ) + ) + if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]: + # Raise warning + raise RuntimeError( + "The subset {} does not match any of the supported tedlium subsets{} ".format( + subset, + _RELEASE_CONFIGS[release]["supported_subsets"], + ) + ) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + + if release == "release3": + if subset == "train": + self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) + else: + self._path = os.path.join(root, folder_in_archive, "legacy", subset) + else: + self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"], subset) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _RELEASE_CONFIGS[release]["checksum"] + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + # Create list for all samples + self._filelist = [] + stm_path = os.path.join(self._path, "stm") + for file in sorted(os.listdir(stm_path)): + if file.endswith(".stm"): + stm_path = os.path.join(self._path, "stm", file) + with open(stm_path) as f: + l = len(f.readlines()) + file = file.replace(".stm", "") + self._filelist.extend((file, line) for line in range(l)) + # Create dict path for later read + self._dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) + self._phoneme_dict = None + + def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: + """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name. 
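+
+        Clarifying note (added): ``fileid`` names a talk whose transcript is read from
+        ``stm/<fileid>.stm`` and whose audio is read from ``sph/<fileid><audio_ext>``;
+        ``line`` selects a single utterance row inside that ``.stm`` file.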
+ + Args: + fileid (str): File id to identify both text and audio files corresponding to the sample + line (int): Line identifier for the sample inside the text file + path (str): Dataset root path + + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)`` + """ + transcript_path = os.path.join(path, "stm", fileid) + with open(transcript_path + ".stm") as f: + transcript = f.readlines()[line] + talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) + + wave_path = os.path.join(path, "sph", fileid) + waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time) + + return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier) + + def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: + """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality + and load individual sentences from a full ted audio talk file. + + Args: + path (str): Path to audio file + start_time (int): Time in seconds where the sample sentence stars + end_time (int): Time in seconds where the sample sentence finishes + sample_rate (float, optional): Sampling rate + + Returns: + [Tensor, int]: Audio tensor representation and sample rate + """ + start_time = int(float(start_time) * sample_rate) + end_time = int(float(end_time) * sample_rate) + + kwargs = {"frame_offset": start_time, "num_frames": end_time - start_time} + + return torchaudio.load(path, **kwargs) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)`` + """ + fileid, line = self._filelist[n] + return self._load_tedlium_item(fileid, line, self._path) + + def __len__(self) -> int: + """TEDLIUM dataset custom function overwritting len default behaviour. + + Returns: + int: TEDLIUM dataset length + """ + return len(self._filelist) + + @property + def phoneme_dict(self): + """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes. + Note that some words have empty phonemes. + """ + # Read phoneme dictionary + if not self._phoneme_dict: + self._phoneme_dict = {} + with open(self._dict_path, "r", encoding="utf-8") as f: + for line in f.readlines(): + content = line.strip().split() + self._phoneme_dict[content[0]] = tuple(content[1:]) # content[1:] can be empty list + return self._phoneme_dict.copy() diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/utils.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..020555c480cb8f71ddbc26d525716e34bfc35fd8 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/utils.py @@ -0,0 +1,191 @@ +import hashlib +import logging +import os +import tarfile +import urllib +import urllib.request +import warnings +import zipfile +from typing import Any, Iterable, List, Optional + +from torch.utils.model_zoo import tqdm + + +def stream_url( + url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True +) -> Iterable: + """Stream url by chunk + + Args: + url (str): Url. 
+ start_byte (int or None, optional): Start streaming at that point (Default: ``None``). + block_size (int, optional): Size of chunks to stream (Default: ``32 * 1024``). + progress_bar (bool, optional): Display a progress bar (Default: ``True``). + """ + + # If we already have the whole file, there is no need to download it again + req = urllib.request.Request(url, method="HEAD") + with urllib.request.urlopen(req) as response: + url_size = int(response.info().get("Content-Length", -1)) + if url_size == start_byte: + return + + req = urllib.request.Request(url) + if start_byte: + req.headers["Range"] = "bytes={}-".format(start_byte) + + with urllib.request.urlopen(req) as upointer, tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar: + + num_bytes = 0 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + yield chunk + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def download_url( + url: str, + download_folder: str, + filename: Optional[str] = None, + hash_value: Optional[str] = None, + hash_type: str = "sha256", + progress_bar: bool = True, + resume: bool = False, +) -> None: + """Download file to disk. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + filename (str or None, optional): Name of downloaded file. If None, it is inferred from the url + (Default: ``None``). + hash_value (str or None, optional): Hash for url (Default: ``None``). + hash_type (str, optional): Hash type, among "sha256" and "md5" (Default: ``"sha256"``). + progress_bar (bool, optional): Display a progress bar (Default: ``True``). + resume (bool, optional): Enable resuming download (Default: ``False``). + """ + warnings.warn("download_url is deprecated and will be removed in the v0.12 release.") + req = urllib.request.Request(url, method="HEAD") + req_info = urllib.request.urlopen(req).info() + + # Detect filename + filename = filename or req_info.get_filename() or os.path.basename(url) + filepath = os.path.join(download_folder, filename) + if resume and os.path.exists(filepath): + mode = "ab" + local_size: Optional[int] = os.path.getsize(filepath) + + elif not resume and os.path.exists(filepath): + raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + else: + mode = "wb" + local_size = None + + if hash_value and local_size == int(req_info.get("Content-Length", -1)): + with open(filepath, "rb") as file_obj: + if validate_file(file_obj, hash_value, hash_type): + return + raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): + fpointer.write(chunk) + + with open(filepath, "rb") as file_obj: + if hash_value and not validate_file(file_obj, hash_value, hash_type): + raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + + +def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: + """Validate a given file object with its hash. + + Args: + file_obj: File object to read from. + hash_value (str): Hash for url. + hash_type (str, optional): Hash type, among "sha256" and "md5" (Default: ``"sha256"``). + + Returns: + bool: return True if its a valid file, else False. 
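+
+    Example (illustrative only; the file name and hash value are placeholders):
+        >>> with open("archive.tar.gz", "rb") as file_obj:
+        >>>     validate_file(file_obj, hash_value="3c01...", hash_type="sha256")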
+ """ + + if hash_type == "sha256": + hash_func = hashlib.sha256() + elif hash_type == "md5": + hash_func = hashlib.md5() + else: + raise ValueError + + while True: + # Read by chunk to avoid filling memory + chunk = file_obj.read(1024**2) + if not chunk: + break + hash_func.update(chunk) + + return hash_func.hexdigest() == hash_value + + +def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: + """Extract archive. + Args: + from_path (str): the path of the archive. + to_path (str or None, optional): the root path of the extraced files (directory of from_path) + (Default: ``None``) + overwrite (bool, optional): overwrite existing files (Default: ``False``) + + Returns: + List[str]: List of paths to extracted files even if not overwritten. + + Examples: + >>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz' + >>> from_path = './validation.tar.gz' + >>> to_path = './' + >>> torchaudio.datasets.utils.download_from_url(url, from_path) + >>> torchaudio.datasets.utils.extract_archive(from_path, to_path) + """ + + if to_path is None: + to_path = os.path.dirname(from_path) + + try: + with tarfile.open(from_path, "r") as tar: + logging.info("Opened tar file {}.".format(from_path)) + files = [] + for file_ in tar: # type: Any + file_path = os.path.join(to_path, file_.name) + if file_.isfile(): + files.append(file_path) + if os.path.exists(file_path): + logging.info("{} already extracted.".format(file_path)) + if not overwrite: + continue + tar.extract(file_, to_path) + return files + except tarfile.ReadError: + pass + + try: + with zipfile.ZipFile(from_path, "r") as zfile: + logging.info("Opened zip file {}.".format(from_path)) + files = zfile.namelist() + for file_ in files: + file_path = os.path.join(to_path, file_) + if os.path.exists(file_path): + logging.info("{} already extracted.".format(file_path)) + if not overwrite: + continue + zfile.extract(file_, to_path) + return files + except zipfile.BadZipFile: + pass + + raise NotImplementedError("We currently only support tar.gz, tgz, and zip achives.") diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/vctk.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/vctk.py new file mode 100644 index 0000000000000000000000000000000000000000..a2dd6abddf9a9dea8d583e9d42f2578b065f1e86 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/vctk.py @@ -0,0 +1,133 @@ +import os +from typing import Tuple + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip" +_CHECKSUMS = { + "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip": "f96258be9fdc2cbff6559541aae7ea4f59df3fcaf5cf963aae5ca647357e359c" # noqa: E501 +} + + +SampleType = Tuple[Tensor, int, str, str, str] + + +class VCTK_092(Dataset): + """Create *VCTK 0.92* [:footcite:`yamagishi2019vctk`] Dataset + + Args: + root (str): Root directory where the dataset's top level directory is found. + mic_id (str, optional): Microphone ID. Either ``"mic1"`` or ``"mic2"``. (default: ``"mic2"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ url (str, optional): The URL to download the dataset from. + (default: ``"https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"``) + audio_ext (str, optional): Custom audio extension if dataset is converted to non-default audio format. + + Note: + * All the speeches from speaker ``p315`` will be skipped due to the lack of the corresponding text files. + * All the speeches from ``p280`` will be skipped for ``mic_id="mic2"`` due to the lack of the audio files. + * Some of the speeches from speaker ``p362`` will be skipped due to the lack of the audio files. + * See Also: https://datashare.is.ed.ac.uk/handle/10283/3443 + """ + + def __init__( + self, + root: str, + mic_id: str = "mic2", + download: bool = False, + url: str = URL, + audio_ext=".flac", + ): + if mic_id not in ["mic1", "mic2"]: + raise RuntimeError(f'`mic_id` has to be either "mic1" or "mic2". Found: {mic_id}') + + archive = os.path.join(root, "VCTK-Corpus-0.92.zip") + + self._path = os.path.join(root, "VCTK-Corpus-0.92") + self._txt_dir = os.path.join(self._path, "txt") + self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed") + self._mic_id = mic_id + self._audio_ext = audio_ext + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive, self._path) + + if not os.path.isdir(self._path): + raise RuntimeError("Dataset not found. Please use `download=True` to download it.") + + # Extracting speaker IDs from the folder structure + self._speaker_ids = sorted(os.listdir(self._txt_dir)) + self._sample_ids = [] + + """ + Due to some insufficient data complexity in the 0.92 version of this dataset, + we start traversing the audio folder structure in accordance with the text folder. + As some of the audio files are missing of either ``mic_1`` or ``mic_2`` but the + text is present for the same, we first check for the existence of the audio file + before adding it to the ``sample_ids`` list. + + Once the ``audio_ids`` are loaded into memory we can quickly access the list for + different parameters required by the user. 
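+
+        As an illustration (file names are hypothetical): a transcript ``p225/p225_001.txt``
+        whose matching audio file ``p225_001_mic2.flac`` exists is recorded as the sample id
+        ``["p225", "001"]``, from which ``_load_sample`` later rebuilds both paths.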
+ """ + for speaker_id in self._speaker_ids: + if speaker_id == "p280" and mic_id == "mic2": + continue + utterance_dir = os.path.join(self._txt_dir, speaker_id) + for utterance_file in sorted(f for f in os.listdir(utterance_dir) if f.endswith(".txt")): + utterance_id = os.path.splitext(utterance_file)[0] + audio_path_mic = os.path.join( + self._audio_dir, + speaker_id, + f"{utterance_id}_{mic_id}{self._audio_ext}", + ) + if speaker_id == "p362" and not os.path.isfile(audio_path_mic): + continue + self._sample_ids.append(utterance_id.split("_")) + + def _load_text(self, file_path) -> str: + with open(file_path) as file_path: + return file_path.readlines()[0] + + def _load_audio(self, file_path) -> Tuple[Tensor, int]: + return torchaudio.load(file_path) + + def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType: + transcript_path = os.path.join(self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt") + audio_path = os.path.join( + self._audio_dir, + speaker_id, + f"{speaker_id}_{utterance_id}_{mic_id}{self._audio_ext}", + ) + + # Reading text + transcript = self._load_text(transcript_path) + + # Reading FLAC + waveform, sample_rate = self._load_audio(audio_path) + + return (waveform, sample_rate, transcript, speaker_id, utterance_id) + + def __getitem__(self, n: int) -> SampleType: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str, str): + ``(waveform, sample_rate, transcript, speaker_id, utterance_id)`` + """ + speaker_id, utterance_id = self._sample_ids[n] + return self._load_sample(speaker_id, utterance_id, self._mic_id) + + def __len__(self) -> int: + return len(self._sample_ids) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/yesno.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/yesno.py new file mode 100644 index 0000000000000000000000000000000000000000..8818f578052c7cfbee876aba9db7402abdc835b0 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/yesno.py @@ -0,0 +1,82 @@ +import os +from pathlib import Path +from typing import List, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "waves_yesno", + "url": "http://www.openslr.org/resources/1/waves_yesno.tar.gz", + "checksum": "c3f49e0cca421f96b75b41640749167b52118f232498667ca7a5f9416aef8e73", + } +} + + +class YESNO(Dataset): + """Create a Dataset for *YesNo* [:footcite:`YesNo`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"http://www.openslr.org/resources/1/waves_yesno.tar.gz"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"waves_yesno"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ """ + + def __init__( + self, + root: Union[str, Path], + url: str = _RELEASE_CONFIGS["release1"]["url"], + folder_in_archive: str = _RELEASE_CONFIGS["release1"]["folder_in_archive"], + download: bool = False, + ) -> None: + + self._parse_filesystem(root, url, folder_in_archive, download) + + def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None: + root = Path(root) + archive = os.path.basename(url) + archive = root / archive + + self._path = root / folder_in_archive + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _RELEASE_CONFIGS["release1"]["checksum"] + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + + if not os.path.isdir(self._path): + raise RuntimeError("Dataset not found. Please use `download=True` to download it.") + + self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*.wav")) + + def _load_item(self, fileid: str, path: str): + labels = [int(c) for c in fileid.split("_")] + file_audio = os.path.join(path, fileid + ".wav") + waveform, sample_rate = torchaudio.load(file_audio) + return waveform, sample_rate, labels + + def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, List[int]): ``(waveform, sample_rate, labels)`` + """ + fileid = self._walker[n] + item = self._load_item(fileid, self._path) + return item + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..06325da3fee54a2696c6829ee6d5491a9384b37d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/__init__.py @@ -0,0 +1,109 @@ +from .filtering import ( + allpass_biquad, + band_biquad, + bandpass_biquad, + bandreject_biquad, + bass_biquad, + biquad, + contrast, + dcshift, + deemph_biquad, + dither, + equalizer_biquad, + filtfilt, + flanger, + gain, + highpass_biquad, + lfilter, + lowpass_biquad, + overdrive, + phaser, + riaa_biquad, + treble_biquad, + vad, +) +from .functional import ( + amplitude_to_DB, + apply_beamforming, + apply_codec, + compute_deltas, + compute_kaldi_pitch, + create_dct, + DB_to_amplitude, + detect_pitch_frequency, + edit_distance, + griffinlim, + inverse_spectrogram, + linear_fbanks, + mask_along_axis, + mask_along_axis_iid, + melscale_fbanks, + mu_law_decoding, + mu_law_encoding, + mvdr_weights_rtf, + mvdr_weights_souden, + phase_vocoder, + pitch_shift, + psd, + resample, + rnnt_loss, + rtf_evd, + rtf_power, + sliding_window_cmn, + spectral_centroid, + spectrogram, +) + +__all__ = [ + "amplitude_to_DB", + "compute_deltas", + "compute_kaldi_pitch", + "create_dct", + "melscale_fbanks", + "linear_fbanks", + "DB_to_amplitude", + "detect_pitch_frequency", + "griffinlim", + "mask_along_axis", + "mask_along_axis_iid", + "mu_law_encoding", + "mu_law_decoding", + "phase_vocoder", + "sliding_window_cmn", + "spectrogram", + "inverse_spectrogram", + "spectral_centroid", + "allpass_biquad", + "band_biquad", + "bandpass_biquad", + "bandreject_biquad", + "bass_biquad", + "biquad", + "contrast", + "dither", + "dcshift", + "deemph_biquad", + "equalizer_biquad", + "filtfilt", + "flanger", + 
"gain", + "highpass_biquad", + "lfilter", + "lowpass_biquad", + "overdrive", + "phaser", + "riaa_biquad", + "treble_biquad", + "vad", + "apply_codec", + "resample", + "edit_distance", + "pitch_shift", + "rnnt_loss", + "psd", + "mvdr_weights_souden", + "mvdr_weights_rtf", + "rtf_evd", + "rtf_power", + "apply_beamforming", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/filtering.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..4a08418ed105494b1bc9a13dd10a76bbb595e8d6 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/filtering.py @@ -0,0 +1,1661 @@ +import math +import warnings +from typing import Optional + +import torch +from torch import Tensor + + +def _dB2Linear(x: float) -> float: + return math.exp(x * math.log(10) / 20.0) + + +def _generate_wave_table( + wave_type: str, + data_type: str, + table_size: int, + min: float, + max: float, + phase: float, + device: torch.device, +) -> Tensor: + r"""A helper function for phaser. Generates a table with given parameters. + + Args: + wave_type (str): SINE or TRIANGULAR + data_type (str): desired data_type ( `INT` or `FLOAT` ) + table_size (int): desired table size + min (float): desired min value + max (float): desired max value + phase (float): desired phase + device (torch.device): Torch device on which table must be generated + Returns: + Tensor: A 1D tensor with wave table values + """ + + phase_offset = int(phase / math.pi / 2 * table_size + 0.5) + + t = torch.arange(table_size, device=device, dtype=torch.int32) + + point = (t + phase_offset) % table_size + + d = torch.zeros_like(point, device=device, dtype=torch.float64) + + if wave_type == "SINE": + d = (torch.sin(point.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2 + elif wave_type == "TRIANGLE": + d = point.to(torch.float64) * 2 / table_size + value = torch.div(4 * point, table_size, rounding_mode="floor") + d[value == 0] = d[value == 0] + 0.5 + d[value == 1] = 1.5 - d[value == 1] + d[value == 2] = 1.5 - d[value == 2] + d[value == 3] = d[value == 3] - 1.5 + + d = d * (max - min) + min + + if data_type == "INT": + mask = d < 0 + d[mask] = d[mask] - 0.5 + d[~mask] = d[~mask] + 0.5 + d = d.to(torch.int32) + elif data_type == "FLOAT": + d = d.to(torch.float32) + + return d + + +def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: + r"""Design two-pole all-pass filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform(torch.Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + + alpha = torch.sin(w0) / 2 / Q + + b0 = 1 - alpha + b1 = -2 * torch.cos(w0) + b2 = 1 + alpha + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def band_biquad( + waveform: Tensor, + sample_rate: int, + central_freq: float, + Q: float = 0.707, + noise: bool = False, +) -> Tensor: + r"""Design two-pole band filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). + noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion). + If ``False``, uses mode oriented to pitched audio, i.e. voice, singing, + or instrumental music (Default: ``False``). + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + bw_Hz = central_freq / Q + + a0 = 1.0 + a2 = torch.exp(-2 * math.pi * bw_Hz / sample_rate) + a1 = -4 * a2 / (1 + a2) * torch.cos(w0) + + b0 = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2) + + if noise: + mult = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / b0 + b0 = mult * b0 + + b1 = 0.0 + b2 = 0.0 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bandpass_biquad( + waveform: Tensor, + sample_rate: int, + central_freq: float, + Q: float = 0.707, + const_skirt_gain: bool = False, +) -> Tensor: + r"""Design two-pole band-pass filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q). + If ``False``, uses a constant 0dB peak gain. 
(Default: ``False``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + temp = torch.sin(w0) / 2 if const_skirt_gain else alpha + b0 = temp + b1 = 0.0 + b2 = -temp + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: + r"""Design two-pole band-reject filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + b0 = 1.0 + b1 = -2 * torch.cos(w0) + b2 = 1.0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bass_biquad( + waveform: Tensor, + sample_rate: int, + gain: float, + central_freq: float = 100, + Q: float = 0.707, +) -> Tensor: + r"""Design a bass tone-control effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB. + central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). 
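+
+    Note (added for clarity): ``gain`` in dB is converted to the linear shelf amplitude
+    ``A = 10 ** (gain / 40)``, the usual audio-EQ-cookbook convention, before the biquad
+    coefficients are computed.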
+ + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + A = torch.exp(gain / 40 * math.log(10)) + + temp1 = 2 * torch.sqrt(A) * alpha + temp2 = (A - 1) * torch.cos(w0) + temp3 = (A + 1) * torch.cos(w0) + + b0 = A * ((A + 1) - temp2 + temp1) + b1 = 2 * A * ((A - 1) - temp3) + b2 = A * ((A + 1) - temp2 - temp1) + a0 = (A + 1) + temp2 + temp1 + a1 = -2 * ((A - 1) + temp3) + a2 = (A + 1) + temp2 - temp1 + + return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0) + + +def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor: + r"""Perform a biquad filter of input tensor. Initial conditions set to 0. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + b0 (float or torch.Tensor): numerator coefficient of current input, x[n] + b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1] + b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2] + a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1 + a1 (float or torch.Tensor): denominator coefficient of current output y[n-1] + a2 (float or torch.Tensor): denominator coefficient of current output y[n-2] + + Returns: + Tensor: Waveform with dimension of `(..., time)` + + Reference: + - https://en.wikipedia.org/wiki/Digital_biquad_filter + """ + + device = waveform.device + dtype = waveform.dtype + + b0 = torch.as_tensor(b0, dtype=dtype, device=device).view(1) + b1 = torch.as_tensor(b1, dtype=dtype, device=device).view(1) + b2 = torch.as_tensor(b2, dtype=dtype, device=device).view(1) + a0 = torch.as_tensor(a0, dtype=dtype, device=device).view(1) + a1 = torch.as_tensor(a1, dtype=dtype, device=device).view(1) + a2 = torch.as_tensor(a2, dtype=dtype, device=device).view(1) + + output_waveform = lfilter( + waveform, + torch.cat([a0, a1, a2]), + torch.cat([b0, b1, b2]), + ) + return output_waveform + + +def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor: + r"""Apply contrast effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Comparable with compression, this effect modifies an audio signal to make it sound louder + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + enhancement_amount (float, optional): controls the amount of the enhancement + Allowed range of values for enhancement_amount : 0-100 + Note that enhancement_amount = 0 still gives a significant contrast enhancement + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + if not 0 <= enhancement_amount <= 100: + raise ValueError("Allowed range of values for enhancement_amount : 0-100") + + contrast = enhancement_amount / 750.0 + + temp1 = waveform * (math.pi / 2) + temp2 = contrast * torch.sin(temp1 * 4) + output_waveform = torch.sin(temp1 + temp2) + + return output_waveform + + +def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor: + r"""Apply a DC shift to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + This can be useful to remove a DC offset + (caused perhaps by a hardware problem in the recording chain) from the audio + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + shift (float): indicates the amount to shift the audio + Allowed range of values for shift : -2.0 to +2.0 + limiter_gain (float of None, optional): It is used only on peaks to prevent clipping + It should have a value much less than 1 (e.g. 0.05 or 0.02) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + output_waveform = waveform + limiter_threshold = 0.0 + + if limiter_gain is not None: + limiter_threshold = 1.0 - (abs(shift) - limiter_gain) + + # Note: + # the following index-based update breaks auto-grad support + if limiter_gain is not None and shift > 0: + mask = waveform > limiter_threshold + temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold) + output_waveform[mask] = (temp + limiter_threshold + shift).clamp(max=limiter_threshold) + output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1) + elif limiter_gain is not None and shift < 0: + mask = waveform < -limiter_threshold + temp = (waveform[mask] + limiter_threshold) * limiter_gain / (1 - limiter_threshold) + output_waveform[mask] = (temp - limiter_threshold + shift).clamp(min=-limiter_threshold) + output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1) + else: + output_waveform = (waveform + shift).clamp(min=-1, max=1) + + return output_waveform + + +def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor: + r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000`` + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + + if sample_rate == 44100: + central_freq = 5283 + width_slope = 0.4845 + gain = -9.477 + elif sample_rate == 48000: + central_freq = 5356 + width_slope = 0.479 + gain = -9.62 + else: + raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)") + + w0 = 2 * math.pi * central_freq / sample_rate + A = math.exp(gain / 40.0 * math.log(10)) + alpha = math.sin(w0) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2) + + temp1 = 2 * math.sqrt(A) * alpha + temp2 = (A - 1) * math.cos(w0) + temp3 = (A + 1) * math.cos(w0) + + b0 = A * ((A + 1) + temp2 + temp1) + b1 = -2 * A * ((A - 1) + temp3) + b2 = A * ((A + 1) + temp2 - temp1) + a0 = (A + 1) - temp2 + temp1 + a1 = 2 * ((A - 1) - temp3) + a2 = (A + 1) - temp2 - temp1 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor: + r"""Noise shaping is calculated by error: + error[n] = dithered[n] - original[n] + noise_shaped_waveform[n] = dithered[n] + error[n-1] + """ + wf_shape = waveform.size() + waveform = waveform.reshape(-1, wf_shape[-1]) + + dithered_shape = dithered_waveform.size() + dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1]) + + error = dithered_waveform - waveform + + # add error[n-1] to dithered_waveform[n], so offset the error by 1 index + zeros = torch.zeros(1, dtype=error.dtype, device=error.device) + for index in range(error.size()[0]): + err = error[index] + error_offset = torch.cat((zeros, err)) + error[index] = error_offset[: waveform.size()[1]] + + noise_shaped = dithered_waveform + error + return noise_shaped.reshape(dithered_shape[:-1] + noise_shaped.shape[-1:]) + + +def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor: + r"""Apply a probability distribution function on a waveform. + + Triangular probability density function (TPDF) dither noise has a + triangular distribution; values in the center of the range have a higher + probability of occurring. + + Rectangular probability density function (RPDF) dither noise has a + uniform distribution; any value in the specified range has the same + probability of occurring. + + Gaussian probability density function (GPDF) has a normal distribution. + The relationship of probabilities of results follows a bell-shaped, + or Gaussian curve, typical of dither generated by analog sources. 
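+
+    A usage sketch via the public ``dither`` wrapper defined below (tensor contents are
+    illustrative):
+
+        >>> waveform = torch.rand(2, 16000) * 2 - 1   # values roughly in [-1, 1]
+        >>> out = dither(waveform, density_function="TPDF", noise_shaping=True)
+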
+ Args: + waveform (Tensor): Tensor of audio of dimension (..., time) + density_function (str, optional): The density function of a + continuous random variable (Default: ``"TPDF"``) + Options: Triangular Probability Density Function - `TPDF` + Rectangular Probability Density Function - `RPDF` + Gaussian Probability Density Function - `GPDF` + Returns: + Tensor: waveform dithered with TPDF + """ + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + channel_size = waveform.size()[0] - 1 + time_size = waveform.size()[-1] - 1 + + random_channel = ( + int( + torch.randint( + channel_size, + [ + 1, + ], + ).item() + ) + if channel_size > 0 + else 0 + ) + random_time = ( + int( + torch.randint( + time_size, + [ + 1, + ], + ).item() + ) + if time_size > 0 + else 0 + ) + + number_of_bits = 16 + up_scaling = 2 ** (number_of_bits - 1) - 2 + signal_scaled = waveform * up_scaling + down_scaling = 2 ** (number_of_bits - 1) + + signal_scaled_dis = waveform + if density_function == "RPDF": + RPDF = waveform[random_channel][random_time] - 0.5 + + signal_scaled_dis = signal_scaled + RPDF + elif density_function == "GPDF": + # TODO Replace by distribution code once + # https://github.com/pytorch/pytorch/issues/29843 is resolved + # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample() + + num_rand_variables = 6 + + gaussian = waveform[random_channel][random_time] + for ws in num_rand_variables * [time_size]: + rand_chan = int( + torch.randint( + channel_size, + [ + 1, + ], + ).item() + ) + gaussian += waveform[rand_chan][ + int( + torch.randint( + ws, + [ + 1, + ], + ).item() + ) + ] + + signal_scaled_dis = signal_scaled + gaussian + else: + # dtype needed for https://github.com/pytorch/pytorch/issues/32358 + TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device) + TPDF = TPDF.repeat((channel_size + 1), 1) + signal_scaled_dis = signal_scaled + TPDF + + quantised_signal_scaled = torch.round(signal_scaled_dis) + quantised_signal = quantised_signal_scaled / down_scaling + + # unpack batch + return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:]) + + +def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor: + r"""Apply dither + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Dither increases the perceived dynamic range of audio stored at a + particular bit-depth by eliminating nonlinear truncation distortion + (i.e. adding minimally perceived noise to mask distortion caused by quantization). + + Args: + waveform (Tensor): Tensor of audio of dimension (..., time) + density_function (str, optional): + The density function of a continuous random variable. One of + ``"TPDF"`` (Triangular Probability Density Function), + ``"RPDF"`` (Rectangular Probability Density Function) or + ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``). + noise_shaping (bool, optional): a filtering process that shapes the spectral + energy of quantisation error (Default: ``False``) + + Returns: + Tensor: waveform dithered + """ + dithered = _apply_probability_distribution(waveform, density_function=density_function) + + if noise_shaping: + return _add_noise_shaping(dithered, waveform) + else: + return dithered + + +def equalizer_biquad( + waveform: Tensor, + sample_rate: int, + center_freq: float, + gain: float, + Q: float = 0.707, +) -> Tensor: + r"""Design biquad peaking equalizer filter and perform filtering. 
Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + center_freq (float): filter's central frequency + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + center_freq = torch.as_tensor(center_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * center_freq / sample_rate + A = torch.exp(gain / 40.0 * math.log(10)) + alpha = torch.sin(w0) / 2 / Q + + b0 = 1 + alpha * A + b1 = -2 * torch.cos(w0) + b2 = 1 - alpha * A + a0 = 1 + alpha / A + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha / A + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def filtfilt( + waveform: Tensor, + a_coeffs: Tensor, + b_coeffs: Tensor, + clamp: bool = True, +) -> Tensor: + r"""Apply an IIR filter forward and backward to a waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``. + Must be same size as b_coeffs (pad with 0's as necessary). + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``. + Must be same size as a_coeffs (pad with 0's as necessary). + clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) + + Returns: + Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs`` + are 2D Tensors, or `(..., time)` otherwise. + """ + forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True) + backward_filtered = lfilter( + forward_filtered.flip(-1), + a_coeffs, + b_coeffs, + clamp=clamp, + batching=True, + ).flip(-1) + return backward_filtered + + +def flanger( + waveform: Tensor, + sample_rate: int, + delay: float = 0.0, + depth: float = 2.0, + regen: float = 0.0, + width: float = 71.0, + speed: float = 0.5, + phase: float = 25.0, + modulation: str = "sinusoidal", + interpolation: str = "linear", +) -> Tensor: + r"""Apply a flanger effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., channel, time)` . + Max 4 channels allowed + sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) + delay (float, optional): desired delay in milliseconds(ms) + Allowed range of values are 0 to 30 + depth (float, optional): desired delay depth in milliseconds(ms) + Allowed range of values are 0 to 10 + regen (float, optional): desired regen(feedback gain) in dB + Allowed range of values are -95 to 95 + width (float, optional): desired width(delay gain) in dB + Allowed range of values are 0 to 100 + speed (float, optional): modulation speed in Hz + Allowed range of values are 0.1 to 10 + phase (float, optional): percentage phase-shift for multi-channel + Allowed range of values are 0 to 100 + modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``) + interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation. + (Default: ``linear``) + + Returns: + Tensor: Waveform of dimension of `(..., channel, time)` + + Reference: + - http://sox.sourceforge.net/sox.html + + - Scott Lehman, `Effects Explained`_, + + .. _Effects Explained: + https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html + """ + + if modulation not in ("sinusoidal", "triangular"): + raise ValueError("Only 'sinusoidal' or 'triangular' modulation allowed") + + if interpolation not in ("linear", "quadratic"): + raise ValueError("Only 'linear' or 'quadratic' interpolation allowed") + + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + if actual_shape[-2] > 4: + raise ValueError("Max 4 channels allowed") + + # convert to 3D (batch, channels, time) + waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1]) + + # Scaling + feedback_gain = regen / 100 + delay_gain = width / 100 + channel_phase = phase / 100 + delay_min = delay / 1000 + delay_depth = depth / 1000 + + n_channels = waveform.shape[-2] + + if modulation == "sinusoidal": + wave_type = "SINE" + else: + wave_type = "TRIANGLE" + + # Balance output: + in_gain = 1.0 / (1 + delay_gain) + delay_gain = delay_gain / (1 + delay_gain) + + # Balance feedback loop: + delay_gain = delay_gain * (1 - abs(feedback_gain)) + + delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5) + delay_buf_length = delay_buf_length + 2 + + delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device) + delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device) + + lfo_length = int(sample_rate / speed) + + table_min = math.floor(delay_min * sample_rate + 0.5) + table_max = delay_buf_length - 2.0 + + lfo = _generate_wave_table( + wave_type=wave_type, + data_type="FLOAT", + table_size=lfo_length, + min=float(table_min), + max=float(table_max), + phase=3 * math.pi / 2, + device=device, + ) + + output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device) + + delay_buf_pos = 0 + lfo_pos = 0 + channel_idxs = torch.arange(0, n_channels, device=device) + + for i in range(waveform.shape[-1]): + + delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length + + cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64) + delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length] + frac_delay = torch.frac(delay_tensor) + delay_tensor = torch.floor(delay_tensor) + + int_delay = delay_tensor.to(torch.int64) + + temp = waveform[:, :, i] + + delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain + + delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + 
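+ # delayed_0 (above) and delayed_1 / delayed_2 (below) are consecutive integer-delay + # taps read from the circular delay buffer; they are blended with frac_delay to + # realize the fractional, LFO-modulated delay (linear or quadratic interpolation). +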
int_delay = int_delay + 1 + + delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + int_delay = int_delay + 1 + + if interpolation == "linear": + delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay + else: + delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + int_delay = int_delay + 1 + + delayed_2 = delayed_2 - delayed_0 + delayed_1 = delayed_1 - delayed_0 + a = delayed_2 * 0.5 - delayed_1 + b = delayed_1 * 2 - delayed_2 * 0.5 + + delayed = delayed_0 + (a * frac_delay + b) * frac_delay + + delay_last = delayed + output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain + + lfo_pos = (lfo_pos + 1) % lfo_length + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor: + r"""Apply amplification or attenuation to the whole waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``). + + Returns: + Tensor: the whole waveform amplified by gain_db. + """ + if gain_db == 0: + return waveform + + ratio = 10 ** (gain_db / 20) + + return waveform * ratio + + +def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: + r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + cutoff_freq (float or torch.Tensor): filter cutoff frequency + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * cutoff_freq / sample_rate + alpha = torch.sin(w0) / 2.0 / Q + + b0 = (1 + torch.cos(w0)) / 2 + b1 = -1 - torch.cos(w0) + b2 = b0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor): + n_order = a_coeffs_flipped.size(1) + a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2) + for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)): + windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order] + o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t() + padded_output_waveform[:, :, i_sample + n_order - 1] = o0 + + +try: + _lfilter_core_cpu_loop = torch.ops.torchaudio._lfilter_core_loop +except RuntimeError as err: + assert str(err) == "No such operator torchaudio::_lfilter_core_loop" + _lfilter_core_cpu_loop = _lfilter_core_generic_loop + + +def _lfilter_core( + waveform: Tensor, + a_coeffs: Tensor, + b_coeffs: Tensor, +) -> Tensor: + + assert a_coeffs.size() == b_coeffs.size() + assert len(waveform.size()) == 3 + assert waveform.device == a_coeffs.device + assert b_coeffs.device == a_coeffs.device + + n_batch, n_channel, n_sample = waveform.size() + n_order = a_coeffs.size(1) + assert n_order > 0 + + # Pad the input and create output + + padded_waveform 
= torch.nn.functional.pad(waveform, [n_order - 1, 0]) + padded_output_waveform = torch.zeros_like(padded_waveform) + + # Set up the coefficients matrix + # Flip coefficients' order + a_coeffs_flipped = a_coeffs.flip(1) + b_coeffs_flipped = b_coeffs.flip(1) + + # calculate windowed_input_signal in parallel using convolution + input_signal_windows = torch.nn.functional.conv1d(padded_waveform, b_coeffs_flipped.unsqueeze(1), groups=n_channel) + + input_signal_windows.div_(a_coeffs[:, :1]) + a_coeffs_flipped.div_(a_coeffs[:, :1]) + + if ( + input_signal_windows.device == torch.device("cpu") + and a_coeffs_flipped.device == torch.device("cpu") + and padded_output_waveform.device == torch.device("cpu") + ): + _lfilter_core_cpu_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) + else: + _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) + + output = padded_output_waveform[:, :, n_order - 1 :] + return output + + +try: + _lfilter = torch.ops.torchaudio._lfilter +except RuntimeError as err: + assert str(err) == "No such operator torchaudio::_lfilter" + _lfilter = _lfilter_core + + +def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor: + r"""Perform an IIR filter by evaluating difference equation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + To avoid numerical problems, small filter order is preferred. + Using double precision could also minimize numerical precision errors. + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. + Must be same size as b_coeffs (pad with 0's as necessary). + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. + Must be same size as a_coeffs (pad with 0's as necessary). + clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) + batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at + least 2D, and the size of second axis from last should equals to ``num_filters``. + The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :], + a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``) + + Returns: + Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs`` + are 2D Tensors, or `(..., time)` otherwise. 
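+ + Example (minimal sketch; the coefficients below form an arbitrary stable first-order filter): + >>> waveform = torch.rand(1, 1000) * 2 - 1 # random signal normalized to [-1, 1] + >>> b_coeffs = torch.tensor([0.5, 0.5]) # numerator [b0, b1] + >>> a_coeffs = torch.tensor([1.0, -0.2]) # denominator [a0, a1] + >>> filtered = lfilter(waveform, a_coeffs, b_coeffs) + >>> filtered.shape + torch.Size([1, 1000])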
+ """ + assert a_coeffs.size() == b_coeffs.size() + assert a_coeffs.ndim <= 2 + + if a_coeffs.ndim > 1: + if batching: + assert waveform.ndim > 1 + assert waveform.shape[-2] == a_coeffs.shape[0] + else: + waveform = torch.stack([waveform] * a_coeffs.shape[0], -2) + else: + a_coeffs = a_coeffs.unsqueeze(0) + b_coeffs = b_coeffs.unsqueeze(0) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1]) + output = _lfilter(waveform, a_coeffs, b_coeffs) + + if clamp: + output = torch.clamp(output, min=-1.0, max=1.0) + + # unpack batch + output = output.reshape(shape[:-1] + output.shape[-1:]) + + return output + + +def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: + r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + cutoff_freq (float or torch.Tensor): filter cutoff frequency + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * cutoff_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + b0 = (1 - torch.cos(w0)) / 2 + b1 = 1 - torch.cos(w0) + b2 = b0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _overdrive_core_loop_generic( + waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor +): + for i in range(waveform.shape[-1]): + last_out = temp[:, i] - last_in + 0.995 * last_out + last_in = temp[:, i] + output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75 + + +try: + _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop +except RuntimeError as err: + assert str(err) == "No such operator torchaudio::_overdrive_core_loop" + _overdrive_core_loop_cpu = _overdrive_core_loop_generic + + +def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor: + r"""Apply a overdrive effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This effect applies a non linear distortion to the audio signal. 
+ + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + gain (float, optional): desired gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 100 + colour (float, optional): controls the amount of even harmonic content in the over-driven output + Allowed range of values are 0 to 100 + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + # convert to 2D (..,time) + waveform = waveform.view(-1, actual_shape[-1]) + + gain = _dB2Linear(gain) + colour = colour / 200 + last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device) + last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device) + + temp = waveform * gain + colour + + mask1 = temp < -1 + temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device) + # Wrapping the constant with Tensor is required for Torchscript + + mask2 = temp > 1 + temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device) + + mask3 = ~mask1 & ~mask2 + temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3) + + output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device) + + # Uses CPU optimized loop function if available for CPU device + if device == torch.device("cpu"): + _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform) + else: + _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform) + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def phaser( + waveform: Tensor, + sample_rate: int, + gain_in: float = 0.4, + gain_out: float = 0.74, + delay_ms: float = 3.0, + decay: float = 0.4, + mod_speed: float = 0.5, + sinusoidal: bool = True, +) -> Tensor: + r"""Apply a phasing effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain_in (float, optional): desired input gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 1 + gain_out (float, optional): desired output gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 1e9 + delay_ms (float, optional): desired delay in milliseconds + Allowed range of values are 0 to 5.0 + decay (float, optional): desired decay relative to gain-in + Allowed range of values are 0 to 0.99 + mod_speed (float, optional): modulation speed in Hz + Allowed range of values are 0.1 to 2 + sinusoidal (bool, optional): If ``True``, uses sinusoidal modulation (preferable for multiple instruments) + If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect) + (Default: ``True``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - Scott Lehman, `Effects Explained`_. + + .. 
_Effects Explained: + https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html + """ + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + # convert to 2D (channels,time) + waveform = waveform.view(-1, actual_shape[-1]) + + delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5) + delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device) + + mod_buf_len = int(sample_rate / mod_speed + 0.5) + + if sinusoidal: + wave_type = "SINE" + else: + wave_type = "TRIANGLE" + + mod_buf = _generate_wave_table( + wave_type=wave_type, + data_type="INT", + table_size=mod_buf_len, + min=1.0, + max=float(delay_buf_len), + phase=math.pi / 2, + device=device, + ) + + delay_pos = 0 + mod_pos = 0 + + output_waveform_pre_gain_list = [] + waveform = waveform * gain_in + delay_buf = delay_buf * decay + waveform_list = [waveform[:, i] for i in range(waveform.size(1))] + delay_buf_list = [delay_buf[:, i] for i in range(delay_buf.size(1))] + mod_buf_list = [mod_buf[i] for i in range(mod_buf.size(0))] + + for i in range(waveform.shape[-1]): + idx = int((delay_pos + mod_buf_list[mod_pos]) % delay_buf_len) + mod_pos = (mod_pos + 1) % mod_buf_len + delay_pos = (delay_pos + 1) % delay_buf_len + temp = (waveform_list[i]) + (delay_buf_list[idx]) + delay_buf_list[delay_pos] = temp * decay + output_waveform_pre_gain_list.append(temp) + + output_waveform = torch.stack(output_waveform_pre_gain_list, dim=1).to(dtype=dtype, device=device) + output_waveform.mul_(gain_out) + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor: + r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz). 
+ Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000`` + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + + if sample_rate == 44100: + zeros = [-0.2014898, 0.9233820] + poles = [0.7083149, 0.9924091] + + elif sample_rate == 48000: + zeros = [-0.1766069, 0.9321590] + poles = [0.7396325, 0.9931330] + + elif sample_rate == 88200: + zeros = [-0.1168735, 0.9648312] + poles = [0.8590646, 0.9964002] + + elif sample_rate == 96000: + zeros = [-0.1141486, 0.9676817] + poles = [0.8699137, 0.9966946] + + else: + raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k") + + # polynomial coefficients with roots zeros[0] and zeros[1] + b0 = 1.0 + b1 = -(zeros[0] + zeros[1]) + b2 = zeros[0] * zeros[1] + + # polynomial coefficients with roots poles[0] and poles[1] + a0 = 1.0 + a1 = -(poles[0] + poles[1]) + a2 = poles[0] * poles[1] + + # Normalize to 0dB at 1kHz + y = 2 * math.pi * 1000 / sample_rate + b_re = b0 + b1 * math.cos(-y) + b2 * math.cos(-2 * y) + a_re = a0 + a1 * math.cos(-y) + a2 * math.cos(-2 * y) + b_im = b1 * math.sin(-y) + b2 * math.sin(-2 * y) + a_im = a1 * math.sin(-y) + a2 * math.sin(-2 * y) + g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2)) + + b0 *= g + b1 *= g + b2 *= g + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def treble_biquad( + waveform: Tensor, + sample_rate: int, + gain: float, + central_freq: float = 3000, + Q: float = 0.707, +) -> Tensor: + r"""Design a treble tone-control effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB. + central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). 
+ + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + A = torch.exp(gain / 40 * math.log(10)) + + temp1 = 2 * torch.sqrt(A) * alpha + temp2 = (A - 1) * torch.cos(w0) + temp3 = (A + 1) * torch.cos(w0) + + b0 = A * ((A + 1) + temp2 + temp1) + b1 = -2 * A * ((A - 1) + temp3) + b2 = A * ((A + 1) + temp2 - temp1) + a0 = (A + 1) - temp2 + temp1 + a1 = 2 * ((A - 1) - temp3) + a2 = (A + 1) - temp2 - temp1 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _measure( + measure_len_ws: int, + samples: Tensor, + spectrum: Tensor, + noise_spectrum: Tensor, + spectrum_window: Tensor, + spectrum_start: int, + spectrum_end: int, + cepstrum_window: Tensor, + cepstrum_start: int, + cepstrum_end: int, + noise_reduction_amount: float, + measure_smooth_time_mult: float, + noise_up_time_mult: float, + noise_down_time_mult: float, + index_ns: int, + boot_count: int, +) -> float: + + assert spectrum.size()[-1] == noise_spectrum.size()[-1] + + samplesLen_ns = samples.size()[-1] + dft_len_ws = spectrum.size()[-1] + + dftBuf = torch.zeros(dft_len_ws) + + _index_ns = torch.tensor([index_ns] + [(index_ns + i) % samplesLen_ns for i in range(1, measure_len_ws)]) + dftBuf[:measure_len_ws] = samples[_index_ns] * spectrum_window[:measure_len_ws] + + # memset(c->dftBuf + i, 0, (p->dft_len_ws - i) * sizeof(*c->dftBuf)); + dftBuf[measure_len_ws:dft_len_ws].zero_() + + # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf); + _dftBuf = torch.fft.rfft(dftBuf) + + # memset(c->dftBuf, 0, p->spectrum_start * sizeof(*c->dftBuf)); + _dftBuf[:spectrum_start].zero_() + + mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult + + _d = _dftBuf[spectrum_start:spectrum_end].abs() + spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult)) + _d = spectrum[spectrum_start:spectrum_end] ** 2 + + _zeros = torch.zeros(spectrum_end - spectrum_start) + _mult = ( + _zeros + if boot_count >= 0 + else torch.where( + _d > noise_spectrum[spectrum_start:spectrum_end], + torch.tensor(noise_up_time_mult), # if + torch.tensor(noise_down_time_mult), # else + ) + ) + + noise_spectrum[spectrum_start:spectrum_end].mul_(_mult).add_(_d * (1 - _mult)) + _d = torch.sqrt( + torch.max( + _zeros, + _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end], + ) + ) + + _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1) + _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window + _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_() + + # lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf); + _cepstrum_Buf = torch.fft.rfft(_cepstrum_Buf) + + result: float = float(torch.sum(_cepstrum_Buf[cepstrum_start:cepstrum_end].abs().pow(2))) + result = math.log(result / (cepstrum_end - cepstrum_start)) if result > 0 else -math.inf + return max(0, 21 + result) + + +def vad( + waveform: Tensor, + sample_rate: int, + trigger_level: float = 7.0, + trigger_time: float = 0.25, + search_time: float = 1.0, + allowed_gap: float = 0.25, + pre_trigger_time: float = 0.0, + # Fine-tuning parameters + boot_time: float = 0.35, + noise_up_time: float = 0.1, + 
noise_down_time: float = 0.01, + noise_reduction_amount: float = 1.35, + measure_freq: float = 20.0, + measure_duration: Optional[float] = None, + measure_smooth_time: float = 0.4, + hp_filter_freq: float = 50.0, + lp_filter_freq: float = 6000.0, + hp_lifter_freq: float = 150.0, + lp_lifter_freq: float = 2000.0, +) -> Tensor: + r"""Voice Activity Detector. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Attempts to trim silence and quiet background sounds from the ends of recordings of speech. + The algorithm currently uses a simple cepstral power measurement to detect voice, + so may be fooled by other things, especially music. + + The effect can trim only from the front of the audio, + so in order to trim from the back, the reverse effect must also be used. + + Args: + waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)` + Tensor of shape `(channels, time)` is treated as a multi-channel recording + of the same event and the resulting output will be trimmed to the earliest + voice activity in any channel. + sample_rate (int): Sample rate of audio signal. + trigger_level (float, optional): The measurement level used to trigger activity detection. + This may need to be changed depending on the noise level, signal level, + and other characteristics of the input audio. (Default: 7.0) + trigger_time (float, optional): The time constant (in seconds) + used to help ignore short bursts of sound. (Default: 0.25) + search_time (float, optional): The amount of audio (in seconds) + to search for quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 1.0) + allowed_gap (float, optional): The allowed gap (in seconds) between + quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 0.25) + pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve + before the trigger point and any found quieter/shorter bursts. (Default: 0.0) + boot_time (float, optional): The algorithm (internally) uses adaptive noise + estimation/reduction in order to detect the start of the wanted audio. + This option sets the time for the initial noise estimate. (Default: 0.35) + noise_up_time (float, optional): Time constant used by the adaptive noise estimator + for when the noise level is increasing. (Default: 0.1) + noise_down_time (float, optional): Time constant used by the adaptive noise estimator + for when the noise level is decreasing. (Default: 0.01) + noise_reduction_amount (float, optional): Amount of noise reduction to use in + the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35) + measure_freq (float, optional): Frequency of the algorithm’s + processing/measurements. (Default: 20.0) + measure_duration (float, optional): Measurement duration. + (Default: Twice the measurement period; i.e. with overlap.) + measure_smooth_time (float, optional): Time constant used to smooth + spectral measurements. (Default: 0.4) + hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied + at the input to the detector algorithm. (Default: 50.0) + lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied + at the input to the detector algorithm. (Default: 6000.0) + hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used + in the detector algorithm. (Default: 150.0) + lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used + in the detector algorithm. 
(Default: 2000.0) + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`. + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + if waveform.ndim > 2: + warnings.warn( + "Expected input tensor dimension of 1 for single channel" + f" or 2 for multi-channel. Got {waveform.ndim} instead. " + "Batch semantics is not supported. " + "Please refer to https://github.com/pytorch/audio/issues/1348" + " and https://github.com/pytorch/audio/issues/1468." + ) + + measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration + + measure_len_ws = int(sample_rate * measure_duration + 0.5) + measure_len_ns = measure_len_ws + # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1); + dft_len_ws = 16 + while dft_len_ws < measure_len_ws: + dft_len_ws *= 2 + + measure_period_ns = int(sample_rate / measure_freq + 0.5) + measures_len = math.ceil(search_time * measure_freq) + search_pre_trigger_len_ns = measures_len * measure_period_ns + gap_len = int(allowed_gap * measure_freq + 0.5) + + fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5) + samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns + + spectrum_window = torch.zeros(measure_len_ws) + for i in range(measure_len_ws): + # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32) + spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws)) + # lsx_apply_hann(spectrum_window, (int)measure_len_ws); + spectrum_window *= torch.hann_window(measure_len_ws, dtype=torch.float) + + spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5) + spectrum_start: int = max(spectrum_start, 1) + spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5) + spectrum_end: int = min(spectrum_end, dft_len_ws // 2) + + cepstrum_window = torch.zeros(spectrum_end - spectrum_start) + for i in range(spectrum_end - spectrum_start): + cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start) + # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start)); + cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, dtype=torch.float) + + cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq) + cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq) + cepstrum_end = min(cepstrum_end, dft_len_ws // 4) + + assert cepstrum_end > cepstrum_start + + noise_up_time_mult = math.exp(-1.0 / (noise_up_time * measure_freq)) + noise_down_time_mult = math.exp(-1.0 / (noise_down_time * measure_freq)) + measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq)) + trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq)) + + boot_count_max = int(boot_time * measure_freq - 0.5) + measure_timer_ns = measure_len_ns + boot_count = measures_index = flushedLen_ns = samplesIndex_ns = 0 + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + n_channels, ilen = waveform.size() + + mean_meas = torch.zeros(n_channels) + samples = torch.zeros(n_channels, samplesLen_ns) + spectrum = torch.zeros(n_channels, dft_len_ws) + noise_spectrum = torch.zeros(n_channels, dft_len_ws) + measures = torch.zeros(n_channels, measures_len) + + has_triggered: bool = False + num_measures_to_flush: int = 0 + pos: int = 0 + + while pos < ilen and not has_triggered: + measure_timer_ns -= 1 + for i in range(n_channels): + samples[i, samplesIndex_ns] = waveform[i, pos] + # if (!p->measure_timer_ns) { + if measure_timer_ns == 0: + index_ns: int = (samplesIndex_ns + 
samplesLen_ns - measure_len_ns) % samplesLen_ns + meas: float = _measure( + measure_len_ws=measure_len_ws, + samples=samples[i], + spectrum=spectrum[i], + noise_spectrum=noise_spectrum[i], + spectrum_window=spectrum_window, + spectrum_start=spectrum_start, + spectrum_end=spectrum_end, + cepstrum_window=cepstrum_window, + cepstrum_start=cepstrum_start, + cepstrum_end=cepstrum_end, + noise_reduction_amount=noise_reduction_amount, + measure_smooth_time_mult=measure_smooth_time_mult, + noise_up_time_mult=noise_up_time_mult, + noise_down_time_mult=noise_down_time_mult, + index_ns=index_ns, + boot_count=boot_count, + ) + measures[i, measures_index] = meas + mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult) + + has_triggered = has_triggered or (mean_meas[i] >= trigger_level) + if has_triggered: + n: int = measures_len + k: int = measures_index + jTrigger: int = n + jZero: int = n + j: int = 0 + + for j in range(n): + if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len): + jZero = jTrigger = j + elif (measures[i, k] == 0) and (jTrigger >= jZero): + jZero = j + k = (k + n - 1) % n + j = min(j, jZero) + # num_measures_to_flush = range_limit(j, num_measures_to_flush, n); + num_measures_to_flush = min(max(num_measures_to_flush, j), n) + # end if has_triggered + # end if (measure_timer_ns == 0): + # end for + samplesIndex_ns += 1 + pos += 1 + # end while + if samplesIndex_ns == samplesLen_ns: + samplesIndex_ns = 0 + if measure_timer_ns == 0: + measure_timer_ns = measure_period_ns + measures_index += 1 + measures_index = measures_index % measures_len + if boot_count >= 0: + boot_count = -1 if boot_count == boot_count_max else boot_count + 1 + + if has_triggered: + flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns + samplesIndex_ns = (samplesIndex_ns + flushedLen_ns) % samplesLen_ns + + res = waveform[:, pos - samplesLen_ns + flushedLen_ns :] + # unpack batch + return res.view(shape[:-1] + res.shape[-1:]) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/functional.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..665bf8c1f4f87b154cb96552dec404206352e75a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/functional.py @@ -0,0 +1,2162 @@ +# -*- coding: utf-8 -*- + +import io +import math +import warnings +from collections.abc import Sequence +from typing import Optional, Tuple, Union, List + +import torch +import torchaudio +from torch import Tensor +from torchaudio._internal import module_utils as _mod_utils + +__all__ = [ + "spectrogram", + "inverse_spectrogram", + "griffinlim", + "amplitude_to_DB", + "DB_to_amplitude", + "compute_deltas", + "compute_kaldi_pitch", + "melscale_fbanks", + "linear_fbanks", + "create_dct", + "compute_deltas", + "detect_pitch_frequency", + "DB_to_amplitude", + "mu_law_encoding", + "mu_law_decoding", + "phase_vocoder", + "mask_along_axis", + "mask_along_axis_iid", + "sliding_window_cmn", + "spectral_centroid", + "apply_codec", + "resample", + "edit_distance", + "pitch_shift", + "rnnt_loss", + "psd", + "mvdr_weights_souden", + "mvdr_weights_rtf", + "rtf_evd", + "rtf_power", + "apply_beamforming", +] + + +def spectrogram( + waveform: Tensor, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: Optional[float], + 
normalized: bool, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, +) -> Tensor: + r"""Create a spectrogram or a batch of spectrograms from a raw audio signal. + The spectrogram can be either magnitude-only or complex. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + power (float or None): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. + If None, then the complex spectrum is returned instead. + normalized (bool): Whether to normalize by magnitude after stft + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. Default: ``True`` + return_complex (bool, optional): + Deprecated and not used. + + Returns: + Tensor: Dimension `(..., freq, time)`, freq is + ``n_fft // 2 + 1`` and ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.functional.spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + if pad > 0: + # TODO add "with torch.no_grad():" back when JIT supports it + waveform = torch.nn.functional.pad(waveform, (pad, pad), "constant") + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=False, + onesided=onesided, + return_complex=True, + ) + + # unpack batch + spec_f = spec_f.reshape(shape[:-1] + spec_f.shape[-2:]) + + if normalized: + spec_f /= window.pow(2.0).sum().sqrt() + if power is not None: + if power == 1.0: + return spec_f.abs() + return spec_f.abs().pow(power) + return spec_f + + +def inverse_spectrogram( + spectrogram: Tensor, + length: Optional[int], + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + normalized: bool, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, +) -> Tensor: + r"""Create an inverse spectrogram or a batch of inverse spectrograms from the provided + complex-valued spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time). + length (int or None): The output length of the waveform. + pad (int): Two sided padding of signal. It is only effective when ``length`` is provided. 
+ window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + normalized (bool): Whether the stft output was normalized by magnitude + center (bool, optional): whether the waveform was padded on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. This parameter is provided for compatibility with the + spectrogram function and is not used. Default: ``"reflect"`` + onesided (bool, optional): controls whether spectrogram was done in onesided mode. + Default: ``True`` + + Returns: + Tensor: Dimension `(..., time)`. Least squares estimation of the original signal. + """ + + if not spectrogram.is_complex(): + raise ValueError("Expected `spectrogram` to be complex dtype.") + + if normalized: + spectrogram = spectrogram * window.pow(2.0).sum().sqrt() + + # pack batch + shape = spectrogram.size() + spectrogram = spectrogram.reshape(-1, shape[-2], shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + waveform = torch.istft( + input=spectrogram, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + normalized=False, + onesided=onesided, + length=length + 2 * pad if length is not None else None, + return_complex=False, + ) + + if length is not None and pad > 0: + # remove padding from front and back + waveform = waveform[:, pad:-pad] + + # unpack batch + waveform = waveform.reshape(shape[:-2] + waveform.shape[-1:]) + + return waveform + + +def _get_complex_dtype(real_dtype: torch.dtype): + if real_dtype == torch.double: + return torch.cdouble + if real_dtype == torch.float: + return torch.cfloat + if real_dtype == torch.half: + return torch.complex32 + raise ValueError(f"Unexpected dtype {real_dtype}") + + +def griffinlim( + specgram: Tensor, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: float, + n_iter: int, + momentum: float, + length: Optional[int], + rand_init: bool, +) -> Tensor: + r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Implementation ported from + *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`] + and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`]. + + Args: + specgram (Tensor): A magnitude-only STFT spectrogram of dimension `(..., freq, frames)` + where freq is ``n_fft // 2 + 1``. + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins + hop_length (int): Length of hop between STFT windows. ( + Default: ``win_length // 2``) + win_length (int): Window size. (Default: ``n_fft``) + power (float): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. + n_iter (int): Number of iteration for phase recovery process. + momentum (float): The momentum parameter for fast Griffin-Lim. + Setting this to 0 recovers the original Griffin-Lim method. + Values near 1 can lead to faster convergence, but above 1 may not converge. + length (int or None): Array length of the expected output. 
+ rand_init (bool): Initializes phase randomly if True, to zero otherwise. + + Returns: + Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given. + """ + assert momentum < 1, "momentum={} > 1 can be unstable".format(momentum) + assert momentum >= 0, "momentum={} < 0".format(momentum) + + # pack batch + shape = specgram.size() + specgram = specgram.reshape([-1] + list(shape[-2:])) + + specgram = specgram.pow(1 / power) + + # initialize the phase + if rand_init: + angles = torch.rand(specgram.size(), dtype=_get_complex_dtype(specgram.dtype), device=specgram.device) + else: + angles = torch.full(specgram.size(), 1, dtype=_get_complex_dtype(specgram.dtype), device=specgram.device) + + # And initialize the previous iterate to 0 + tprev = torch.tensor(0.0, dtype=specgram.dtype, device=specgram.device) + for _ in range(n_iter): + # Invert with our current estimate of the phases + inverse = torch.istft( + specgram * angles, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=length + ) + + # Rebuild the spectrogram + rebuilt = torch.stft( + input=inverse, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + + # Update our phase estimates + angles = rebuilt + if momentum: + angles = angles - tprev.mul_(momentum / (1 + momentum)) + angles = angles.div(angles.abs().add(1e-16)) + + # Store the previous iterate + tprev = rebuilt + + # Return the final phase estimates + waveform = torch.istft( + specgram * angles, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=length + ) + + # unpack batch + waveform = waveform.reshape(shape[:-2] + waveform.shape[-1:]) + + return waveform + + +def amplitude_to_DB( + x: Tensor, multiplier: float, amin: float, db_multiplier: float, top_db: Optional[float] = None +) -> Tensor: + r"""Turn a spectrogram from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The output of each tensor in a batch depends on the maximum value of that tensor, + and so may return different values for an audio clip split into snippets vs. a full clip. + + Args: + + x (Tensor): Input spectrogram(s) before being converted to decibel scale. Input should take + the form `(..., freq, time)`. Batched inputs should include a channel dimension and + have the form `(batch, channel, freq, time)`. + multiplier (float): Use 10. for power and 20. for amplitude + amin (float): Number to clamp ``x`` + db_multiplier (float): Log10(max(reference value and amin)) + top_db (float or None, optional): Minimum negative cut-off in decibels. A reasonable number + is 80. (Default: ``None``) + + Returns: + Tensor: Output tensor in decibel scale + """ + x_db = multiplier * torch.log10(torch.clamp(x, min=amin)) + x_db -= multiplier * db_multiplier + + if top_db is not None: + # Expand batch + shape = x_db.size() + packed_channels = shape[-3] if x_db.dim() > 2 else 1 + x_db = x_db.reshape(-1, packed_channels, shape[-2], shape[-1]) + + x_db = torch.max(x_db, (x_db.amax(dim=(-3, -2, -1)) - top_db).view(-1, 1, 1, 1)) + + # Repack batch + x_db = x_db.reshape(shape) + + return x_db + + +def DB_to_amplitude(x: Tensor, ref: float, power: float) -> Tensor: + r"""Turn a tensor from the decibel scale to the power/amplitude scale. + + .. devices:: CPU CUDA + + .. 
properties:: TorchScript + + Args: + x (Tensor): Input tensor before being converted to power/amplitude scale. + ref (float): Reference which the output will be scaled by. + power (float): If power equals 1, will compute DB to power. If 0.5, will compute DB to amplitude. + + Returns: + Tensor: Output tensor in power/amplitude scale. + """ + return ref * torch.pow(torch.pow(10.0, 0.1 * x), power) + + +def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float: + r"""Convert Hz to Mels. + + Args: + freqs (float): Frequencies in Hz + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + mels (float): Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz) / logstep + + return mels + + +def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor: + """Convert mel bin numbers to frequencies. + + Args: + mels (Tensor): Mel frequencies + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + freqs (Tensor): Mels converted in Hz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + +def _create_triangular_filterbank( + all_freqs: Tensor, + f_pts: Tensor, +) -> Tensor: + """Create a triangular filter bank. + + Args: + all_freqs (Tensor): STFT freq points of size (`n_freqs`). + f_pts (Tensor): Filter mid points of size (`n_filter`). + + Returns: + fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`). + """ + # Adopted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = torch.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = torch.max(zero, torch.min(down_slopes, up_slopes)) + + return fb + + +def melscale_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> Tensor: + r"""Create a frequency bin conversion matrix. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. 
image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A * melscale_fbanks(A.size(-1), ...)``. + + """ + + if norm is not None and norm != "slaney": + raise ValueError("norm must be one of None or 'slaney'") + + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = _hz_to_mel(f_min, mel_scale=mel_scale) + m_max = _hz_to_mel(f_max, mel_scale=mel_scale) + + m_pts = torch.linspace(m_min, m_max, n_mels + 2) + f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + fb *= enorm.unsqueeze(0) + + if (fb.max(dim=0).values == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return fb + + +def linear_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_filter: int, + sample_rate: int, +) -> Tensor: + r"""Creates a linear triangular filterbank. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/lin_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_filter (int): Number of (linear) triangular filter + sample_rate (int): Sample rate of the audio waveform + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_filter``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A * linear_fbanks(A.size(-1), ...)``. + """ + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # filter mid-points + f_pts = torch.linspace(f_min, f_max, n_filter + 2) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + return fb + + +def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: + r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``), + normalized depending on norm. + + .. devices:: CPU + + .. 
properties:: TorchScript + + Args: + n_mfcc (int): Number of mfc coefficients to retain + n_mels (int): Number of mel filterbanks + norm (str or None): Norm to use (either 'ortho' or None) + + Returns: + Tensor: The transformation matrix, to be right-multiplied to + row-wise data of size (``n_mels``, ``n_mfcc``). + """ + # http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II + n = torch.arange(float(n_mels)) + k = torch.arange(float(n_mfcc)).unsqueeze(1) + dct = torch.cos(math.pi / float(n_mels) * (n + 0.5) * k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 + else: + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.t() + + +def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor: + r"""Encode signal based on mu-law companding. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This algorithm expects the signal has been scaled to between -1 and 1 and + returns a signal encoded with values from 0 to quantization_channels - 1. + + Args: + x (Tensor): Input tensor + quantization_channels (int): Number of channels + + Returns: + Tensor: Input after mu-law encoding + """ + mu = quantization_channels - 1.0 + if not x.is_floating_point(): + warnings.warn( + "The input Tensor must be of floating type. \ + This will be an error in the v0.12 release." + ) + x = x.to(torch.float) + mu = torch.tensor(mu, dtype=x.dtype) + x_mu = torch.sign(x) * torch.log1p(mu * torch.abs(x)) / torch.log1p(mu) + x_mu = ((x_mu + 1) / 2 * mu + 0.5).to(torch.int64) + return x_mu + + +def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor: + r"""Decode mu-law encoded signal. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This expects an input with values between 0 and quantization_channels - 1 + and returns a signal scaled between -1 and 1. + + Args: + x_mu (Tensor): Input tensor + quantization_channels (int): Number of channels + + Returns: + Tensor: Input after mu-law decoding + """ + mu = quantization_channels - 1.0 + if not x_mu.is_floating_point(): + x_mu = x_mu.to(torch.float) + mu = torch.tensor(mu, dtype=x_mu.dtype) + x = ((x_mu) / mu) * 2 - 1.0 + x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.0) / mu + return x + + +def phase_vocoder(complex_specgrams: Tensor, rate: float, phase_advance: Tensor) -> Tensor: + r"""Given a STFT tensor, speed up in time without modifying pitch by a factor of ``rate``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + complex_specgrams (Tensor): + A tensor of dimension `(..., freq, num_frame)` with complex dtype. + rate (float): Speed-up factor + phase_advance (Tensor): Expected phase advance in each bin. Dimension of `(freq, 1)` + + Returns: + Tensor: + Stretched spectrogram. The resulting tensor is of the same dtype as the input + spectrogram, but the number of frames is changed to ``ceil(num_frame / rate)``. 
+ + Example + >>> freq, hop_length = 1025, 512 + >>> # (channel, freq, time) + >>> complex_specgrams = torch.randn(2, freq, 300, dtype=torch.cfloat) + >>> rate = 1.3 # Speed up by 30% + >>> phase_advance = torch.linspace( + >>> 0, math.pi * hop_length, freq)[..., None] + >>> x = phase_vocoder(complex_specgrams, rate, phase_advance) + >>> x.shape # with 231 == ceil(300 / 1.3) + torch.Size([2, 1025, 231]) + """ + if rate == 1.0: + return complex_specgrams + + # pack batch + shape = complex_specgrams.size() + complex_specgrams = complex_specgrams.reshape([-1] + list(shape[-2:])) + + # Figures out the corresponding real dtype, i.e. complex128 -> float64, complex64 -> float32 + # Note torch.real is a view so it does not incur any memory copy. + real_dtype = torch.real(complex_specgrams).dtype + time_steps = torch.arange(0, complex_specgrams.size(-1), rate, device=complex_specgrams.device, dtype=real_dtype) + + alphas = time_steps % 1.0 + phase_0 = complex_specgrams[..., :1].angle() + + # Time Padding + complex_specgrams = torch.nn.functional.pad(complex_specgrams, [0, 2]) + + # (new_bins, freq, 2) + complex_specgrams_0 = complex_specgrams.index_select(-1, time_steps.long()) + complex_specgrams_1 = complex_specgrams.index_select(-1, (time_steps + 1).long()) + + angle_0 = complex_specgrams_0.angle() + angle_1 = complex_specgrams_1.angle() + + norm_0 = complex_specgrams_0.abs() + norm_1 = complex_specgrams_1.abs() + + phase = angle_1 - angle_0 - phase_advance + phase = phase - 2 * math.pi * torch.round(phase / (2 * math.pi)) + + # Compute Phase Accum + phase = phase + phase_advance + phase = torch.cat([phase_0, phase[..., :-1]], dim=-1) + phase_acc = torch.cumsum(phase, -1) + + mag = alphas * norm_1 + (1 - alphas) * norm_0 + + complex_specgrams_stretch = torch.polar(mag, phase_acc) + + # unpack batch + complex_specgrams_stretch = complex_specgrams_stretch.reshape(shape[:-2] + complex_specgrams_stretch.shape[1:]) + return complex_specgrams_stretch + + +def _get_mask_param(mask_param: int, p: float, axis_length: int) -> int: + if p == 1.0: + return mask_param + else: + return min(mask_param, int(axis_length * p)) + + +def mask_along_axis_iid( + specgrams: Tensor, + mask_param: int, + mask_value: float, + axis: int, + p: float = 1.0, +) -> Tensor: + r"""Apply a mask along ``axis``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Mask will be applied from indices ``[v_0, v_0 + v)``, + where ``v`` is sampled from ``uniform(0, max_v)`` and + ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, + with ``max_v = mask_param`` when ``p = 1.0`` and + ``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` otherwise. + + Args: + specgrams (Tensor): Real spectrograms `(batch, channel, freq, time)` + mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param] + mask_value (float): Value to assign to the masked columns + axis (int): Axis to apply masking on (2 -> frequency, 3 -> time) + p (float, optional): maximum proportion of columns that can be masked. 
(Default: 1.0) + + Returns: + Tensor: Masked spectrograms of dimensions `(batch, channel, freq, time)` + """ + + if axis not in [2, 3]: + raise ValueError("Only Frequency and Time masking are supported") + + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + + mask_param = _get_mask_param(mask_param, p, specgrams.shape[axis]) + if mask_param < 1: + return specgrams + + device = specgrams.device + dtype = specgrams.dtype + + value = torch.rand(specgrams.shape[:2], device=device, dtype=dtype) * mask_param + min_value = torch.rand(specgrams.shape[:2], device=device, dtype=dtype) * (specgrams.size(axis) - value) + + # Create broadcastable mask + mask_start = min_value.long()[..., None, None] + mask_end = (min_value.long() + value.long())[..., None, None] + mask = torch.arange(0, specgrams.size(axis), device=device, dtype=dtype) + + # Per batch example masking + specgrams = specgrams.transpose(axis, -1) + specgrams = specgrams.masked_fill((mask >= mask_start) & (mask < mask_end), mask_value) + specgrams = specgrams.transpose(axis, -1) + + return specgrams + + +def mask_along_axis( + specgram: Tensor, + mask_param: int, + mask_value: float, + axis: int, + p: float = 1.0, +) -> Tensor: + r"""Apply a mask along ``axis``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Mask will be applied from indices ``[v_0, v_0 + v)``, + where ``v`` is sampled from ``uniform(0, max_v)`` and + ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with + ``max_v = mask_param`` when ``p = 1.0`` and + ``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` + otherwise. + All examples will have the same mask interval. + + Args: + specgram (Tensor): Real spectrogram `(channel, freq, time)` + mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param] + mask_value (float): Value to assign to the masked columns + axis (int): Axis to apply masking on (1 -> frequency, 2 -> time) + p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0) + + Returns: + Tensor: Masked spectrogram of dimensions `(channel, freq, time)` + """ + if axis not in [1, 2]: + raise ValueError("Only Frequency and Time masking are supported") + + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + + mask_param = _get_mask_param(mask_param, p, specgram.shape[axis]) + if mask_param < 1: + return specgram + + # pack batch + shape = specgram.size() + specgram = specgram.reshape([-1] + list(shape[-2:])) + value = torch.rand(1) * mask_param + min_value = torch.rand(1) * (specgram.size(axis) - value) + + mask_start = (min_value.long()).squeeze() + mask_end = (min_value.long() + value.long()).squeeze() + mask = torch.arange(0, specgram.shape[axis], device=specgram.device, dtype=specgram.dtype) + mask = (mask >= mask_start) & (mask < mask_end) + if axis == 1: + mask = mask.unsqueeze(-1) + + assert mask_end - mask_start < mask_param + + specgram = specgram.masked_fill(mask, mask_value) + + # unpack batch + specgram = specgram.reshape(shape[:-2] + specgram.shape[-2:]) + + return specgram + + +def compute_deltas(specgram: Tensor, win_length: int = 5, mode: str = "replicate") -> Tensor: + r"""Compute delta coefficients of a tensor, usually a spectrogram: + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + .. 
math:: + d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2} + + where :math:`d_t` is the deltas at time :math:`t`, + :math:`c_t` is the spectrogram coeffcients at time :math:`t`, + :math:`N` is ``(win_length-1)//2``. + + Args: + specgram (Tensor): Tensor of audio of dimension `(..., freq, time)` + win_length (int, optional): The window length used for computing delta (Default: ``5``) + mode (str, optional): Mode parameter passed to padding (Default: ``"replicate"``) + + Returns: + Tensor: Tensor of deltas of dimension `(..., freq, time)` + + Example + >>> specgram = torch.randn(1, 40, 1000) + >>> delta = compute_deltas(specgram) + >>> delta2 = compute_deltas(delta) + """ + device = specgram.device + dtype = specgram.dtype + + # pack batch + shape = specgram.size() + specgram = specgram.reshape(1, -1, shape[-1]) + + assert win_length >= 3 + + n = (win_length - 1) // 2 + + # twice sum of integer squared + denom = n * (n + 1) * (2 * n + 1) / 3 + + specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode) + + kernel = torch.arange(-n, n + 1, 1, device=device, dtype=dtype).repeat(specgram.shape[1], 1, 1) + + output = torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom + + # unpack batch + output = output.reshape(shape) + + return output + + +def _compute_nccf(waveform: Tensor, sample_rate: int, frame_time: float, freq_low: int) -> Tensor: + r""" + Compute Normalized Cross-Correlation Function (NCCF). + + .. math:: + \phi_i(m) = \frac{\sum_{n=b_i}^{b_i + N-1} w(n) w(m+n)}{\sqrt{E(b_i) E(m+b_i)}}, + + where + :math:`\phi_i(m)` is the NCCF at frame :math:`i` with lag :math:`m`, + :math:`w` is the waveform, + :math:`N` is the length of a frame, + :math:`b_i` is the beginning of frame :math:`i`, + :math:`E(j)` is the energy :math:`\sum_{n=j}^{j+N-1} w^2(n)`. + """ + + EPSILON = 10 ** (-9) + + # Number of lags to check + lags = int(math.ceil(sample_rate / freq_low)) + + frame_size = int(math.ceil(sample_rate * frame_time)) + + waveform_length = waveform.size()[-1] + num_of_frames = int(math.ceil(waveform_length / frame_size)) + + p = lags + num_of_frames * frame_size - waveform_length + waveform = torch.nn.functional.pad(waveform, (0, p)) + + # Compute lags + output_lag = [] + for lag in range(1, lags + 1): + s1 = waveform[..., :-lag].unfold(-1, frame_size, frame_size)[..., :num_of_frames, :] + s2 = waveform[..., lag:].unfold(-1, frame_size, frame_size)[..., :num_of_frames, :] + + output_frames = ( + (s1 * s2).sum(-1) + / (EPSILON + torch.norm(s1, p=2, dim=-1)).pow(2) + / (EPSILON + torch.norm(s2, p=2, dim=-1)).pow(2) + ) + + output_lag.append(output_frames.unsqueeze(-1)) + + nccf = torch.cat(output_lag, -1) + + return nccf + + +def _combine_max(a: Tuple[Tensor, Tensor], b: Tuple[Tensor, Tensor], thresh: float = 0.99) -> Tuple[Tensor, Tensor]: + """ + Take value from first if bigger than a multiplicative factor of the second, elementwise. + """ + mask = a[0] > thresh * b[0] + values = mask * a[0] + ~mask * b[0] + indices = mask * a[1] + ~mask * b[1] + return values, indices + + +def _find_max_per_frame(nccf: Tensor, sample_rate: int, freq_high: int) -> Tensor: + r""" + For each frame, take the highest value of NCCF, + apply centered median smoothing, and convert to frequency. + + Note: If the max among all the lags is very close + to the first half of lags, then the latter is taken. 
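+     (In effect, a candidate at a shorter lag wins whenever it comes within about one percent of the
+     global maximum, via ``_combine_max`` with its default ``thresh=0.99``; this biases the tracker
+     toward the shorter period and, presumably, away from picking a multiple of the true period.)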
+ """ + + lag_min = int(math.ceil(sample_rate / freq_high)) + + # Find near enough max that is smallest + + best = torch.max(nccf[..., lag_min:], -1) + + half_size = nccf.shape[-1] // 2 + half = torch.max(nccf[..., lag_min:half_size], -1) + + best = _combine_max(half, best) + indices = best[1] + + # Add back minimal lag + indices += lag_min + # Add 1 empirical calibration offset + indices += 1 + + return indices + + +def _median_smoothing(indices: Tensor, win_length: int) -> Tensor: + r""" + Apply median smoothing to the 1D tensor over the given window. + """ + + # Centered windowed + pad_length = (win_length - 1) // 2 + + # "replicate" padding in any dimension + indices = torch.nn.functional.pad(indices, (pad_length, 0), mode="constant", value=0.0) + + indices[..., :pad_length] = torch.cat(pad_length * [indices[..., pad_length].unsqueeze(-1)], dim=-1) + roll = indices.unfold(-1, win_length, 1) + + values, _ = torch.median(roll, -1) + return values + + +def detect_pitch_frequency( + waveform: Tensor, + sample_rate: int, + frame_time: float = 10 ** (-2), + win_length: int = 30, + freq_low: int = 85, + freq_high: int = 3400, +) -> Tensor: + r"""Detect pitch frequency. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + It is implemented using normalized cross-correlation function and median smoothing. + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., freq, time)` + sample_rate (int): The sample rate of the waveform (Hz) + frame_time (float, optional): Duration of a frame (Default: ``10 ** (-2)``). + win_length (int, optional): The window length for median smoothing (in number of frames) (Default: ``30``). + freq_low (int, optional): Lowest frequency that can be detected (Hz) (Default: ``85``). + freq_high (int, optional): Highest frequency that can be detected (Hz) (Default: ``3400``). + + Returns: + Tensor: Tensor of freq of dimension `(..., frame)` + """ + # pack batch + shape = list(waveform.size()) + waveform = waveform.reshape([-1] + shape[-1:]) + + nccf = _compute_nccf(waveform, sample_rate, frame_time, freq_low) + indices = _find_max_per_frame(nccf, sample_rate, freq_high) + indices = _median_smoothing(indices, win_length) + + # Convert indices to frequency + EPSILON = 10 ** (-9) + freq = sample_rate / (EPSILON + indices.to(torch.float)) + + # unpack batch + freq = freq.reshape(shape[:-1] + list(freq.shape[-1:])) + + return freq + + +def sliding_window_cmn( + specgram: Tensor, + cmn_window: int = 600, + min_cmn_window: int = 100, + center: bool = False, + norm_vars: bool = False, +) -> Tensor: + r""" + Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)` + cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) + min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start). + Only applicable if center == false, ignored if center==true (int, default = 100) + center (bool, optional): If true, use a window centered on the current frame + (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) + norm_vars (bool, optional): If true, normalize variance to one. 
(bool, default = false) + + Returns: + Tensor: Tensor matching input shape `(..., freq, time)` + """ + input_shape = specgram.shape + num_frames, num_feats = input_shape[-2:] + specgram = specgram.view(-1, num_frames, num_feats) + num_channels = specgram.shape[0] + + dtype = specgram.dtype + device = specgram.device + last_window_start = last_window_end = -1 + cur_sum = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + cur_sumsq = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + cmn_specgram = torch.zeros(num_channels, num_frames, num_feats, dtype=dtype, device=device) + for t in range(num_frames): + window_start = 0 + window_end = 0 + if center: + window_start = t - cmn_window // 2 + window_end = window_start + cmn_window + else: + window_start = t - cmn_window + window_end = t + 1 + if window_start < 0: + window_end -= window_start + window_start = 0 + if not center: + if window_end > t: + window_end = max(t + 1, min_cmn_window) + if window_end > num_frames: + window_start -= window_end - num_frames + window_end = num_frames + if window_start < 0: + window_start = 0 + if last_window_start == -1: + input_part = specgram[:, window_start : window_end - window_start, :] + cur_sum += torch.sum(input_part, 1) + if norm_vars: + cur_sumsq += torch.cumsum(input_part**2, 1)[:, -1, :] + else: + if window_start > last_window_start: + frame_to_remove = specgram[:, last_window_start, :] + cur_sum -= frame_to_remove + if norm_vars: + cur_sumsq -= frame_to_remove**2 + if window_end > last_window_end: + frame_to_add = specgram[:, last_window_end, :] + cur_sum += frame_to_add + if norm_vars: + cur_sumsq += frame_to_add**2 + window_frames = window_end - window_start + last_window_start = window_start + last_window_end = window_end + cmn_specgram[:, t, :] = specgram[:, t, :] - cur_sum / window_frames + if norm_vars: + if window_frames == 1: + cmn_specgram[:, t, :] = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + else: + variance = cur_sumsq + variance = variance / window_frames + variance -= (cur_sum**2) / (window_frames**2) + variance = torch.pow(variance, -0.5) + cmn_specgram[:, t, :] *= variance + + cmn_specgram = cmn_specgram.view(input_shape[:-2] + (num_frames, num_feats)) + if len(input_shape) == 2: + cmn_specgram = cmn_specgram.squeeze(0) + return cmn_specgram + + +def spectral_centroid( + waveform: Tensor, + sample_rate: int, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, +) -> Tensor: + r"""Compute the spectral centroid for each channel along the time axis. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The spectral centroid is defined as the weighted average of the + frequency values, weighted by their magnitude. 
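+
+     In other words, for a magnitude spectrogram ``S`` with bin frequencies ``f``, the centroid of
+     frame ``t`` is ``sum_k f[k] * S[k, t] / sum_k S[k, t]``. A hand-rolled equivalent (a minimal
+     sketch; the shapes and the 16 kHz sample rate are assumed):
+
+     >>> specgram = torch.rand(1, 201, 100)  # (channel, freq, time) magnitudes, e.g. n_fft=400
+     >>> freqs = torch.linspace(0, 16000 // 2, steps=201).reshape(-1, 1)
+     >>> centroid = (freqs * specgram).sum(dim=-2) / specgram.sum(dim=-2)  # (channel, time)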
+ + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + sample_rate (int): Sample rate of the audio waveform + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + + Returns: + Tensor: Dimension `(..., time)` + """ + specgram = spectrogram( + waveform, + pad=pad, + window=window, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + power=1.0, + normalized=False, + ) + freqs = torch.linspace(0, sample_rate // 2, steps=1 + n_fft // 2, device=specgram.device).reshape((-1, 1)) + freq_dim = -2 + return (freqs * specgram).sum(dim=freq_dim) / specgram.sum(dim=freq_dim) + + +@_mod_utils.requires_sox() +def apply_codec( + waveform: Tensor, + sample_rate: int, + format: str, + channels_first: bool = True, + compression: Optional[float] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, +) -> Tensor: + r""" + Apply codecs as a form of augmentation. + + .. devices:: CPU + + Args: + waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```. + sample_rate (int): Sample rate of the audio waveform. + format (str): File format. + channels_first (bool, optional): + When True, both the input and output Tensor have dimension `(channel, time)`. + Otherwise, they have dimension `(time, channel)`. + compression (float or None, optional): Used for formats other than WAV. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + encoding (str or None, optional): Changes the encoding for the supported formats. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + + Returns: + Tensor: Resulting Tensor. + If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`. + """ + bytes = io.BytesIO() + torchaudio.backend.sox_io_backend.save( + bytes, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample + ) + bytes.seek(0) + augmented, sr = torchaudio.backend.sox_io_backend.load(bytes, channels_first=channels_first, format=format) + if sr != sample_rate: + augmented = resample(augmented, sr, sample_rate) + return augmented + + +@_mod_utils.requires_kaldi() +def compute_kaldi_pitch( + waveform: torch.Tensor, + sample_rate: float, + frame_length: float = 25.0, + frame_shift: float = 10.0, + min_f0: float = 50, + max_f0: float = 400, + soft_min_f0: float = 10.0, + penalty_factor: float = 0.1, + lowpass_cutoff: float = 1000, + resample_frequency: float = 4000, + delta_pitch: float = 0.005, + nccf_ballast: float = 7000, + lowpass_filter_width: int = 1, + upsample_filter_width: int = 5, + max_frames_latency: int = 0, + frames_per_chunk: int = 0, + simulate_first_pass_online: bool = False, + recompute_frame: int = 500, + snip_edges: bool = True, +) -> torch.Tensor: + """Extract pitch based on method described in *A pitch extraction algorithm tuned + for automatic speech recognition* [:footcite:`6854049`]. + + .. devices:: CPU + + .. properties:: TorchScript + + This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi. + + Args: + waveform (Tensor): + The input waveform of shape `(..., time)`. + sample_rate (float): + Sample rate of `waveform`. 
+ frame_length (float, optional): + Frame length in milliseconds. (default: 25.0) + frame_shift (float, optional): + Frame shift in milliseconds. (default: 10.0) + min_f0 (float, optional): + Minimum F0 to search for (Hz) (default: 50.0) + max_f0 (float, optional): + Maximum F0 to search for (Hz) (default: 400.0) + soft_min_f0 (float, optional): + Minimum f0, applied in soft way, must not exceed min-f0 (default: 10.0) + penalty_factor (float, optional): + Cost factor for FO change. (default: 0.1) + lowpass_cutoff (float, optional): + Cutoff frequency for LowPass filter (Hz) (default: 1000) + resample_frequency (float, optional): + Frequency that we down-sample the signal to. Must be more than twice lowpass-cutoff. + (default: 4000) + delta_pitch( float, optional): + Smallest relative change in pitch that our algorithm measures. (default: 0.005) + nccf_ballast (float, optional): + Increasing this factor reduces NCCF for quiet frames (default: 7000) + lowpass_filter_width (int, optional): + Integer that determines filter width of lowpass filter, more gives sharper filter. + (default: 1) + upsample_filter_width (int, optional): + Integer that determines filter width when upsampling NCCF. (default: 5) + max_frames_latency (int, optional): + Maximum number of frames of latency that we allow pitch tracking to introduce into + the feature processing (affects output only if ``frames_per_chunk > 0`` and + ``simulate_first_pass_online=True``) (default: 0) + frames_per_chunk (int, optional): + The number of frames used for energy normalization. (default: 0) + simulate_first_pass_online (bool, optional): + If true, the function will output features that correspond to what an online decoder + would see in the first pass of decoding -- not the final version of the features, + which is the default. (default: False) + Relevant if ``frames_per_chunk > 0``. + recompute_frame (int, optional): + Only relevant for compatibility with online pitch extraction. + A non-critical parameter; the frame at which we recompute some of the forward pointers, + after revising our estimate of the signal energy. + Relevant if ``frames_per_chunk > 0``. (default: 500) + snip_edges (bool, optional): + If this is set to false, the incomplete frames near the ending edge won't be snipped, + so that the number of frames is the file size divided by the frame-shift. + This makes different types of features give the same number of frames. (default: True) + + Returns: + Tensor: Pitch feature. Shape: `(batch, frames 2)` where the last dimension + corresponds to pitch and NCCF. 
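+
+     Example (a minimal sketch, assuming torchaudio was built with the Kaldi extension and a
+     16 kHz mono waveform):
+         >>> waveform = torch.randn(1, 16000)  # (..., time), one second of audio
+         >>> feats = compute_kaldi_pitch(waveform, sample_rate=16000)
+         >>> # feats has shape (1, num_frames, 2); the last dimension holds the pitch and NCCF
+         >>> # values described above.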
+ """ + shape = waveform.shape + waveform = waveform.reshape(-1, shape[-1]) + result = torch.ops.torchaudio.kaldi_ComputeKaldiPitch( + waveform, + sample_rate, + frame_length, + frame_shift, + min_f0, + max_f0, + soft_min_f0, + penalty_factor, + lowpass_cutoff, + resample_frequency, + delta_pitch, + nccf_ballast, + lowpass_filter_width, + upsample_filter_width, + max_frames_latency, + frames_per_chunk, + simulate_first_pass_online, + recompute_frame, + snip_edges, + ) + result = result.reshape(shape[:-1] + result.shape[-2:]) + return result + + +def _get_sinc_resample_kernel( + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, + device: torch.device = torch.device("cpu"), + dtype: Optional[torch.dtype] = None, +): + + if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): + raise Exception( + "Frequencies must be of integer type to ensure quality resampling computation. " + "To work around this, manually convert both frequencies to integer values " + "that maintain their resampling rate ratio before passing them into the function. " + "Example: To downsample a 44100 hz waveform by a factor of 8, use " + "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " + "For more information, please refer to https://github.com/pytorch/audio/issues/1487." + ) + + if resampling_method not in ["sinc_interpolation", "kaiser_window"]: + raise ValueError("Invalid resampling method: {}".format(resampling_method)) + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + assert lowpass_filter_width > 0 + kernels = [] + base_freq = min(orig_freq, new_freq) + # This will perform antialiasing filtering by removing the highest frequencies. + # At first I thought I only needed this when downsampling, but when upsampling + # you will get edge artifacts without this, as the edge is equivalent to zero padding, + # which will add high freq artifacts. + base_freq *= rolloff + + # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) + # using the sinc interpolation formula: + # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) + # We can then sample the function x(t) with a different sample rate: + # y[j] = x(j / new_freq) + # or, + # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + + # We see here that y[j] is the convolution of x[i] with a specific filter, for which + # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. + # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. + # Indeed: + # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) + # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) + # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. + # This will explain the F.conv1d after, with a stride of orig_freq. + width = math.ceil(lowpass_filter_width * orig_freq / base_freq) + # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., + # they will have a lot of almost zero values to the left or to the right... + # There is probably a way to evaluate those filters more efficiently, but this is kept for + # future work. 
+ idx_dtype = dtype if dtype is not None else torch.float64 + idx = torch.arange(-width, width + orig_freq, device=device, dtype=idx_dtype) + + for i in range(new_freq): + t = (-i / new_freq + idx / orig_freq) * base_freq + t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) + + # we do not use built in torch windows here as we need to evaluate the window + # at specific positions, not over a regular grid. + if resampling_method == "sinc_interpolation": + window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 + else: + # kaiser_window + if beta is None: + beta = 14.769656459379492 + beta_tensor = torch.tensor(float(beta)) + window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) + t *= math.pi + kernel = torch.where(t == 0, torch.tensor(1.0).to(t), torch.sin(t) / t) + kernel.mul_(window) + kernels.append(kernel) + + scale = base_freq / orig_freq + kernels = torch.stack(kernels).view(new_freq, 1, -1).mul_(scale) + if dtype is None: + kernels = kernels.to(dtype=torch.float32) + return kernels, width + + +def _apply_sinc_resample_kernel( + waveform: Tensor, + orig_freq: int, + new_freq: int, + gcd: int, + kernel: Tensor, + width: int, +): + if not waveform.is_floating_point(): + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + num_wavs, length = waveform.shape + waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) + resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) + resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) + target_length = int(math.ceil(new_freq * length / orig_freq)) + resampled = resampled[..., :target_length] + + # unpack batch + resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) + return resampled + + +def resample( + waveform: Tensor, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, +) -> Tensor: + r"""Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in + more efficient computation if resampling multiple waveforms with the same resampling parameters. + + Args: + waveform (Tensor): The input signal of dimension `(..., time)` + orig_freq (int): The original frequency of the signal + new_freq (int): The desired frequency + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + resampling_method (str, optional): The resampling method to use. + Options: [``sinc_interpolation``, ``kaiser_window``] (Default: ``'sinc_interpolation'``) + beta (float or None, optional): The shape parameter used for kaiser window. 
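+             When ``None`` and ``resampling_method="kaiser_window"``, a value of ``14.769656459379492``
+             is used, as in the kernel construction above. (Default: ``None``)
+
+     Example (a minimal sketch; the input waveform is assumed):
+         >>> waveform = torch.randn(2, 48000)  # e.g. one second of stereo audio at 48 kHz
+         >>> resampled = resample(waveform, orig_freq=48000, new_freq=32000)
+         >>> resampled.shape
+         torch.Size([2, 32000])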
+ + Returns: + Tensor: The waveform at the new frequency of dimension `(..., time).` + """ + + assert orig_freq > 0.0 and new_freq > 0.0 + + if orig_freq == new_freq: + return waveform + + gcd = math.gcd(int(orig_freq), int(new_freq)) + + kernel, width = _get_sinc_resample_kernel( + orig_freq, + new_freq, + gcd, + lowpass_filter_width, + rolloff, + resampling_method, + beta, + waveform.device, + waveform.dtype, + ) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) + return resampled + + +@torch.jit.unused +def edit_distance(seq1: Sequence, seq2: Sequence) -> int: + """ + Calculate the word level edit (Levenshtein) distance between two sequences. + + .. devices:: CPU + + The function computes an edit distance allowing deletion, insertion and + substitution. The result is an integer. + + For most applications, the two input sequences should be the same type. If + two strings are given, the output is the edit distance between the two + strings (character edit distance). If two lists of strings are given, the + output is the edit distance between sentences (word edit distance). Users + may want to normalize the output by the length of the reference sequence. + + Args: + seq1 (Sequence): the first sequence to compare. + seq2 (Sequence): the second sequence to compare. + Returns: + int: The distance between the first and second sequences. + """ + len_sent2 = len(seq2) + dold = list(range(len_sent2 + 1)) + dnew = [0 for _ in range(len_sent2 + 1)] + + for i in range(1, len(seq1) + 1): + dnew[0] = i + for j in range(1, len_sent2 + 1): + if seq1[i - 1] == seq2[j - 1]: + dnew[j] = dold[j - 1] + else: + substitution = dold[j - 1] + 1 + insertion = dnew[j - 1] + 1 + deletion = dold[j] + 1 + dnew[j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + return int(dold[-1]) + + +def pitch_shift( + waveform: Tensor, + sample_rate: int, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window: Optional[Tensor] = None, +) -> Tensor: + """ + Shift the pitch of a waveform by ``n_steps`` steps. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + waveform (Tensor): The input waveform of shape `(..., time)`. + sample_rate (int): Sample rate of `waveform`. + n_steps (int): The (fractional) steps to shift `waveform`. + bins_per_octave (int, optional): The number of steps per octave (Default: ``12``). + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``). + win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``). + hop_length (int or None, optional): Length of hop between STFT windows. If None, then + ``win_length // 4`` is used (Default: ``None``). + window (Tensor or None, optional): Window tensor that is applied/multiplied to each frame/window. + If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``). + + + Returns: + Tensor: The pitch-shifted audio waveform of shape `(..., time)`. 
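+
+     Example (a minimal sketch; the waveform and sample rate are assumed):
+         >>> waveform = torch.randn(1, 16000)  # (..., time)
+         >>> shifted = pitch_shift(waveform, sample_rate=16000, n_steps=4)  # up four semitones
+         >>> shifted.shape
+         torch.Size([1, 16000])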
+ """ + waveform_stretch = _stretch_waveform( + waveform, + n_steps, + bins_per_octave, + n_fft, + win_length, + hop_length, + window, + ) + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + waveform_shift = resample(waveform_stretch, int(sample_rate / rate), sample_rate) + + return _fix_waveform_shape(waveform_shift, waveform.size()) + + +def _stretch_waveform( + waveform: Tensor, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window: Optional[Tensor] = None, +) -> Tensor: + """ + Pitch shift helper function to preprocess and stretch waveform before resampling step. + + Args: + See pitch_shift arg descriptions. + + Returns: + Tensor: The preprocessed waveform stretched prior to resampling. + """ + if hop_length is None: + hop_length = n_fft // 4 + if win_length is None: + win_length = n_fft + if window is None: + window = torch.hann_window(window_length=win_length, device=waveform.device) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + ori_len = shape[-1] + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + phase_advance = torch.linspace(0, math.pi * hop_length, spec_f.shape[-2], device=spec_f.device)[..., None] + spec_stretch = phase_vocoder(spec_f, rate, phase_advance) + len_stretch = int(round(ori_len / rate)) + waveform_stretch = torch.istft( + spec_stretch, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=len_stretch + ) + return waveform_stretch + + +def _fix_waveform_shape( + waveform_shift: Tensor, + shape: List[int], +) -> Tensor: + """ + PitchShift helper function to process after resampling step to fix the shape back. + + Args: + waveform_shift(Tensor): The waveform after stretch and resample + shape (List[int]): The shape of initial waveform + + Returns: + Tensor: The pitch-shifted audio waveform of shape `(..., time)`. + """ + ori_len = shape[-1] + shift_len = waveform_shift.size()[-1] + if shift_len > ori_len: + waveform_shift = waveform_shift[..., :ori_len] + else: + waveform_shift = torch.nn.functional.pad(waveform_shift, [0, ori_len - shift_len]) + + # unpack batch + waveform_shift = waveform_shift.view(shape[:-1] + waveform_shift.shape[-1:]) + return waveform_shift + + +def rnnt_loss( + logits: Tensor, + targets: Tensor, + logit_lengths: Tensor, + target_lengths: Tensor, + blank: int = -1, + clamp: float = -1, + reduction: str = "mean", +): + """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* + [:footcite:`graves2012sequence`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output + dependencies. 
+ + Args: + logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)` + containing output from joiner + targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padded + logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder + target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence + blank (int, optional): blank label (Default: ``-1``) + clamp (float, optional): clamp for gradients (Default: ``-1``) + reduction (string, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. (Default: ``'mean'``) + Returns: + Tensor: Loss with the reduction option applied. If ``reduction`` is ``'none'``, then size `(batch)`, + otherwise scalar. + """ + if reduction not in ["none", "mean", "sum"]: + raise ValueError("reduction should be one of 'none', 'mean', or 'sum'") + + if blank < 0: # reinterpret blank index if blank < 0. + blank = logits.shape[-1] + blank + + costs, _ = torch.ops.torchaudio.rnnt_loss( + logits=logits, + targets=targets, + logit_lengths=logit_lengths, + target_lengths=target_lengths, + blank=blank, + clamp=clamp, + ) + + if reduction == "mean": + return costs.mean() + elif reduction == "sum": + return costs.sum() + + return costs + + +def psd( + specgram: Tensor, + mask: Optional[Tensor] = None, + normalize: bool = True, + eps: float = 1e-10, +) -> Tensor: + """Compute cross-channel power spectral density (PSD) matrix. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + mask (torch.Tensor or None, optional): Time-Frequency mask for normalization. + Tensor with dimensions `(..., freq, time)`. (Default: ``None``) + normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``) + eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``) + + Returns: + torch.Tensor: The complex-valued PSD matrix of the input spectrum. + Tensor with dimensions `(..., freq, channel, channel)` + """ + specgram = specgram.transpose(-3, -2) # shape (freq, channel, time) + # outer product: + # (..., ch_1, time) x (..., ch_2, time) -> (..., time, ch_1, ch_2) + psd = torch.einsum("...ct,...et->...tce", [specgram, specgram.conj()]) + + if mask is not None: + assert ( + mask.shape[:-1] == specgram.shape[:-2] and mask.shape[-1] == specgram.shape[-1] + ), "The dimensions of mask except the channel dimension should be the same as specgram." + f"Found {mask.shape} for mask and {specgram.shape} for specgram." + # Normalized mask along time dimension: + if normalize: + mask = mask / (mask.sum(dim=-1, keepdim=True) + eps) + + psd = psd * mask[..., None, None] + + psd = psd.sum(dim=-3) + return psd + + +def _compute_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch.Tensor: + r"""Compute the trace of a Tensor along ``dim1`` and ``dim2`` dimensions. + + Args: + input (torch.Tensor): Tensor with dimensions `(..., channel, channel)`. + dim1 (int, optional): The first dimension of the diagonal matrix. + (Default: ``-1``) + dim2 (int, optional): The second dimension of the diagonal matrix. + (Default: ``-2``) + + Returns: + Tensor: The trace of the input Tensor. + """ + assert input.ndim >= 2, "The dimension of the tensor must be at least 2." 
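+     # The check above and the one below ensure a square `(..., channel, channel)` input; the trace
+     # is then computed by extracting the main diagonal along (dim1, dim2) and summing it, yielding
+     # one value per leading index (e.g. one complex trace per frequency bin of a PSD matrix).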
+ assert input.shape[dim1] == input.shape[dim2], "The size of ``dim1`` and ``dim2`` must be the same." + input = torch.diagonal(input, 0, dim1=dim1, dim2=dim2) + return input.sum(dim=-1) + + +def _tik_reg(mat: torch.Tensor, reg: float = 1e-7, eps: float = 1e-8) -> torch.Tensor: + """Perform Tikhonov regularization (only modifying real part). + + Args: + mat (torch.Tensor): Input matrix with dimensions `(..., channel, channel)`. + reg (float, optional): Regularization factor. (Default: 1e-8) + eps (float, optional): Value to avoid the correlation matrix is all-zero. (Default: ``1e-8``) + + Returns: + Tensor: Regularized matrix with dimensions `(..., channel, channel)`. + """ + # Add eps + C = mat.size(-1) + eye = torch.eye(C, dtype=mat.dtype, device=mat.device) + epsilon = _compute_mat_trace(mat).real[..., None, None] * reg + # in case that correlation_matrix is all-zero + epsilon = epsilon + eps + mat = mat + epsilon * eye[..., :, :] + return mat + + +def _assert_psd_matrices(psd_s: torch.Tensor, psd_n: torch.Tensor) -> None: + """Assertion checks of the PSD matrices of target speech and noise. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + """ + assert ( + psd_s.ndim >= 3 and psd_n.ndim >= 3 + ), "Expected at least 3D Tensor (..., freq, channel, channel) for psd_s and psd_n." + "Found {psd_s.shape} for psd_s and {psd_n.shape} for psd_n." + assert ( + psd_s.is_complex() and psd_n.is_complex() + ), "The type of psd_s and psd_n must be ``torch.cfloat`` or ``torch.cdouble``." + f"Found {psd_s.dtype} for psd_s and {psd_n.dtype} for psd_n." + assert ( + psd_s.shape == psd_n.shape + ), f"The dimensions of psd_s and psd_n should be the same. Found {psd_s.shape} and {psd_n.shape}." + assert ( + psd_s.shape[-1] == psd_s.shape[-2] + ), f"The last two dimensions of psd_s should be the same. Found {psd_s.shape}." + + +def mvdr_weights_souden( + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> Tensor: + r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights + by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the power spectral density (PSD) matrix of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, + the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and a one-hot vector that represents the + reference channel :math:`\bf{u}`, the method computes the MVDR beamforming weight martrix + :math:`\textbf{w}_{\text{MVDR}}`. The formula is defined as: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)} + {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u} + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. 
+ If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The complex-valued MVDR beamforming weight matrix with dimensions `(..., freq, channel)`. + """ + _assert_psd_matrices(psd_s, psd_n) + + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + numerator = torch.linalg.solve(psd_n, psd_s) # psd_n.inv() @ psd_s + # ws: (..., C, C) / (...,) -> (..., C, C) + ws = numerator / (_compute_mat_trace(numerator)[..., None, None] + eps) + if torch.jit.isinstance(reference_channel, int): + beamform_weights = ws[..., :, reference_channel] + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + # h: (..., F, C_1, C_2) x (..., C_2) -> (..., F, C_1) + beamform_weights = torch.einsum("...c,...c->...", [ws, reference_channel[..., None, None, :]]) + else: + raise TypeError(f"Expected 'int' or 'Tensor' for reference_channel. Found: {type(reference_channel)}.") + + return beamform_weights + + +def mvdr_weights_rtf( + rtf: Tensor, + psd_n: Tensor, + reference_channel: Optional[Union[int, Tensor]] = None, + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> Tensor: + r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights + based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the relative transfer function (RTF) matrix or the steering vector of target speech :math:`\bm{v}`, + the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and a one-hot vector that represents the + reference channel :math:`\bf{u}`, the method computes the MVDR beamforming weight martrix + :math:`\textbf{w}_{\text{MVDR}}`. The formula is defined as: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}} + {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)} + + where :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation. + + Args: + rtf (torch.Tensor): The complex-valued RTF vector of target speech. + Tensor with dimensions `(..., freq, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. 
(Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The complex-valued MVDR beamforming weight matrix with dimensions `(..., freq, channel)`. + """ + assert rtf.ndim >= 2, f"Expected at least 2D Tensor (..., freq, channel) for rtf. Found {rtf.shape}." + assert psd_n.ndim >= 3, f"Expected at least 3D Tensor (..., freq, channel, channel) for psd_n. Found {psd_n.shape}." + assert ( + rtf.is_complex() and psd_n.is_complex() + ), "The type of rtf and psd_n must be ``torch.cfloat`` or ``torch.cdouble``." + f"Found {rtf.dtype} for rtf and {psd_n.dtype} for psd_n." + assert ( + rtf.shape == psd_n.shape[:-1] + ), "The dimensions of rtf and the dimensions withou the last dimension of psd_n should be the same." + f"Found {rtf.shape} for rtf and {psd_n.shape} for psd_n." + assert ( + psd_n.shape[-1] == psd_n.shape[-2] + ), f"The last two dimensions of psd_n should be the same. Found {psd_n.shape}." + + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + # numerator = psd_n.inv() @ stv + numerator = torch.linalg.solve(psd_n, rtf.unsqueeze(-1)).squeeze(-1) # (..., freq, channel) + # denominator = stv^H @ psd_n.inv() @ stv + denominator = torch.einsum("...d,...d->...", [rtf.conj(), numerator]) + beamform_weights = numerator / (denominator.real.unsqueeze(-1) + eps) + # normalize the numerator + if reference_channel is not None: + if torch.jit.isinstance(reference_channel, int): + scale = rtf[..., reference_channel].conj() + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + scale = torch.einsum("...c,...c->...", [rtf.conj(), reference_channel[..., None, :]]) + else: + raise TypeError(f"Expected 'int' or 'Tensor' for reference_channel. Found: {type(reference_channel)}.") + + beamform_weights = beamform_weights * scale[..., None] + + return beamform_weights + + +def rtf_evd(psd_s: Tensor) -> Tensor: + r"""Estimate the relative transfer function (RTF) or the steering vector by eigenvalue decomposition. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + psd_s (Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor of dimension `(..., freq, channel, channel)` + + Returns: + Tensor: The estimated complex-valued RTF of target speech. + Tensor of dimension `(..., freq, channel)` + """ + assert psd_s.is_complex(), f"The type of psd_s must be ``torch.cfloat`` or ``torch.cdouble``. Found {psd_s.dtype}." + assert ( + psd_s.shape[-1] == psd_s.shape[-2] + ), f"The last two dimensions of psd_s should be the same. Found {psd_s.shape}." + _, v = torch.linalg.eigh(psd_s) # v is sorted along with eigenvalues in ascending order + rtf = v[..., -1] # choose the eigenvector with max eigenvalue + return rtf + + +def rtf_power( + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + n_iter: int = 3, + diagonal_loading: bool = True, + diag_eps: float = 1e-7, +) -> Tensor: + r"""Estimate the relative transfer function (RTF) or the steering vector by the power method. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. 
+ reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + + Returns: + torch.Tensor: The estimated complex-valued RTF of target speech. + Tensor of dimension `(..., freq, channel)`. + """ + _assert_psd_matrices(psd_s, psd_n) + assert n_iter > 0, "The number of iteration must be greater than 0." + + # Apply diagonal loading to psd_n to improve robustness. + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + # phi is regarded as the first iteration + phi = torch.linalg.solve(psd_n, psd_s) # psd_n.inv() @ psd_s + if torch.jit.isinstance(reference_channel, int): + rtf = phi[..., reference_channel] + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + rtf = torch.einsum("...c,...c->...", [phi, reference_channel[..., None, None, :]]) + else: + raise TypeError(f"Expected 'int' or 'Tensor' for reference_channel. Found: {type(reference_channel)}.") + rtf = rtf.unsqueeze(-1) # (..., freq, channel, 1) + if n_iter >= 2: + # The number of iterations in the for loop is `n_iter - 2` + # because the `phi` above and `torch.matmul(psd_s, rtf)` are regarded as + # two iterations. + for _ in range(n_iter - 2): + rtf = torch.matmul(phi, rtf) + rtf = torch.matmul(psd_s, rtf) + else: + # if there is only one iteration, the rtf is the psd_s[..., referenc_channel] + # which is psd_n @ phi @ ref_channel + rtf = torch.matmul(psd_n, rtf) + return rtf.squeeze(-1) + + +def apply_beamforming(beamform_weights: Tensor, specgram: Tensor) -> Tensor: + r"""Apply the beamforming weight to the multi-channel noisy spectrum to obtain the single-channel enhanced spectrum. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + where :math:`\textbf{w}_{\text{bf}}(f)` is the beamforming weight for the :math:`f`-th frequency bin, + :math:`\textbf{Y}` is the multi-channel spectrum for the :math:`f`-th frequency bin. + + Args: + beamform_weights (Tensor): The complex-valued beamforming weight matrix. + Tensor of dimension `(..., freq, channel)` + specgram (Tensor): The multi-channel complex-valued noisy spectrum. + Tensor of dimension `(..., channel, freq, time)` + + Returns: + Tensor: The single-channel complex-valued enhanced spectrum. + Tensor of dimension `(..., freq, time)` + """ + assert ( + beamform_weights.shape[:-2] == specgram.shape[:-3] + ), "The dimensions except the last two dimensions of beamform_weights should be the same " + "as the dimensions except the last three dimensions of specgram." + f"Found {beamform_weights.shape} for beamform_weights and {specgram.shape} for specgram." + assert ( + beamform_weights.is_complex() and specgram.is_complex() + ), "The type of beamform_weights and specgram must be ``torch.cfloat`` or ``torch.cdouble``." + f"Found {beamform_weights.dtype} for beamform_weights and {specgram.dtype} for specgram." 
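+     # The ``conj()`` below realizes the Hermitian transpose ``w^H`` from the formula above; the
+     # einsum contracts over the channel axis only, so every (freq, time) bin of the output is a
+     # conjugate-weighted sum of the input channels.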
+ + # (..., freq, channel) x (..., channel, freq, time) -> (..., freq, time) + specgram_enhanced = torch.einsum("...fc,...cft->...ft", [beamform_weights.conj(), specgram]) + return specgram_enhanced diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb0f84732cae237ed657624592f7c090700eeb98 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/__init__.py @@ -0,0 +1,53 @@ +from .conformer import Conformer +from .conv_tasnet import ConvTasNet +from .deepspeech import DeepSpeech +from .emformer import Emformer +from .rnnt import emformer_rnnt_base, emformer_rnnt_model, RNNT +from .rnnt_decoder import Hypothesis, RNNTBeamSearch +from .tacotron2 import Tacotron2 +from .wav2letter import Wav2Letter +from .wav2vec2 import ( + hubert_base, + hubert_large, + hubert_pretrain_base, + hubert_pretrain_large, + hubert_pretrain_model, + hubert_pretrain_xlarge, + hubert_xlarge, + HuBERTPretrainModel, + wav2vec2_base, + wav2vec2_large, + wav2vec2_large_lv60k, + wav2vec2_model, + Wav2Vec2Model, +) +from .wavernn import WaveRNN + + +__all__ = [ + "Wav2Letter", + "WaveRNN", + "ConvTasNet", + "DeepSpeech", + "Wav2Vec2Model", + "HuBERTPretrainModel", + "wav2vec2_model", + "wav2vec2_base", + "wav2vec2_large", + "wav2vec2_large_lv60k", + "hubert_base", + "hubert_large", + "hubert_xlarge", + "hubert_pretrain_model", + "hubert_pretrain_base", + "hubert_pretrain_large", + "hubert_pretrain_xlarge", + "Tacotron2", + "Conformer", + "Emformer", + "Hypothesis", + "RNNT", + "RNNTBeamSearch", + "emformer_rnnt_base", + "emformer_rnnt_model", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conformer.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..890c2945c753fc1defdb9f0f152d7e167a7d7182 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conformer.py @@ -0,0 +1,292 @@ +from typing import Optional, Tuple + +import torch + + +__all__ = ["Conformer"] + + +def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: + batch_size = lengths.shape[0] + max_length = int(torch.max(lengths).item()) + padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( + batch_size, max_length + ) >= lengths.unsqueeze(1) + return padding_mask + + +class _ConvolutionModule(torch.nn.Module): + r"""Conformer convolution module. + + Args: + input_dim (int): input dimension. + num_channels (int): number of depthwise convolution layer input channels. + depthwise_kernel_size (int): kernel size of depthwise convolution layer. + dropout (float, optional): dropout probability. (Default: 0.0) + bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``) + use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``) + """ + + def __init__( + self, + input_dim: int, + num_channels: int, + depthwise_kernel_size: int, + dropout: float = 0.0, + bias: bool = False, + use_group_norm: bool = False, + ) -> None: + super().__init__() + assert (depthwise_kernel_size - 1) % 2 == 0, "depthwise_kernel_size must be odd to achieve 'SAME' padding." 
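+         # The stack below follows the Conformer convolution module: a pointwise Conv1d doubles the
+         # channels so that GLU can gate them back down, a depthwise Conv1d (with the 'SAME' padding
+         # guaranteed by the assertion above) mixes information along time, then Group/Batch
+         # normalization, SiLU, a pointwise projection back to ``input_dim``, and dropout.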
+ self.layer_norm = torch.nn.LayerNorm(input_dim) + self.sequential = torch.nn.Sequential( + torch.nn.Conv1d( + input_dim, + 2 * num_channels, + 1, + stride=1, + padding=0, + bias=bias, + ), + torch.nn.GLU(dim=1), + torch.nn.Conv1d( + num_channels, + num_channels, + depthwise_kernel_size, + stride=1, + padding=(depthwise_kernel_size - 1) // 2, + groups=num_channels, + bias=bias, + ), + torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) + if use_group_norm + else torch.nn.BatchNorm1d(num_channels), + torch.nn.SiLU(), + torch.nn.Conv1d( + num_channels, + input_dim, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ), + torch.nn.Dropout(dropout), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): with shape `(B, T, D)`. + + Returns: + torch.Tensor: output, with shape `(B, T, D)`. + """ + x = self.layer_norm(input) + x = x.transpose(1, 2) + x = self.sequential(x) + return x.transpose(1, 2) + + +class _FeedForwardModule(torch.nn.Module): + r"""Positionwise feed forward layer. + + Args: + input_dim (int): input dimension. + hidden_dim (int): hidden dimension. + dropout (float, optional): dropout probability. (Default: 0.0) + """ + + def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: + super().__init__() + self.sequential = torch.nn.Sequential( + torch.nn.LayerNorm(input_dim), + torch.nn.Linear(input_dim, hidden_dim, bias=True), + torch.nn.SiLU(), + torch.nn.Dropout(dropout), + torch.nn.Linear(hidden_dim, input_dim, bias=True), + torch.nn.Dropout(dropout), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): with shape `(*, D)`. + + Returns: + torch.Tensor: output, with shape `(*, D)`. + """ + return self.sequential(input) + + +class ConformerLayer(torch.nn.Module): + r"""Conformer layer that constitutes Conformer. + + Args: + input_dim (int): input dimension. + ffn_dim (int): hidden layer dimension of feedforward network. + num_attention_heads (int): number of attention heads. + depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. + dropout (float, optional): dropout probability. (Default: 0.0) + use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` + in the convolution module. (Default: ``False``) + convolution_first (bool, optional): apply the convolution module ahead of + the attention module. 
(Default: ``False``) + """ + + def __init__( + self, + input_dim: int, + ffn_dim: int, + num_attention_heads: int, + depthwise_conv_kernel_size: int, + dropout: float = 0.0, + use_group_norm: bool = False, + convolution_first: bool = False, + ) -> None: + super().__init__() + + self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) + + self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) + self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) + self.self_attn_dropout = torch.nn.Dropout(dropout) + + self.conv_module = _ConvolutionModule( + input_dim=input_dim, + num_channels=input_dim, + depthwise_kernel_size=depthwise_conv_kernel_size, + dropout=dropout, + bias=True, + use_group_norm=use_group_norm, + ) + + self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) + self.final_layer_norm = torch.nn.LayerNorm(input_dim) + self.convolution_first = convolution_first + + def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: + residual = input + input = input.transpose(0, 1) + input = self.conv_module(input) + input = input.transpose(0, 1) + input = residual + input + return input + + def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): input, with shape `(T, B, D)`. + key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. + + Returns: + torch.Tensor: output, with shape `(T, B, D)`. + """ + residual = input + x = self.ffn1(input) + x = x * 0.5 + residual + + if self.convolution_first: + x = self._apply_convolution(x) + + residual = x + x = self.self_attn_layer_norm(x) + x, _ = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=key_padding_mask, + need_weights=False, + ) + x = self.self_attn_dropout(x) + x = x + residual + + if not self.convolution_first: + x = self._apply_convolution(x) + + residual = x + x = self.ffn2(x) + x = x * 0.5 + residual + + x = self.final_layer_norm(x) + return x + + +class Conformer(torch.nn.Module): + r"""Implements the Conformer architecture introduced in + *Conformer: Convolution-augmented Transformer for Speech Recognition* + [:footcite:`gulati2020conformer`]. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads in each Conformer layer. + ffn_dim (int): hidden layer dimension of feedforward networks. + num_layers (int): number of Conformer layers to instantiate. + depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. + dropout (float, optional): dropout probability. (Default: 0.0) + use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` + in the convolution module. (Default: ``False``) + convolution_first (bool, optional): apply the convolution module ahead of + the attention module. 
(Default: ``False``) + + Examples: + >>> conformer = Conformer( + >>> input_dim=80, + >>> num_heads=4, + >>> ffn_dim=128, + >>> num_layers=4, + >>> depthwise_conv_kernel_size=31, + >>> ) + >>> lengths = torch.randint(1, 400, (10,)) # (batch,) + >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) + >>> output = conformer(input, lengths) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + ffn_dim: int, + num_layers: int, + depthwise_conv_kernel_size: int, + dropout: float = 0.0, + use_group_norm: bool = False, + convolution_first: bool = False, + ): + super().__init__() + + self.conformer_layers = torch.nn.ModuleList( + [ + ConformerLayer( + input_dim, + ffn_dim, + num_heads, + depthwise_conv_kernel_size, + dropout=dropout, + use_group_norm=use_group_norm, + convolution_first=convolution_first, + ) + for _ in range(num_layers) + ] + ) + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + input (torch.Tensor): with shape `(B, T, input_dim)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + + Returns: + (torch.Tensor, torch.Tensor) + torch.Tensor + output frames, with shape `(B, T, input_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output frames. + """ + encoder_padding_mask = _lengths_to_padding_mask(lengths) + + x = input.transpose(0, 1) + for layer in self.conformer_layers: + x = layer(x, encoder_padding_mask) + return x.transpose(0, 1), lengths diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conv_tasnet.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conv_tasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b424661d26c71360af7cf829cde16a3ac2551ce7 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conv_tasnet.py @@ -0,0 +1,301 @@ +"""Implements Conv-TasNet with building blocks of it. + +Based on https://github.com/naplab/Conv-TasNet/tree/e66d82a8f956a69749ec8a4ae382217faa097c5c +""" + +from typing import Optional, Tuple + +import torch + + +class ConvBlock(torch.nn.Module): + """1D Convolutional block. + + Args: + io_channels (int): The number of input/output channels, + hidden_channels (int): The number of channels in the internal layers, . + kernel_size (int): The convolution kernel size of the middle layer,
. + padding (int): Padding value of the convolution in the middle layer. + dilation (int, optional): Dilation value of the convolution in the middle layer. + no_redisual (bool, optional): Disable residual block/output. + + Note: + This implementation corresponds to the "non-causal" setting in the paper. + """ + + def __init__( + self, + io_channels: int, + hidden_channels: int, + kernel_size: int, + padding: int, + dilation: int = 1, + no_residual: bool = False, + ): + super().__init__() + + self.conv_layers = torch.nn.Sequential( + torch.nn.Conv1d(in_channels=io_channels, out_channels=hidden_channels, kernel_size=1), + torch.nn.PReLU(), + torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08), + torch.nn.Conv1d( + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=kernel_size, + padding=padding, + dilation=dilation, + groups=hidden_channels, + ), + torch.nn.PReLU(), + torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08), + ) + + self.res_out = ( + None + if no_residual + else torch.nn.Conv1d(in_channels=hidden_channels, out_channels=io_channels, kernel_size=1) + ) + self.skip_out = torch.nn.Conv1d(in_channels=hidden_channels, out_channels=io_channels, kernel_size=1) + + def forward(self, input: torch.Tensor) -> Tuple[Optional[torch.Tensor], torch.Tensor]: + feature = self.conv_layers(input) + if self.res_out is None: + residual = None + else: + residual = self.res_out(feature) + skip_out = self.skip_out(feature) + return residual, skip_out + + +class MaskGenerator(torch.nn.Module): + """TCN (Temporal Convolution Network) Separation Module + + Generates masks for separation. + + Args: + input_dim (int): Input feature dimension, . + num_sources (int): The number of sources to separate. + kernel_size (int): The convolution kernel size of conv blocks,
. + num_featrs (int): Input/output feature dimenstion of conv blocks, . + num_hidden (int): Intermediate feature dimention of conv blocks, + num_layers (int): The number of conv blocks in one stack, . + num_stacks (int): The number of conv block stacks, . + msk_activate (str): The activation function of the mask output. + + Note: + This implementation corresponds to the "non-causal" setting in the paper. + """ + + def __init__( + self, + input_dim: int, + num_sources: int, + kernel_size: int, + num_feats: int, + num_hidden: int, + num_layers: int, + num_stacks: int, + msk_activate: str, + ): + super().__init__() + + self.input_dim = input_dim + self.num_sources = num_sources + + self.input_norm = torch.nn.GroupNorm(num_groups=1, num_channels=input_dim, eps=1e-8) + self.input_conv = torch.nn.Conv1d(in_channels=input_dim, out_channels=num_feats, kernel_size=1) + + self.receptive_field = 0 + self.conv_layers = torch.nn.ModuleList([]) + for s in range(num_stacks): + for l in range(num_layers): + multi = 2**l + self.conv_layers.append( + ConvBlock( + io_channels=num_feats, + hidden_channels=num_hidden, + kernel_size=kernel_size, + dilation=multi, + padding=multi, + # The last ConvBlock does not need residual + no_residual=(l == (num_layers - 1) and s == (num_stacks - 1)), + ) + ) + self.receptive_field += kernel_size if s == 0 and l == 0 else (kernel_size - 1) * multi + self.output_prelu = torch.nn.PReLU() + self.output_conv = torch.nn.Conv1d( + in_channels=num_feats, + out_channels=input_dim * num_sources, + kernel_size=1, + ) + if msk_activate == "sigmoid": + self.mask_activate = torch.nn.Sigmoid() + elif msk_activate == "relu": + self.mask_activate = torch.nn.ReLU() + else: + raise ValueError(f"Unsupported activation {msk_activate}") + + def forward(self, input: torch.Tensor) -> torch.Tensor: + """Generate separation mask. + + Args: + input (torch.Tensor): 3D Tensor with shape [batch, features, frames] + + Returns: + Tensor: shape [batch, num_sources, features, frames] + """ + batch_size = input.shape[0] + feats = self.input_norm(input) + feats = self.input_conv(feats) + output = 0.0 + for layer in self.conv_layers: + residual, skip = layer(feats) + if residual is not None: # the last conv layer does not produce residual + feats = feats + residual + output = output + skip + output = self.output_prelu(output) + output = self.output_conv(output) + output = self.mask_activate(output) + return output.view(batch_size, self.num_sources, self.input_dim, -1) + + +class ConvTasNet(torch.nn.Module): + """Conv-TasNet: a fully-convolutional time-domain audio separation network + *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation* + [:footcite:`Luo_2019`]. + + Args: + num_sources (int, optional): The number of sources to split. + enc_kernel_size (int, optional): The convolution kernel size of the encoder/decoder, . + enc_num_feats (int, optional): The feature dimensions passed to mask generator, . + msk_kernel_size (int, optional): The convolution kernel size of the mask generator,
. + msk_num_feats (int, optional): The input/output feature dimension of conv block in the mask generator, . + msk_num_hidden_feats (int, optional): The internal feature dimension of conv block of the mask generator, . + msk_num_layers (int, optional): The number of layers in one conv block of the mask generator, . + msk_num_stacks (int, optional): The numbr of conv blocks of the mask generator, . + msk_activate (str, optional): The activation function of the mask output (Default: ``sigmoid``). + + Note: + This implementation corresponds to the "non-causal" setting in the paper. + """ + + def __init__( + self, + num_sources: int = 2, + # encoder/decoder parameters + enc_kernel_size: int = 16, + enc_num_feats: int = 512, + # mask generator parameters + msk_kernel_size: int = 3, + msk_num_feats: int = 128, + msk_num_hidden_feats: int = 512, + msk_num_layers: int = 8, + msk_num_stacks: int = 3, + msk_activate: str = "sigmoid", + ): + super().__init__() + + self.num_sources = num_sources + self.enc_num_feats = enc_num_feats + self.enc_kernel_size = enc_kernel_size + self.enc_stride = enc_kernel_size // 2 + + self.encoder = torch.nn.Conv1d( + in_channels=1, + out_channels=enc_num_feats, + kernel_size=enc_kernel_size, + stride=self.enc_stride, + padding=self.enc_stride, + bias=False, + ) + self.mask_generator = MaskGenerator( + input_dim=enc_num_feats, + num_sources=num_sources, + kernel_size=msk_kernel_size, + num_feats=msk_num_feats, + num_hidden=msk_num_hidden_feats, + num_layers=msk_num_layers, + num_stacks=msk_num_stacks, + msk_activate=msk_activate, + ) + self.decoder = torch.nn.ConvTranspose1d( + in_channels=enc_num_feats, + out_channels=1, + kernel_size=enc_kernel_size, + stride=self.enc_stride, + padding=self.enc_stride, + bias=False, + ) + + def _align_num_frames_with_strides(self, input: torch.Tensor) -> Tuple[torch.Tensor, int]: + """Pad input Tensor so that the end of the input tensor corresponds with + + 1. (if kernel size is odd) the center of the last convolution kernel + or 2. (if kernel size is even) the end of the first half of the last convolution kernel + + Assumption: + The resulting Tensor will be padded with the size of stride (== kernel_width // 2) + on the both ends in Conv1D + + |<--- k_1 --->| + | | |<-- k_n-1 -->| + | | | |<--- k_n --->| + | | | | | + | | | | | + | v v v | + |<---->|<--- input signal --->|<--->|<---->| + stride PAD stride + + Args: + input (torch.Tensor): 3D Tensor with shape (batch_size, channels==1, frames) + + Returns: + Tensor: Padded Tensor + int: Number of paddings performed + """ + batch_size, num_channels, num_frames = input.shape + is_odd = self.enc_kernel_size % 2 + num_strides = (num_frames - is_odd) // self.enc_stride + num_remainings = num_frames - (is_odd + num_strides * self.enc_stride) + if num_remainings == 0: + return input, 0 + + num_paddings = self.enc_stride - num_remainings + pad = torch.zeros( + batch_size, + num_channels, + num_paddings, + dtype=input.dtype, + device=input.device, + ) + return torch.cat([input, pad], 2), num_paddings + + def forward(self, input: torch.Tensor) -> torch.Tensor: + """Perform source separation. Generate audio source waveforms. + + Args: + input (torch.Tensor): 3D Tensor with shape [batch, channel==1, frames] + + Returns: + Tensor: 3D Tensor with shape [batch, channel==num_sources, frames] + """ + if input.ndim != 3 or input.shape[1] != 1: + raise ValueError(f"Expected 3D tensor (batch, channel==1, frames). 
Found: {input.shape}") + + # B: batch size + # L: input frame length + # L': padded input frame length + # F: feature dimension + # M: feature frame length + # S: number of sources + + padded, num_pads = self._align_num_frames_with_strides(input) # B, 1, L' + batch_size, num_padded_frames = padded.shape[0], padded.shape[2] + feats = self.encoder(padded) # B, F, M + masked = self.mask_generator(feats) * feats.unsqueeze(1) # B, S, F, M + masked = masked.view(batch_size * self.num_sources, self.enc_num_feats, -1) # B*S, F, M + decoded = self.decoder(masked) # B*S, 1, L' + output = decoded.view(batch_size, self.num_sources, num_padded_frames) # B, S, L' + if num_pads > 0: + output = output[..., :-num_pads] # B, S, L + return output diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/deepspeech.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/deepspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..e279498e4916a9ff7a943af36853fee392d0b240 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/deepspeech.py @@ -0,0 +1,85 @@ +import torch + +__all__ = ["DeepSpeech"] + + +class FullyConnected(torch.nn.Module): + """ + Args: + n_feature: Number of input features + n_hidden: Internal hidden unit size. + """ + + def __init__(self, n_feature: int, n_hidden: int, dropout: float, relu_max_clip: int = 20) -> None: + super(FullyConnected, self).__init__() + self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True) + self.relu_max_clip = relu_max_clip + self.dropout = dropout + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc(x) + x = torch.nn.functional.relu(x) + x = torch.nn.functional.hardtanh(x, 0, self.relu_max_clip) + if self.dropout: + x = torch.nn.functional.dropout(x, self.dropout, self.training) + return x + + +class DeepSpeech(torch.nn.Module): + """ + DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition* + [:footcite:`hannun2014deep`]. + + Args: + n_feature: Number of input features + n_hidden: Internal hidden unit size. + n_class: Number of output classes + """ + + def __init__( + self, + n_feature: int, + n_hidden: int = 2048, + n_class: int = 40, + dropout: float = 0.0, + ) -> None: + super(DeepSpeech, self).__init__() + self.n_hidden = n_hidden + self.fc1 = FullyConnected(n_feature, n_hidden, dropout) + self.fc2 = FullyConnected(n_hidden, n_hidden, dropout) + self.fc3 = FullyConnected(n_hidden, n_hidden, dropout) + self.bi_rnn = torch.nn.RNN(n_hidden, n_hidden, num_layers=1, nonlinearity="relu", bidirectional=True) + self.fc4 = FullyConnected(n_hidden, n_hidden, dropout) + self.out = torch.nn.Linear(n_hidden, n_class) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x (torch.Tensor): Tensor of dimension (batch, channel, time, feature). + Returns: + Tensor: Predictor tensor of dimension (batch, time, class). 
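        Example (illustrative sketch; the sizes below are assumed toy values, not defaults):
            >>> model = DeepSpeech(n_feature=40, n_hidden=256, n_class=29)
            >>> x = torch.rand(8, 1, 100, 40)  # (batch, channel, time, feature)
            >>> model(x).shape
            torch.Size([8, 100, 29])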
+ """ + # N x C x T x F + x = self.fc1(x) + # N x C x T x H + x = self.fc2(x) + # N x C x T x H + x = self.fc3(x) + # N x C x T x H + x = x.squeeze(1) + # N x T x H + x = x.transpose(0, 1) + # T x N x H + x, _ = self.bi_rnn(x) + # The fifth (non-recurrent) layer takes both the forward and backward units as inputs + x = x[:, :, : self.n_hidden] + x[:, :, self.n_hidden :] + # T x N x H + x = self.fc4(x) + # T x N x H + x = self.out(x) + # T x N x n_class + x = x.permute(1, 0, 2) + # N x T x n_class + x = torch.nn.functional.log_softmax(x, dim=2) + # N x T x n_class + return x diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/emformer.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/emformer.py new file mode 100644 index 0000000000000000000000000000000000000000..72de3ddcb7ae027859d0a76dfcfd157ff5a91d08 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/emformer.py @@ -0,0 +1,876 @@ +import math +from typing import List, Optional, Tuple + +import torch + + +__all__ = ["Emformer"] + + +def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: + batch_size = lengths.shape[0] + max_length = int(torch.max(lengths).item()) + padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( + batch_size, max_length + ) >= lengths.unsqueeze(1) + return padding_mask + + +def _gen_padding_mask( + utterance: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + lengths: torch.Tensor, + mems: torch.Tensor, + left_context_key: Optional[torch.Tensor] = None, +) -> Optional[torch.Tensor]: + T = right_context.size(0) + utterance.size(0) + summary.size(0) + B = right_context.size(1) + if B == 1: + padding_mask = None + else: + right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0) + left_context_blocks_length = left_context_key.size(0) if left_context_key is not None else 0 + klengths = lengths + mems.size(0) + right_context_blocks_length + left_context_blocks_length + padding_mask = _lengths_to_padding_mask(lengths=klengths) + return padding_mask + + +def _get_activation_module(activation: str) -> torch.nn.Module: + if activation == "relu": + return torch.nn.ReLU() + elif activation == "gelu": + return torch.nn.GELU() + elif activation == "silu": + return torch.nn.SiLU() + else: + raise ValueError(f"Unsupported activation {activation}") + + +def _get_weight_init_gains(weight_init_scale_strategy: Optional[str], num_layers: int) -> List[Optional[float]]: + if weight_init_scale_strategy is None: + return [None for _ in range(num_layers)] + elif weight_init_scale_strategy == "depthwise": + return [1.0 / math.sqrt(layer_idx + 1) for layer_idx in range(num_layers)] + elif weight_init_scale_strategy == "constant": + return [1.0 / math.sqrt(2) for layer_idx in range(num_layers)] + else: + raise ValueError(f"Unsupported weight_init_scale_strategy value {weight_init_scale_strategy}") + + +def _gen_attention_mask_block( + col_widths: List[int], col_mask: List[bool], num_rows: int, device: torch.device +) -> torch.Tensor: + assert len(col_widths) == len(col_mask), "Length of col_widths must match that of col_mask" + + mask_block = [ + torch.ones(num_rows, col_width, device=device) + if is_ones_col + else torch.zeros(num_rows, col_width, device=device) + for col_width, is_ones_col in zip(col_widths, col_mask) + ] + return torch.cat(mask_block, dim=1) + + +class _EmformerAttention(torch.nn.Module): + 
r"""Emformer layer attention module. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads in each Emformer layer. + dropout (float, optional): dropout probability. (Default: 0.0) + weight_init_gain (float or None, optional): scale factor to apply when initializing + attention module parameters. (Default: ``None``) + tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``) + negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + dropout: float = 0.0, + weight_init_gain: Optional[float] = None, + tanh_on_mem: bool = False, + negative_inf: float = -1e8, + ): + super().__init__() + + if input_dim % num_heads != 0: + raise ValueError(f"input_dim ({input_dim}) is not a multiple of num_heads ({num_heads}).") + + self.input_dim = input_dim + self.num_heads = num_heads + self.dropout = dropout + self.tanh_on_mem = tanh_on_mem + self.negative_inf = negative_inf + + self.scaling = (self.input_dim // self.num_heads) ** -0.5 + + self.emb_to_key_value = torch.nn.Linear(input_dim, 2 * input_dim, bias=True) + self.emb_to_query = torch.nn.Linear(input_dim, input_dim, bias=True) + self.out_proj = torch.nn.Linear(input_dim, input_dim, bias=True) + + if weight_init_gain: + torch.nn.init.xavier_uniform_(self.emb_to_key_value.weight, gain=weight_init_gain) + torch.nn.init.xavier_uniform_(self.emb_to_query.weight, gain=weight_init_gain) + + def _gen_key_value(self, input: torch.Tensor, mems: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + T, _, _ = input.shape + summary_length = mems.size(0) + 1 + right_ctx_utterance_block = input[: T - summary_length] + mems_right_ctx_utterance_block = torch.cat([mems, right_ctx_utterance_block]) + key, value = self.emb_to_key_value(mems_right_ctx_utterance_block).chunk(chunks=2, dim=2) + return key, value + + def _gen_attention_probs( + self, + attention_weights: torch.Tensor, + attention_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor], + ) -> torch.Tensor: + attention_weights_float = attention_weights.float() + attention_weights_float = attention_weights_float.masked_fill(attention_mask.unsqueeze(0), self.negative_inf) + T = attention_weights.size(1) + B = attention_weights.size(0) // self.num_heads + if padding_mask is not None: + attention_weights_float = attention_weights_float.view(B, self.num_heads, T, -1) + attention_weights_float = attention_weights_float.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), self.negative_inf + ) + attention_weights_float = attention_weights_float.view(B * self.num_heads, T, -1) + attention_probs = torch.nn.functional.softmax(attention_weights_float, dim=-1).type_as(attention_weights) + return torch.nn.functional.dropout(attention_probs, p=float(self.dropout), training=self.training) + + def _forward_impl( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + mems: torch.Tensor, + attention_mask: torch.Tensor, + left_context_key: Optional[torch.Tensor] = None, + left_context_val: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + B = utterance.size(1) + T = right_context.size(0) + utterance.size(0) + summary.size(0) + + # Compute query with [right context, utterance, summary]. 
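# Along the time axis, the query spans R + T + S rows (right context, utterance,
# summary), while the key/value computed below span M + R + T rows (mems, right
# context, utterance); during streaming inference, cached left-context keys/values
# are spliced in immediately after the mems + right-context rows.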
+ query = self.emb_to_query(torch.cat([right_context, utterance, summary])) + + # Compute key and value with [mems, right context, utterance]. + key, value = self.emb_to_key_value(torch.cat([mems, right_context, utterance])).chunk(chunks=2, dim=2) + + if left_context_key is not None and left_context_val is not None: + right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0) + key = torch.cat( + [ + key[: mems.size(0) + right_context_blocks_length], + left_context_key, + key[mems.size(0) + right_context_blocks_length :], + ], + ) + value = torch.cat( + [ + value[: mems.size(0) + right_context_blocks_length], + left_context_val, + value[mems.size(0) + right_context_blocks_length :], + ], + ) + + # Compute attention weights from query, key, and value. + reshaped_query, reshaped_key, reshaped_value = [ + tensor.contiguous().view(-1, B * self.num_heads, self.input_dim // self.num_heads).transpose(0, 1) + for tensor in [query, key, value] + ] + attention_weights = torch.bmm(reshaped_query * self.scaling, reshaped_key.transpose(1, 2)) + + # Compute padding mask. + padding_mask = _gen_padding_mask(utterance, right_context, summary, lengths, mems, left_context_key) + + # Compute attention probabilities. + attention_probs = self._gen_attention_probs(attention_weights, attention_mask, padding_mask) + + # Compute attention. + attention = torch.bmm(attention_probs, reshaped_value) + assert attention.shape == ( + B * self.num_heads, + T, + self.input_dim // self.num_heads, + ) + attention = attention.transpose(0, 1).contiguous().view(T, B, self.input_dim) + + # Apply output projection. + output_right_context_mems = self.out_proj(attention) + + summary_length = summary.size(0) + output_right_context = output_right_context_mems[: T - summary_length] + output_mems = output_right_context_mems[T - summary_length :] + if self.tanh_on_mem: + output_mems = torch.tanh(output_mems) + else: + output_mems = torch.clamp(output_mems, min=-10, max=10) + + return output_right_context, output_mems, key, value + + def forward( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + mems: torch.Tensor, + attention_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + S: number of summary elements; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + summary (torch.Tensor): summary elements, with shape `(S, B, D)`. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + attention_mask (torch.Tensor): attention mask for underlying attention module. + + Returns: + (Tensor, Tensor): + Tensor + output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`. + Tensor + updated memory elements, with shape `(M, B, D)`. 
+ """ + output, output_mems, _, _ = self._forward_impl(utterance, lengths, right_context, summary, mems, attention_mask) + return output, output_mems[:-1] + + @torch.jit.export + def infer( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + mems: torch.Tensor, + left_context_key: torch.Tensor, + left_context_val: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for inference. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + S: number of summary elements; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + summary (torch.Tensor): summary elements, with shape `(S, B, D)`. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + left_context_key (torch.Tensor): left context attention key computed from preceding invocation. + left_context_val (torch.Tensor): left context attention value computed from preceding invocation. + + Returns: + (Tensor, Tensor, Tensor, and Tensor): + Tensor + output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`. + Tensor + updated memory elements, with shape `(M, B, D)`. + Tensor + attention key computed for left context and utterance. + Tensor + attention value computed for left context and utterance. + """ + query_dim = right_context.size(0) + utterance.size(0) + summary.size(0) + key_dim = right_context.size(0) + utterance.size(0) + mems.size(0) + left_context_key.size(0) + attention_mask = torch.zeros(query_dim, key_dim).to(dtype=torch.bool, device=utterance.device) + attention_mask[-1, : mems.size(0)] = True + output, output_mems, key, value = self._forward_impl( + utterance, + lengths, + right_context, + summary, + mems, + attention_mask, + left_context_key=left_context_key, + left_context_val=left_context_val, + ) + return ( + output, + output_mems, + key[mems.size(0) + right_context.size(0) :], + value[mems.size(0) + right_context.size(0) :], + ) + + +class _EmformerLayer(torch.nn.Module): + r"""Emformer layer that constitutes Emformer. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads. + ffn_dim: (int): hidden layer dimension of feedforward network. + segment_length (int): length of each input segment. + dropout (float, optional): dropout probability. (Default: 0.0) + activation (str, optional): activation function to use in feedforward network. + Must be one of ("relu", "gelu", "silu"). (Default: "relu") + left_context_length (int, optional): length of left context. (Default: 0) + max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0) + weight_init_gain (float or None, optional): scale factor to apply when initializing + attention module parameters. (Default: ``None``) + tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``) + negative_inf (float, optional): value to use for negative infinity in attention weights. 
(Default: -1e8) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + ffn_dim: int, + segment_length: int, + dropout: float = 0.0, + activation: str = "relu", + left_context_length: int = 0, + max_memory_size: int = 0, + weight_init_gain: Optional[float] = None, + tanh_on_mem: bool = False, + negative_inf: float = -1e8, + ): + super().__init__() + + self.attention = _EmformerAttention( + input_dim=input_dim, + num_heads=num_heads, + dropout=dropout, + weight_init_gain=weight_init_gain, + tanh_on_mem=tanh_on_mem, + negative_inf=negative_inf, + ) + self.dropout = torch.nn.Dropout(dropout) + self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True) + + activation_module = _get_activation_module(activation) + self.pos_ff = torch.nn.Sequential( + torch.nn.LayerNorm(input_dim), + torch.nn.Linear(input_dim, ffn_dim), + activation_module, + torch.nn.Dropout(dropout), + torch.nn.Linear(ffn_dim, input_dim), + torch.nn.Dropout(dropout), + ) + self.layer_norm_input = torch.nn.LayerNorm(input_dim) + self.layer_norm_output = torch.nn.LayerNorm(input_dim) + + self.left_context_length = left_context_length + self.segment_length = segment_length + self.max_memory_size = max_memory_size + self.input_dim = input_dim + + self.use_mem = max_memory_size > 0 + + def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]: + empty_memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device) + left_context_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device) + left_context_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device) + past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device) + return [empty_memory, left_context_key, left_context_val, past_length] + + def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + past_length = state[3][0][0].item() + past_left_context_length = min(self.left_context_length, past_length) + past_mem_length = min(self.max_memory_size, math.ceil(past_length / self.segment_length)) + pre_mems = state[0][self.max_memory_size - past_mem_length :] + lc_key = state[1][self.left_context_length - past_left_context_length :] + lc_val = state[2][self.left_context_length - past_left_context_length :] + return pre_mems, lc_key, lc_val + + def _pack_state( + self, + next_k: torch.Tensor, + next_v: torch.Tensor, + update_length: int, + mems: torch.Tensor, + state: List[torch.Tensor], + ) -> List[torch.Tensor]: + new_k = torch.cat([state[1], next_k]) + new_v = torch.cat([state[2], next_v]) + state[0] = torch.cat([state[0], mems])[-self.max_memory_size :] + state[1] = new_k[new_k.shape[0] - self.left_context_length :] + state[2] = new_v[new_v.shape[0] - self.left_context_length :] + state[3] = state[3] + update_length + return state + + def _process_attention_output( + self, + rc_output: torch.Tensor, + utterance: torch.Tensor, + right_context: torch.Tensor, + ) -> torch.Tensor: + result = self.dropout(rc_output) + torch.cat([right_context, utterance]) + result = self.pos_ff(result) + result + result = self.layer_norm_output(result) + return result + + def _apply_pre_attention_layer_norm( + self, utterance: torch.Tensor, right_context: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + layer_norm_input = self.layer_norm_input(torch.cat([right_context, utterance])) + return ( + layer_norm_input[right_context.size(0) :], + layer_norm_input[: 
right_context.size(0)], + ) + + def _apply_post_attention_ffn( + self, rc_output: torch.Tensor, utterance: torch.Tensor, right_context: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + rc_output = self._process_attention_output(rc_output, utterance, right_context) + return rc_output[right_context.size(0) :], rc_output[: right_context.size(0)] + + def _apply_attention_forward( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + mems: torch.Tensor, + attention_mask: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + if attention_mask is None: + raise ValueError("attention_mask must be not None when for_inference is False") + + if self.use_mem: + summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1) + else: + summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device) + rc_output, next_m = self.attention( + utterance=utterance, + lengths=lengths, + right_context=right_context, + summary=summary, + mems=mems, + attention_mask=attention_mask, + ) + return rc_output, next_m + + def _apply_attention_infer( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + mems: torch.Tensor, + state: Optional[List[torch.Tensor]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: + if state is None: + state = self._init_state(utterance.size(1), device=utterance.device) + pre_mems, lc_key, lc_val = self._unpack_state(state) + if self.use_mem: + summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1) + summary = summary[:1] + else: + summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device) + rc_output, next_m, next_k, next_v = self.attention.infer( + utterance=utterance, + lengths=lengths, + right_context=right_context, + summary=summary, + mems=pre_mems, + left_context_key=lc_key, + left_context_val=lc_val, + ) + state = self._pack_state(next_k, next_v, utterance.size(0), mems, state) + return rc_output, next_m, state + + def forward( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + mems: torch.Tensor, + attention_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + attention_mask (torch.Tensor): attention mask for underlying attention module. + + Returns: + (Tensor, Tensor, Tensor): + Tensor + encoded utterance frames, with shape `(T, B, D)`. + Tensor + updated right context frames, with shape `(R, B, D)`. + Tensor + updated memory elements, with shape `(M, B, D)`. 
+ """ + ( + layer_norm_utterance, + layer_norm_right_context, + ) = self._apply_pre_attention_layer_norm(utterance, right_context) + rc_output, output_mems = self._apply_attention_forward( + layer_norm_utterance, + lengths, + layer_norm_right_context, + mems, + attention_mask, + ) + output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context) + return output_utterance, output_right_context, output_mems + + @torch.jit.export + def infer( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + state: Optional[List[torch.Tensor]], + mems: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]: + r"""Forward pass for inference. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + state (List[torch.Tensor] or None): list of tensors representing layer internal state + generated in preceding invocation of ``infer``. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + + Returns: + (Tensor, Tensor, List[torch.Tensor], Tensor): + Tensor + encoded utterance frames, with shape `(T, B, D)`. + Tensor + updated right context frames, with shape `(R, B, D)`. + List[Tensor] + list of tensors representing layer internal state + generated in current invocation of ``infer``. + Tensor + updated memory elements, with shape `(M, B, D)`. + """ + ( + layer_norm_utterance, + layer_norm_right_context, + ) = self._apply_pre_attention_layer_norm(utterance, right_context) + rc_output, output_mems, output_state = self._apply_attention_infer( + layer_norm_utterance, lengths, layer_norm_right_context, mems, state + ) + output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context) + return output_utterance, output_right_context, output_state, output_mems + + +class _EmformerImpl(torch.nn.Module): + def __init__( + self, + emformer_layers: torch.nn.ModuleList, + segment_length: int, + left_context_length: int = 0, + right_context_length: int = 0, + max_memory_size: int = 0, + ): + super().__init__() + + self.use_mem = max_memory_size > 0 + self.memory_op = torch.nn.AvgPool1d( + kernel_size=segment_length, + stride=segment_length, + ceil_mode=True, + ) + self.emformer_layers = emformer_layers + self.left_context_length = left_context_length + self.right_context_length = right_context_length + self.segment_length = segment_length + self.max_memory_size = max_memory_size + + def _gen_right_context(self, input: torch.Tensor) -> torch.Tensor: + T = input.shape[0] + num_segs = math.ceil((T - self.right_context_length) / self.segment_length) + right_context_blocks = [] + for seg_idx in range(num_segs - 1): + start = (seg_idx + 1) * self.segment_length + end = start + self.right_context_length + right_context_blocks.append(input[start:end]) + right_context_blocks.append(input[T - self.right_context_length :]) + return torch.cat(right_context_blocks) + + def _gen_attention_mask_col_widths(self, seg_idx: int, utterance_length: int) -> List[int]: + num_segs = math.ceil(utterance_length / self.segment_length) + rc = self.right_context_length + lc = 
self.left_context_length + rc_start = seg_idx * rc + rc_end = rc_start + rc + seg_start = max(seg_idx * self.segment_length - lc, 0) + seg_end = min((seg_idx + 1) * self.segment_length, utterance_length) + rc_length = self.right_context_length * num_segs + + if self.use_mem: + m_start = max(seg_idx - self.max_memory_size, 0) + mem_length = num_segs - 1 + col_widths = [ + m_start, # before memory + seg_idx - m_start, # memory + mem_length - seg_idx, # after memory + rc_start, # before right context + rc, # right context + rc_length - rc_end, # after right context + seg_start, # before query segment + seg_end - seg_start, # query segment + utterance_length - seg_end, # after query segment + ] + else: + col_widths = [ + rc_start, # before right context + rc, # right context + rc_length - rc_end, # after right context + seg_start, # before query segment + seg_end - seg_start, # query segment + utterance_length - seg_end, # after query segment + ] + + return col_widths + + def _gen_attention_mask(self, input: torch.Tensor) -> torch.Tensor: + utterance_length = input.size(0) + num_segs = math.ceil(utterance_length / self.segment_length) + + rc_mask = [] + query_mask = [] + summary_mask = [] + + if self.use_mem: + num_cols = 9 + # memory, right context, query segment + rc_q_cols_mask = [idx in [1, 4, 7] for idx in range(num_cols)] + # right context, query segment + s_cols_mask = [idx in [4, 7] for idx in range(num_cols)] + masks_to_concat = [rc_mask, query_mask, summary_mask] + else: + num_cols = 6 + # right context, query segment + rc_q_cols_mask = [idx in [1, 4] for idx in range(num_cols)] + s_cols_mask = None + masks_to_concat = [rc_mask, query_mask] + + for seg_idx in range(num_segs): + col_widths = self._gen_attention_mask_col_widths(seg_idx, utterance_length) + + rc_mask_block = _gen_attention_mask_block( + col_widths, rc_q_cols_mask, self.right_context_length, input.device + ) + rc_mask.append(rc_mask_block) + + query_mask_block = _gen_attention_mask_block( + col_widths, + rc_q_cols_mask, + min( + self.segment_length, + utterance_length - seg_idx * self.segment_length, + ), + input.device, + ) + query_mask.append(query_mask_block) + + if s_cols_mask is not None: + summary_mask_block = _gen_attention_mask_block(col_widths, s_cols_mask, 1, input.device) + summary_mask.append(summary_mask_block) + + attention_mask = (1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])).to(torch.bool) + return attention_mask + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass for training and non-streaming inference. + + B: batch size; + T: max number of input frames in batch; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): utterance frames right-padded with right context frames, with + shape `(B, T + right_context_length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid utterance frames for i-th batch element in ``input``. + + Returns: + (Tensor, Tensor): + Tensor + output frames, with shape `(B, T, D)`. + Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output frames. 
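        Example (illustrative sketch; segment_length=4 and right_context_length=2 are assumed toy values):
            >>> # With 10 total frames (8 utterance frames right-padded with 2 right-context
            >>> # frames), math.ceil((10 - 2) / 4) == 2 segments are formed, and
            >>> # _gen_right_context gathers frames [4, 5] (context of segment 0) and
            >>> # frames [8, 9] (context of the last segment) into one (4, B, D) tensor.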
+ """ + input = input.permute(1, 0, 2) + right_context = self._gen_right_context(input) + utterance = input[: input.size(0) - self.right_context_length] + attention_mask = self._gen_attention_mask(utterance) + mems = ( + self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1] + if self.use_mem + else torch.empty(0).to(dtype=input.dtype, device=input.device) + ) + output = utterance + for layer in self.emformer_layers: + output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask) + return output.permute(1, 0, 2), lengths + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass for streaming inference. + + B: batch size; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): utterance frames right-padded with right context frames, with + shape `(B, segment_length + right_context_length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + states (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation of ``infer``. (Default: ``None``) + + Returns: + (Tensor, Tensor, List[List[Tensor]]): + Tensor + output frames, with shape `(B, segment_length, D)`. + Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output frames. + List[List[Tensor]] + output states; list of lists of tensors representing internal state + generated in current invocation of ``infer``. + """ + assert input.size(1) == self.segment_length + self.right_context_length, ( + "Per configured segment_length and right_context_length" + f", expected size of {self.segment_length + self.right_context_length} for dimension 1 of input" + f", but got {input.size(1)}." + ) + input = input.permute(1, 0, 2) + right_context_start_idx = input.size(0) - self.right_context_length + right_context = input[right_context_start_idx:] + utterance = input[:right_context_start_idx] + output_lengths = torch.clamp(lengths - self.right_context_length, min=0) + mems = ( + self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1) + if self.use_mem + else torch.empty(0).to(dtype=input.dtype, device=input.device) + ) + output = utterance + output_states: List[List[torch.Tensor]] = [] + for layer_idx, layer in enumerate(self.emformer_layers): + output, right_context, output_state, mems = layer.infer( + output, + output_lengths, + right_context, + None if states is None else states[layer_idx], + mems, + ) + output_states.append(output_state) + + return output.permute(1, 0, 2), output_lengths, output_states + + +class Emformer(_EmformerImpl): + r"""Implements the Emformer architecture introduced in + *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition* + [:footcite:`shi2021emformer`]. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads in each Emformer layer. + ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network. + num_layers (int): number of Emformer layers to instantiate. + segment_length (int): length of each input segment. + dropout (float, optional): dropout probability. 
(Default: 0.0) + activation (str, optional): activation function to use in each Emformer layer's + feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu") + left_context_length (int, optional): length of left context. (Default: 0) + right_context_length (int, optional): length of right context. (Default: 0) + max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0) + weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling + strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise") + tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``) + negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8) + + Examples: + >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1) + >>> input = torch.rand(128, 400, 512) # batch, num_frames, feature_dim + >>> lengths = torch.randint(1, 200, (128,)) # batch + >>> output, lengths = emformer(input, lengths) + >>> input = torch.rand(128, 5, 512) + >>> lengths = torch.ones(128) * 5 + >>> output, lengths, states = emformer.infer(input, lengths, None) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + ffn_dim: int, + num_layers: int, + segment_length: int, + dropout: float = 0.0, + activation: str = "relu", + left_context_length: int = 0, + right_context_length: int = 0, + max_memory_size: int = 0, + weight_init_scale_strategy: Optional[str] = "depthwise", + tanh_on_mem: bool = False, + negative_inf: float = -1e8, + ): + weight_init_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers) + emformer_layers = torch.nn.ModuleList( + [ + _EmformerLayer( + input_dim, + num_heads, + ffn_dim, + segment_length, + dropout=dropout, + activation=activation, + left_context_length=left_context_length, + max_memory_size=max_memory_size, + weight_init_gain=weight_init_gains[layer_idx], + tanh_on_mem=tanh_on_mem, + negative_inf=negative_inf, + ) + for layer_idx in range(num_layers) + ] + ) + super().__init__( + emformer_layers, + segment_length, + left_context_length=left_context_length, + right_context_length=right_context_length, + max_memory_size=max_memory_size, + ) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt.py new file mode 100644 index 0000000000000000000000000000000000000000..e7cfee6dd9a253b8829492962110c2f494f29e16 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt.py @@ -0,0 +1,813 @@ +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple + +import torch +from torchaudio.models import Emformer + + +__all__ = ["RNNT", "emformer_rnnt_base", "emformer_rnnt_model"] + + +class _TimeReduction(torch.nn.Module): + r"""Coalesces frames along time dimension into a + fewer number of frames with higher feature dimensionality. + + Args: + stride (int): number of frames to merge for each output frame. + """ + + def __init__(self, stride: int) -> None: + super().__init__() + self.stride = stride + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass. + + B: batch size; + T: maximum input sequence length in batch; + D: feature dimension of each input sequence frame. 
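        Example (illustrative sketch; stride and shapes are assumed toy values):
            >>> time_reduction = _TimeReduction(stride=4)
            >>> input = torch.rand(2, 10, 3)  # (B, T, D); the trailing 10 % 4 = 2 frames are dropped
            >>> lengths = torch.tensor([10, 7])
            >>> output, out_lengths = time_reduction(input, lengths)
            >>> output.shape, out_lengths
            (torch.Size([2, 2, 12]), tensor([2, 1]))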
+ + Args: + input (torch.Tensor): input sequences, with shape `(B, T, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor + output sequences, with shape + `(B, T // stride, D * stride)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output sequences. + """ + B, T, D = input.shape + num_frames = T - (T % self.stride) + input = input[:, :num_frames, :] + lengths = lengths.div(self.stride, rounding_mode="trunc") + T_max = num_frames // self.stride + + output = input.reshape(B, T_max, D * self.stride) + output = output.contiguous() + return output, lengths + + +class _CustomLSTM(torch.nn.Module): + r"""Custom long-short-term memory (LSTM) block that applies layer normalization + to internal nodes. + + Args: + input_dim (int): input dimension. + hidden_dim (int): hidden dimension. + layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``) + layer_norm_epsilon (float, optional): value of epsilon to use in + layer normalization layers (Default: 1e-5) + """ + + def __init__( + self, + input_dim: int, + hidden_dim: int, + layer_norm: bool = False, + layer_norm_epsilon: float = 1e-5, + ) -> None: + super().__init__() + self.x2g = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=(not layer_norm)) + self.p2g = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=False) + if layer_norm: + self.c_norm = torch.nn.LayerNorm(hidden_dim, eps=layer_norm_epsilon) + self.g_norm = torch.nn.LayerNorm(4 * hidden_dim, eps=layer_norm_epsilon) + else: + self.c_norm = torch.nn.Identity() + self.g_norm = torch.nn.Identity() + + self.hidden_dim = hidden_dim + + def forward( + self, input: torch.Tensor, state: Optional[List[torch.Tensor]] + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + r"""Forward pass. + + B: batch size; + T: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): with shape `(T, B, D)`. + state (List[torch.Tensor] or None): list of tensors + representing internal state generated in preceding invocation + of ``forward``. + + Returns: + (torch.Tensor, List[torch.Tensor]): + torch.Tensor + output, with shape `(T, B, hidden_dim)`. + List[torch.Tensor] + list of tensors representing internal state generated + in current invocation of ``forward``. 
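        Example (illustrative sketch; dimensions are assumed toy values):
            >>> lstm = _CustomLSTM(input_dim=8, hidden_dim=16, layer_norm=True)
            >>> x = torch.rand(5, 3, 8)  # (T, B, D)
            >>> output, state = lstm(x, None)  # output: (5, 3, 16); state: [h, c], each (3, 16)
            >>> output, state = lstm(torch.rand(5, 3, 8), state)  # continue from the returned state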
+ """ + if state is None: + B = input.size(1) + h = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype) + c = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype) + else: + h, c = state + + gated_input = self.x2g(input) + outputs = [] + for gates in gated_input.unbind(0): + gates = gates + self.p2g(h) + gates = self.g_norm(gates) + input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1) + input_gate = input_gate.sigmoid() + forget_gate = forget_gate.sigmoid() + cell_gate = cell_gate.tanh() + output_gate = output_gate.sigmoid() + c = forget_gate * c + input_gate * cell_gate + c = self.c_norm(c) + h = output_gate * c.tanh() + outputs.append(h) + + output = torch.stack(outputs, dim=0) + state = [h, c] + + return output, state + + +class _Transcriber(ABC): + @abstractmethod + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + pass + + @abstractmethod + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + pass + + +class _EmformerEncoder(torch.nn.Module, _Transcriber): + r"""Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network). + + Args: + input_dim (int): feature dimension of each input sequence element. + output_dim (int): feature dimension of each output sequence element. + segment_length (int): length of input segment expressed as number of frames. + right_context_length (int): length of right context expressed as number of frames. + time_reduction_input_dim (int): dimension to scale each element in input sequences to + prior to applying time reduction block. + time_reduction_stride (int): factor by which to reduce length of input sequence. + transformer_num_heads (int): number of attention heads in each Emformer layer. + transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network. + transformer_num_layers (int): number of Emformer layers to instantiate. + transformer_left_context_length (int): length of left context. + transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0) + transformer_activation (str, optional): activation function to use in each Emformer layer's + feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu") + transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0) + transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling + strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise") + transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. 
(Default: ``False``) + """ + + def __init__( + self, + *, + input_dim: int, + output_dim: int, + segment_length: int, + right_context_length: int, + time_reduction_input_dim: int, + time_reduction_stride: int, + transformer_num_heads: int, + transformer_ffn_dim: int, + transformer_num_layers: int, + transformer_left_context_length: int, + transformer_dropout: float = 0.0, + transformer_activation: str = "relu", + transformer_max_memory_size: int = 0, + transformer_weight_init_scale_strategy: str = "depthwise", + transformer_tanh_on_mem: bool = False, + ) -> None: + super().__init__() + self.input_linear = torch.nn.Linear( + input_dim, + time_reduction_input_dim, + bias=False, + ) + self.time_reduction = _TimeReduction(time_reduction_stride) + transformer_input_dim = time_reduction_input_dim * time_reduction_stride + self.transformer = Emformer( + transformer_input_dim, + transformer_num_heads, + transformer_ffn_dim, + transformer_num_layers, + segment_length // time_reduction_stride, + dropout=transformer_dropout, + activation=transformer_activation, + left_context_length=transformer_left_context_length, + right_context_length=right_context_length // time_reduction_stride, + max_memory_size=transformer_max_memory_size, + weight_init_scale_strategy=transformer_weight_init_scale_strategy, + tanh_on_mem=transformer_tanh_on_mem, + ) + self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim) + self.layer_norm = torch.nn.LayerNorm(output_dim) + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum input sequence length in batch; + D: feature dimension of each input sequence frame (input_dim). + + Args: + input (torch.Tensor): input frame sequences right-padded with right context, with + shape `(B, T + right context length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output input lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output frame sequences. + """ + input_linear_out = self.input_linear(input) + time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths) + transformer_out, transformer_lengths = self.transformer(time_reduction_out, time_reduction_lengths) + output_linear_out = self.output_linear(transformer_out) + layer_norm_out = self.layer_norm(output_linear_out) + return layer_norm_out, transformer_lengths + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass for inference. + + B: batch size; + T: maximum input sequence segment length in batch; + D: feature dimension of each input sequence frame (input_dim). + + Args: + input (torch.Tensor): input frame sequence segments right-padded with right context, with + shape `(B, T + right context length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None): list of lists of tensors + representing internal state generated in preceding invocation + of ``infer``. 
+ + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output input lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation + of ``infer``. + """ + input_linear_out = self.input_linear(input) + time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths) + ( + transformer_out, + transformer_lengths, + transformer_states, + ) = self.transformer.infer(time_reduction_out, time_reduction_lengths, states) + output_linear_out = self.output_linear(transformer_out) + layer_norm_out = self.layer_norm(output_linear_out) + return layer_norm_out, transformer_lengths, transformer_states + + +class _Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Args: + num_symbols (int): size of target token lexicon. + output_dim (int): feature dimension of each output sequence element. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_hidden_dim (int): output dimension of each LSTM layer. + lstm_layer_norm (bool, optional): if ``True``, enables layer normalization + for LSTM layers. (Default: ``False``) + lstm_layer_norm_epsilon (float, optional): value of epsilon to use in + LSTM layer normalization layers. (Default: 1e-5) + lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0) + + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_layer_norm: bool = False, + lstm_layer_norm_epsilon: float = 1e-5, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + _CustomLSTM( + symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + lstm_hidden_dim, + layer_norm=lstm_layer_norm, + layer_norm_epsilon=lstm_layer_norm_epsilon, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. 
(Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class _Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class RNNT(torch.nn.Module): + r"""torchaudio.models.RNNT() + + Recurrent neural network transducer (RNN-T) model. 
+ + Note: + To build the model, please use one of the factory functions. + + Args: + transcriber (torch.nn.Module): transcription network. + predictor (torch.nn.Module): prediction network. + joiner (torch.nn.Module): joint network. + """ + + def __init__(self, transcriber: _Transcriber, predictor: _Predictor, joiner: _Joiner) -> None: + super().__init__() + self.transcriber = transcriber + self.predictor = predictor + self.joiner = joiner + + def forward( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + targets: torch.Tensor, + target_lengths: torch.Tensor, + predictor_state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: feature dimension of each source sequence element. + + Args: + sources (torch.Tensor): source frame sequences right-padded with right context, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``sources``. + targets (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``targets``. + predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing prediction network internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + joint network output, with shape + `(B, max output source length, max output target length, output_dim (number of target symbols))`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing prediction network internal state generated in current invocation + of ``forward``. + """ + source_encodings, source_lengths = self.transcriber( + input=sources, + lengths=source_lengths, + ) + target_encodings, target_lengths, predictor_state = self.predictor( + input=targets, + lengths=target_lengths, + state=predictor_state, + ) + output, source_lengths, target_lengths = self.joiner( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) + + return ( + output, + source_lengths, + target_lengths, + predictor_state, + ) + + @torch.jit.export + def transcribe_streaming( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Applies transcription network to sources in streaming mode. + + B: batch size; + T: maximum source sequence segment length in batch; + D: feature dimension of each source sequence frame. + + Args: + sources (torch.Tensor): source frame sequence segments right-padded with right context, with + shape `(B, T + right context length, D)`. 
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``sources``. + state (List[List[torch.Tensor]] or None): list of lists of tensors + representing transcription network internal state generated in preceding invocation + of ``transcribe_streaming``. + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing transcription network internal state generated in current invocation + of ``transcribe_streaming``. + """ + return self.transcriber.infer(sources, source_lengths, state) + + @torch.jit.export + def transcribe( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Applies transcription network to sources in non-streaming mode. + + B: batch size; + T: maximum source sequence length in batch; + D: feature dimension of each source sequence frame. + + Args: + sources (torch.Tensor): source frame sequences right-padded with right context, with + shape `(B, T + right context length, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``sources``. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output frame sequences. + """ + return self.transcriber(sources, source_lengths) + + @torch.jit.export + def predict( + self, + targets: torch.Tensor, + target_lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Applies prediction network to targets. + + B: batch size; + U: maximum target sequence length in batch; + D: feature dimension of each target sequence frame. + + Args: + targets (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``targets``. + state (List[List[torch.Tensor]] or None): list of lists of tensors + representing internal state generated in preceding invocation + of ``predict``. + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output frame sequences, with shape `(B, U, output_dim)`. + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``predict``. + """ + return self.predictor(input=targets, lengths=target_lengths, state=state) + + @torch.jit.export + def join( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Applies joint network to source and target encodings. 
+ + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + output, source_lengths, target_lengths = self.joiner( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) + return output, source_lengths, target_lengths + + +def emformer_rnnt_model( + *, + input_dim: int, + encoding_dim: int, + num_symbols: int, + segment_length: int, + right_context_length: int, + time_reduction_input_dim: int, + time_reduction_stride: int, + transformer_num_heads: int, + transformer_ffn_dim: int, + transformer_num_layers: int, + transformer_dropout: float, + transformer_activation: str, + transformer_left_context_length: int, + transformer_max_memory_size: int, + transformer_weight_init_scale_strategy: str, + transformer_tanh_on_mem: bool, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_layer_norm: bool, + lstm_layer_norm_epsilon: float, + lstm_dropout: float, +) -> RNNT: + r"""Builds Emformer-based recurrent neural network transducer (RNN-T) model. + + Note: + For non-streaming inference, the expectation is for `transcribe` to be called on input + sequences right-concatenated with `right_context_length` frames. + + For streaming inference, the expectation is for `transcribe_streaming` to be called + on input chunks comprising `segment_length` frames right-concatenated with `right_context_length` + frames. + + Args: + input_dim (int): dimension of input sequence frames passed to transcription network. + encoding_dim (int): dimension of transcription- and prediction-network-generated encodings + passed to joint network. + num_symbols (int): cardinality of set of target tokens. + segment_length (int): length of input segment expressed as number of frames. + right_context_length (int): length of right context expressed as number of frames. + time_reduction_input_dim (int): dimension to scale each element in input sequences to + prior to applying time reduction block. + time_reduction_stride (int): factor by which to reduce length of input sequence. + transformer_num_heads (int): number of attention heads in each Emformer layer. + transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network. + transformer_num_layers (int): number of Emformer layers to instantiate. + transformer_left_context_length (int): length of left context considered by Emformer. 
+ transformer_dropout (float): Emformer dropout probability. + transformer_activation (str): activation function to use in each Emformer layer's + feedforward network. Must be one of ("relu", "gelu", "silu"). + transformer_max_memory_size (int): maximum number of memory elements to use. + transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling + strategy. Must be one of ("depthwise", "constant", ``None``). + transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers. + lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers. + lstm_dropout (float): LSTM dropout probability. + + Returns: + RNNT: + Emformer RNN-T model. + """ + encoder = _EmformerEncoder( + input_dim=input_dim, + output_dim=encoding_dim, + segment_length=segment_length, + right_context_length=right_context_length, + time_reduction_input_dim=time_reduction_input_dim, + time_reduction_stride=time_reduction_stride, + transformer_num_heads=transformer_num_heads, + transformer_ffn_dim=transformer_ffn_dim, + transformer_num_layers=transformer_num_layers, + transformer_dropout=transformer_dropout, + transformer_activation=transformer_activation, + transformer_left_context_length=transformer_left_context_length, + transformer_max_memory_size=transformer_max_memory_size, + transformer_weight_init_scale_strategy=transformer_weight_init_scale_strategy, + transformer_tanh_on_mem=transformer_tanh_on_mem, + ) + predictor = _Predictor( + num_symbols, + encoding_dim, + symbol_embedding_dim=symbol_embedding_dim, + num_lstm_layers=num_lstm_layers, + lstm_hidden_dim=symbol_embedding_dim, + lstm_layer_norm=lstm_layer_norm, + lstm_layer_norm_epsilon=lstm_layer_norm_epsilon, + lstm_dropout=lstm_dropout, + ) + joiner = _Joiner(encoding_dim, num_symbols) + return RNNT(encoder, predictor, joiner) + + +def emformer_rnnt_base(num_symbols: int) -> RNNT: + r"""Builds basic version of Emformer RNN-T model. + + Args: + num_symbols (int): The size of target token lexicon. + + Returns: + RNNT: + Emformer RNN-T model. 
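+
+    Example
+        An illustrative sketch only; the vocabulary size, frame counts, and random
+        feature values below are arbitrary placeholders rather than values required
+        by this factory function.
+
+        >>> rnnt = emformer_rnnt_base(num_symbols=500)
+        >>> # one utterance: 128 valid frames plus 4 right-context frames of
+        >>> # 80-dimensional features (matching input_dim=80 used by this factory).
+        >>> sources = torch.rand(1, 132, 80)
+        >>> source_lengths = torch.tensor([132])
+        >>> enc_out, enc_lengths = rnnt.transcribe(sources, source_lengths)
+        >>> # enc_out: (1, reduced frame count, 1024); enc_lengths: (1,)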
+ """ + return emformer_rnnt_model( + input_dim=80, + encoding_dim=1024, + num_symbols=num_symbols, + segment_length=16, + right_context_length=4, + time_reduction_input_dim=128, + time_reduction_stride=4, + transformer_num_heads=8, + transformer_ffn_dim=2048, + transformer_num_layers=20, + transformer_dropout=0.1, + transformer_activation="gelu", + transformer_left_context_length=30, + transformer_max_memory_size=0, + transformer_weight_init_scale_strategy="depthwise", + transformer_tanh_on_mem=True, + symbol_embedding_dim=512, + num_lstm_layers=3, + lstm_layer_norm=True, + lstm_layer_norm_epsilon=1e-3, + lstm_dropout=0.3, + ) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt_decoder.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f44afc8c190dc2ad432ee2cef299a8d7364efc65 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt_decoder.py @@ -0,0 +1,340 @@ +from typing import Callable, Dict, List, Optional, Tuple + +import torch +from torchaudio.models import RNNT + + +__all__ = ["Hypothesis", "RNNTBeamSearch"] + + +Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float] +Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder, + represented as tuple of (tokens, prediction network output, prediction network state, score). + """ + + +def _get_hypo_tokens(hypo: Hypothesis) -> List[int]: + return hypo[0] + + +def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor: + return hypo[1] + + +def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]: + return hypo[2] + + +def _get_hypo_score(hypo: Hypothesis) -> float: + return hypo[3] + + +def _get_hypo_key(hypo: Hypothesis) -> str: + return str(hypo[0]) + + +def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: + states: List[List[torch.Tensor]] = [] + for i in range(len(_get_hypo_state(hypos[0]))): + batched_state_components: List[torch.Tensor] = [] + for j in range(len(_get_hypo_state(hypos[0])[i])): + batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) + states.append(batched_state_components) + return states + + +def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: + idx_tensor = torch.tensor([idx], device=device) + return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] + + +def _default_hypo_sort_key(hypo: Hypothesis) -> float: + return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) + + +def _compute_updated_scores( + hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + beam_width: int, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) + nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1] + nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width) + nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc") + nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1] + return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token + + +def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None: + for i, elem in enumerate(hypo_list): + if _get_hypo_key(hypo) == 
_get_hypo_key(elem): + del hypo_list[i] + break + + +class RNNTBeamSearch(torch.nn.Module): + r"""Beam search decoder for RNN-T model. + + Args: + model (RNNT): RNN-T model to use. + blank (int): index of blank token in vocabulary. + temperature (float, optional): temperature to apply to joint network output. + Larger values yield more uniform samples. (Default: 1.0) + hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score + for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns + hypothesis score normalized by token sequence length. (Default: None) + step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100) + """ + + def __init__( + self, + model: RNNT, + blank: int, + temperature: float = 1.0, + hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None, + step_max_tokens: int = 100, + ) -> None: + super().__init__() + self.model = model + self.blank = blank + self.temperature = temperature + + if hypo_sort_key is None: + self.hypo_sort_key = _default_hypo_sort_key + else: + self.hypo_sort_key = hypo_sort_key + + self.step_max_tokens = step_max_tokens + + def _init_b_hypos(self, hypo: Optional[Hypothesis], device: torch.device) -> List[Hypothesis]: + if hypo is not None: + token = _get_hypo_tokens(hypo)[-1] + state = _get_hypo_state(hypo) + else: + token = self.blank + state = None + + one_tensor = torch.tensor([1], device=device) + pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state) + init_hypo = ( + [token], + pred_out[0].detach(), + pred_state, + 0.0, + ) + return [init_hypo] + + def _gen_next_token_probs( + self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + ) -> torch.Tensor: + one_tensor = torch.tensor([1], device=device) + predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) + joined_out, _, _ = self.model.join( + enc_out, + one_tensor, + predictor_out, + torch.tensor([1] * len(hypos), device=device), + ) # [beam_width, 1, 1, num_tokens] + joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3) + return joined_out[:, 0, 0] + + def _gen_b_hypos( + self, + b_hypos: List[Hypothesis], + a_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + key_to_b_hypo: Dict[str, Hypothesis], + ) -> List[Hypothesis]: + for i in range(len(a_hypos)): + h_a = a_hypos[i] + append_blank_score = _get_hypo_score(h_a) + next_token_probs[i, -1] + if _get_hypo_key(h_a) in key_to_b_hypo: + h_b = key_to_b_hypo[_get_hypo_key(h_a)] + _remove_hypo(h_b, b_hypos) + score = float(torch.tensor(_get_hypo_score(h_b)).logaddexp(append_blank_score)) + else: + score = float(append_blank_score) + h_b = ( + _get_hypo_tokens(h_a), + _get_hypo_predictor_out(h_a), + _get_hypo_state(h_a), + score, + ) + b_hypos.append(h_b) + key_to_b_hypo[_get_hypo_key(h_b)] = h_b + _, sorted_idx = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]).sort() + return [b_hypos[idx] for idx in sorted_idx] + + def _gen_a_hypos( + self, + a_hypos: List[Hypothesis], + b_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + t: int, + beam_width: int, + device: torch.device, + ) -> List[Hypothesis]: + ( + nonblank_nbest_scores, + nonblank_nbest_hypo_idx, + nonblank_nbest_token, + ) = _compute_updated_scores(a_hypos, next_token_probs, beam_width) + + if len(b_hypos) < beam_width: + b_nbest_score = -float("inf") + else: + b_nbest_score = _get_hypo_score(b_hypos[-beam_width]) + + 
base_hypos: List[Hypothesis] = [] + new_tokens: List[int] = [] + new_scores: List[float] = [] + for i in range(beam_width): + score = float(nonblank_nbest_scores[i]) + if score > b_nbest_score: + a_hypo_idx = int(nonblank_nbest_hypo_idx[i]) + base_hypos.append(a_hypos[a_hypo_idx]) + new_tokens.append(int(nonblank_nbest_token[i])) + new_scores.append(score) + + if base_hypos: + new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device) + else: + new_hypos: List[Hypothesis] = [] + + return new_hypos + + def _gen_new_hypos( + self, + base_hypos: List[Hypothesis], + tokens: List[int], + scores: List[float], + t: int, + device: torch.device, + ) -> List[Hypothesis]: + tgt_tokens = torch.tensor([[token] for token in tokens], device=device) + states = _batch_state(base_hypos) + pred_out, _, pred_states = self.model.predict( + tgt_tokens, + torch.tensor([1] * len(base_hypos), device=device), + states, + ) + new_hypos: List[Hypothesis] = [] + for i, h_a in enumerate(base_hypos): + new_tokens = _get_hypo_tokens(h_a) + [tokens[i]] + new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i])) + return new_hypos + + def _search( + self, + enc_out: torch.Tensor, + hypo: Optional[Hypothesis], + beam_width: int, + ) -> List[Hypothesis]: + n_time_steps = enc_out.shape[1] + device = enc_out.device + + a_hypos: List[Hypothesis] = [] + b_hypos = self._init_b_hypos(hypo, device) + for t in range(n_time_steps): + a_hypos = b_hypos + b_hypos = torch.jit.annotate(List[Hypothesis], []) + key_to_b_hypo: Dict[str, Hypothesis] = {} + symbols_current_t = 0 + + while a_hypos: + next_token_probs = self._gen_next_token_probs(enc_out[:, t : t + 1], a_hypos, device) + next_token_probs = next_token_probs.cpu() + b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) + + if symbols_current_t == self.step_max_tokens: + break + + a_hypos = self._gen_a_hypos( + a_hypos, + b_hypos, + next_token_probs, + t, + beam_width, + device, + ) + if a_hypos: + symbols_current_t += 1 + + _, sorted_idx = torch.tensor([self.hypo_sort_key(hypo) for hypo in b_hypos]).topk(beam_width) + b_hypos = [b_hypos[idx] for idx in sorted_idx] + + return b_hypos + + def forward(self, input: torch.Tensor, length: torch.Tensor, beam_width: int) -> List[Hypothesis]: + r"""Performs beam search for the given input sequence. + + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D). + length (torch.Tensor): number of valid frames in input + sequence, with shape () or (1,). + beam_width (int): beam size to use during search. + + Returns: + List[Hypothesis]: top-``beam_width`` hypotheses found by beam search. + """ + assert input.dim() == 2 or ( + input.dim() == 3 and input.shape[0] == 1 + ), "input must be of shape (T, D) or (1, T, D)" + if input.dim() == 2: + input = input.unsqueeze(0) + + assert length.shape == () or length.shape == (1,), "length must be of shape () or (1,)" + if length.dim() == 0: + length = length.unsqueeze(0) + + enc_out, _ = self.model.transcribe(input, length) + return self._search(enc_out, None, beam_width) + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + length: torch.Tensor, + beam_width: int, + state: Optional[List[List[torch.Tensor]]] = None, + hypothesis: Optional[Hypothesis] = None, + ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]: + r"""Performs beam search for the given input sequence in streaming mode.
+ + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D). + length (torch.Tensor): number of valid frames in input + sequence, with shape () or (1,). + beam_width (int): beam size to use during search. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing transcription network internal state generated in preceding + invocation. (Default: ``None``) + hypothesis (Hypothesis or None): hypothesis from preceding invocation to seed + search with. (Default: ``None``) + + Returns: + (List[Hypothesis], List[List[torch.Tensor]]): + List[Hypothesis] + top-``beam_width`` hypotheses found by beam search. + List[List[torch.Tensor]] + list of lists of tensors representing transcription network + internal state generated in current invocation. + """ + assert input.dim() == 2 or ( + input.dim() == 3 and input.shape[0] == 1 + ), "input must be of shape (T, D) or (1, T, D)" + if input.dim() == 2: + input = input.unsqueeze(0) + + assert length.shape == () or length.shape == (1,), "length must be of shape () or (1,)" + if length.dim() == 0: + length = length.unsqueeze(0) + + enc_out, _, state = self.model.transcribe_streaming(input, length, state) + return self._search(enc_out, hypothesis, beam_width), state diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/tacotron2.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/tacotron2.py new file mode 100644 index 0000000000000000000000000000000000000000..e2bcc01a1471c3da70ad4ae1d566d47bc0d6bf5a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/tacotron2.py @@ -0,0 +1,1046 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** + +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn, Tensor +from torch.nn import functional as F + + +__all__ = [ + "Tacotron2", +] + + +def _get_linear_layer(in_dim: int, out_dim: int, bias: bool = True, w_init_gain: str = "linear") -> torch.nn.Linear: + r"""Linear layer with xavier uniform initialization. + + Args: + in_dim (int): Size of each input sample. + out_dim (int): Size of each output sample. + bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``) + w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain`` + for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``) + + Returns: + (torch.nn.Linear): The corresponding linear layer. + """ + linear = torch.nn.Linear(in_dim, out_dim, bias=bias) + torch.nn.init.xavier_uniform_(linear.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + return linear + + +def _get_conv1d_layer( + in_channels: int, + out_channels: int, + kernel_size: int = 1, + stride: int = 1, + padding: Optional[Union[str, int, Tuple[int]]] = None, + dilation: int = 1, + bias: bool = True, + w_init_gain: str = "linear", +) -> torch.nn.Conv1d: + r"""1D convolution with xavier uniform initialization. + + Args: + in_channels (int): Number of channels in the input image. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int, optional): Number of channels in the input image. (Default: ``1``) + stride (int, optional): Number of channels in the input image. (Default: ``1``) + padding (str, int or tuple, optional): Padding added to both sides of the input. + (Default: dilation * (kernel_size - 1) / 2) + dilation (int, optional): Number of channels in the input image. (Default: ``1``) + w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain`` + for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``) + + Returns: + (torch.nn.Conv1d): The corresponding Conv1D layer. + """ + if padding is None: + assert kernel_size % 2 == 1 + padding = int(dilation * (kernel_size - 1) / 2) + + conv1d = torch.nn.Conv1d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + + torch.nn.init.xavier_uniform_(conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + return conv1d + + +def _get_mask_from_lengths(lengths: Tensor) -> Tensor: + r"""Returns a binary mask based on ``lengths``. The ``i``-th row and ``j``-th column of the mask + is ``1`` if ``j`` is smaller than ``i``-th element of ``lengths. + + Args: + lengths (Tensor): The length of each element in the batch, with shape (n_batch, ). + + Returns: + mask (Tensor): The binary mask, with shape (n_batch, max of ``lengths``). + """ + max_len = torch.max(lengths).item() + ids = torch.arange(0, max_len, device=lengths.device, dtype=lengths.dtype) + mask = (ids < lengths.unsqueeze(1)).byte() + mask = torch.le(mask, 0) + return mask + + +class _LocationLayer(nn.Module): + r"""Location layer used in the Attention model. + + Args: + attention_n_filter (int): Number of filters for attention model. + attention_kernel_size (int): Kernel size for attention model. + attention_hidden_dim (int): Dimension of attention hidden representation. 
+ """ + + def __init__( + self, + attention_n_filter: int, + attention_kernel_size: int, + attention_hidden_dim: int, + ): + super().__init__() + padding = int((attention_kernel_size - 1) / 2) + self.location_conv = _get_conv1d_layer( + 2, + attention_n_filter, + kernel_size=attention_kernel_size, + padding=padding, + bias=False, + stride=1, + dilation=1, + ) + self.location_dense = _get_linear_layer( + attention_n_filter, attention_hidden_dim, bias=False, w_init_gain="tanh" + ) + + def forward(self, attention_weights_cat: Tensor) -> Tensor: + r"""Location layer used in the Attention model. + + Args: + attention_weights_cat (Tensor): Cumulative and previous attention weights + with shape (n_batch, 2, max of ``text_lengths``). + + Returns: + processed_attention (Tensor): Cumulative and previous attention weights + with shape (n_batch, ``attention_hidden_dim``). + """ + # (n_batch, attention_n_filter, text_lengths.max()) + processed_attention = self.location_conv(attention_weights_cat) + processed_attention = processed_attention.transpose(1, 2) + # (n_batch, text_lengths.max(), attention_hidden_dim) + processed_attention = self.location_dense(processed_attention) + return processed_attention + + +class _Attention(nn.Module): + r"""Locally sensitive attention model. + + Args: + attention_rnn_dim (int): Number of hidden units for RNN. + encoder_embedding_dim (int): Number of embedding dimensions in the Encoder. + attention_hidden_dim (int): Dimension of attention hidden representation. + attention_location_n_filter (int): Number of filters for Attention model. + attention_location_kernel_size (int): Kernel size for Attention model. + """ + + def __init__( + self, + attention_rnn_dim: int, + encoder_embedding_dim: int, + attention_hidden_dim: int, + attention_location_n_filter: int, + attention_location_kernel_size: int, + ) -> None: + super().__init__() + self.query_layer = _get_linear_layer(attention_rnn_dim, attention_hidden_dim, bias=False, w_init_gain="tanh") + self.memory_layer = _get_linear_layer( + encoder_embedding_dim, attention_hidden_dim, bias=False, w_init_gain="tanh" + ) + self.v = _get_linear_layer(attention_hidden_dim, 1, bias=False) + self.location_layer = _LocationLayer( + attention_location_n_filter, + attention_location_kernel_size, + attention_hidden_dim, + ) + self.score_mask_value = -float("inf") + + def _get_alignment_energies(self, query: Tensor, processed_memory: Tensor, attention_weights_cat: Tensor) -> Tensor: + r"""Get the alignment vector. + + Args: + query (Tensor): Decoder output with shape (n_batch, n_mels * n_frames_per_step). + processed_memory (Tensor): Processed Encoder outputs + with shape (n_batch, max of ``text_lengths``, attention_hidden_dim). + attention_weights_cat (Tensor): Cumulative and previous attention weights + with shape (n_batch, 2, max of ``text_lengths``). + + Returns: + alignment (Tensor): attention weights, it is a tensor with shape (batch, max of ``text_lengths``). + """ + + processed_query = self.query_layer(query.unsqueeze(1)) + processed_attention_weights = self.location_layer(attention_weights_cat) + energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory)) + + alignment = energies.squeeze(2) + return alignment + + def forward( + self, + attention_hidden_state: Tensor, + memory: Tensor, + processed_memory: Tensor, + attention_weights_cat: Tensor, + mask: Tensor, + ) -> Tuple[Tensor, Tensor]: + r"""Pass the input through the Attention model. 
+ + Args: + attention_hidden_state (Tensor): Attention rnn last output with shape (n_batch, ``attention_rnn_dim``). + memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + processed_memory (Tensor): Processed Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``). + attention_weights_cat (Tensor): Previous and cumulative attention weights + with shape (n_batch, current_num_frames * 2, max of ``text_lengths``). + mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames). + + Returns: + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + """ + alignment = self._get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat) + + alignment = alignment.masked_fill(mask, self.score_mask_value) + + attention_weights = F.softmax(alignment, dim=1) + attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) + attention_context = attention_context.squeeze(1) + + return attention_context, attention_weights + + +class _Prenet(nn.Module): + r"""Prenet Module. It is consists of ``len(output_size)`` linear layers. + + Args: + in_dim (int): The size of each input sample. + output_sizes (list): The output dimension of each linear layers. + """ + + def __init__(self, in_dim: int, out_sizes: List[int]) -> None: + super().__init__() + in_sizes = [in_dim] + out_sizes[:-1] + self.layers = nn.ModuleList( + [_get_linear_layer(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, out_sizes)] + ) + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through Prenet. + + Args: + x (Tensor): The input sequence to Prenet with shape (n_batch, in_dim). + + Return: + x (Tensor): Tensor with shape (n_batch, sizes[-1]) + """ + + for linear in self.layers: + x = F.dropout(F.relu(linear(x)), p=0.5, training=True) + return x + + +class _Postnet(nn.Module): + r"""Postnet Module. + + Args: + n_mels (int): Number of mel bins. + postnet_embedding_dim (int): Postnet embedding dimension. + postnet_kernel_size (int): Postnet kernel size. + postnet_n_convolution (int): Number of postnet convolutions. + """ + + def __init__( + self, + n_mels: int, + postnet_embedding_dim: int, + postnet_kernel_size: int, + postnet_n_convolution: int, + ): + super().__init__() + self.convolutions = nn.ModuleList() + + for i in range(postnet_n_convolution): + in_channels = n_mels if i == 0 else postnet_embedding_dim + out_channels = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim + init_gain = "linear" if i == (postnet_n_convolution - 1) else "tanh" + num_features = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim + self.convolutions.append( + nn.Sequential( + _get_conv1d_layer( + in_channels, + out_channels, + kernel_size=postnet_kernel_size, + stride=1, + padding=int((postnet_kernel_size - 1) / 2), + dilation=1, + w_init_gain=init_gain, + ), + nn.BatchNorm1d(num_features), + ) + ) + + self.n_convs = len(self.convolutions) + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through Postnet. + + Args: + x (Tensor): The input sequence with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + + Return: + x (Tensor): Tensor with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). 
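+
+        Example
+            An illustrative sketch; the constructor values and tensor shape below are
+            arbitrary example numbers, not defaults defined by this class.
+
+            >>> postnet = _Postnet(n_mels=80, postnet_embedding_dim=512,
+            ...                    postnet_kernel_size=5, postnet_n_convolution=5)
+            >>> specgram = torch.rand(10, 80, 20)
+            >>> out = postnet(specgram)  # shape: (10, 80, 20); frame count is preserved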
+ """ + + for i, conv in enumerate(self.convolutions): + if i < self.n_convs - 1: + x = F.dropout(torch.tanh(conv(x)), 0.5, training=self.training) + else: + x = F.dropout(conv(x), 0.5, training=self.training) + + return x + + +class _Encoder(nn.Module): + r"""Encoder Module. + + Args: + encoder_embedding_dim (int): Number of embedding dimensions in the encoder. + encoder_n_convolution (int): Number of convolution layers in the encoder. + encoder_kernel_size (int): The kernel size in the encoder. + + Examples + >>> encoder = _Encoder(3, 512, 5) + >>> input = torch.rand(10, 20, 30) + >>> output = encoder(input) # shape: (10, 30, 512) + """ + + def __init__( + self, + encoder_embedding_dim: int, + encoder_n_convolution: int, + encoder_kernel_size: int, + ) -> None: + super().__init__() + + self.convolutions = nn.ModuleList() + for _ in range(encoder_n_convolution): + conv_layer = nn.Sequential( + _get_conv1d_layer( + encoder_embedding_dim, + encoder_embedding_dim, + kernel_size=encoder_kernel_size, + stride=1, + padding=int((encoder_kernel_size - 1) / 2), + dilation=1, + w_init_gain="relu", + ), + nn.BatchNorm1d(encoder_embedding_dim), + ) + self.convolutions.append(conv_layer) + + self.lstm = nn.LSTM( + encoder_embedding_dim, + int(encoder_embedding_dim / 2), + 1, + batch_first=True, + bidirectional=True, + ) + self.lstm.flatten_parameters() + + def forward(self, x: Tensor, input_lengths: Tensor) -> Tensor: + r"""Pass the input through the Encoder. + + Args: + x (Tensor): The input sequences with shape (n_batch, encoder_embedding_dim, n_seq). + input_lengths (Tensor): The length of each input sequence with shape (n_batch, ). + + Return: + x (Tensor): A tensor with shape (n_batch, n_seq, encoder_embedding_dim). + """ + + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) + + input_lengths = input_lengths.cpu() + x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True) + + outputs, _ = self.lstm(x) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) + + return outputs + + +class _Decoder(nn.Module): + r"""Decoder with Attention model. + + Args: + n_mels (int): number of mel bins + n_frames_per_step (int): number of frames processed per step, only 1 is supported + encoder_embedding_dim (int): the number of embedding dimensions in the encoder. 
+ decoder_rnn_dim (int): number of units in decoder LSTM + decoder_max_step (int): maximum number of output mel spectrograms + decoder_dropout (float): dropout probability for decoder LSTM + decoder_early_stopping (bool): stop decoding when all samples are finished + attention_rnn_dim (int): number of units in attention LSTM + attention_hidden_dim (int): dimension of attention hidden representation + attention_location_n_filter (int): number of filters for attention model + attention_location_kernel_size (int): kernel size for attention model + attention_dropout (float): dropout probability for attention LSTM + prenet_dim (int): number of ReLU units in prenet layers + gate_threshold (float): probability threshold for stop token + """ + + def __init__( + self, + n_mels: int, + n_frames_per_step: int, + encoder_embedding_dim: int, + decoder_rnn_dim: int, + decoder_max_step: int, + decoder_dropout: float, + decoder_early_stopping: bool, + attention_rnn_dim: int, + attention_hidden_dim: int, + attention_location_n_filter: int, + attention_location_kernel_size: int, + attention_dropout: float, + prenet_dim: int, + gate_threshold: float, + ) -> None: + + super().__init__() + self.n_mels = n_mels + self.n_frames_per_step = n_frames_per_step + self.encoder_embedding_dim = encoder_embedding_dim + self.attention_rnn_dim = attention_rnn_dim + self.decoder_rnn_dim = decoder_rnn_dim + self.prenet_dim = prenet_dim + self.decoder_max_step = decoder_max_step + self.gate_threshold = gate_threshold + self.attention_dropout = attention_dropout + self.decoder_dropout = decoder_dropout + self.decoder_early_stopping = decoder_early_stopping + + self.prenet = _Prenet(n_mels * n_frames_per_step, [prenet_dim, prenet_dim]) + + self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim) + + self.attention_layer = _Attention( + attention_rnn_dim, + encoder_embedding_dim, + attention_hidden_dim, + attention_location_n_filter, + attention_location_kernel_size, + ) + + self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, True) + + self.linear_projection = _get_linear_layer(decoder_rnn_dim + encoder_embedding_dim, n_mels * n_frames_per_step) + + self.gate_layer = _get_linear_layer( + decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain="sigmoid" + ) + + def _get_initial_frame(self, memory: Tensor) -> Tensor: + r"""Gets all zeros frames to use as the first decoder input. + + Args: + memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + + Returns: + decoder_input (Tensor): all zeros frames with shape + (n_batch, max of ``text_lengths``, ``n_mels * n_frames_per_step``). + """ + + n_batch = memory.size(0) + dtype = memory.dtype + device = memory.device + decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device) + return decoder_input + + def _initialize_decoder_states( + self, memory: Tensor + ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + r"""Initializes attention rnn states, decoder rnn states, attention + weights, attention cumulative weights, attention context, stores memory + and stores processed memory. + + Args: + memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + + Returns: + attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). 
+ attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``). + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + processed_memory (Tensor): Processed encoder outputs + with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``). + """ + n_batch = memory.size(0) + max_time = memory.size(1) + dtype = memory.dtype + device = memory.device + + attention_hidden = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device) + attention_cell = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device) + + decoder_hidden = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device) + decoder_cell = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device) + + attention_weights = torch.zeros(n_batch, max_time, dtype=dtype, device=device) + attention_weights_cum = torch.zeros(n_batch, max_time, dtype=dtype, device=device) + attention_context = torch.zeros(n_batch, self.encoder_embedding_dim, dtype=dtype, device=device) + + processed_memory = self.attention_layer.memory_layer(memory) + + return ( + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + processed_memory, + ) + + def _parse_decoder_inputs(self, decoder_inputs: Tensor) -> Tensor: + r"""Prepares decoder inputs. + + Args: + decoder_inputs (Tensor): Inputs used for teacher-forced training, i.e. mel-specs, + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``) + + Returns: + inputs (Tensor): Processed decoder inputs with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``). 
+ """ + # (n_batch, n_mels, mel_specgram_lengths.max()) -> (n_batch, mel_specgram_lengths.max(), n_mels) + decoder_inputs = decoder_inputs.transpose(1, 2) + decoder_inputs = decoder_inputs.view( + decoder_inputs.size(0), + int(decoder_inputs.size(1) / self.n_frames_per_step), + -1, + ) + # (n_batch, mel_specgram_lengths.max(), n_mels) -> (mel_specgram_lengths.max(), n_batch, n_mels) + decoder_inputs = decoder_inputs.transpose(0, 1) + return decoder_inputs + + def _parse_decoder_outputs( + self, mel_specgram: Tensor, gate_outputs: Tensor, alignments: Tensor + ) -> Tuple[Tensor, Tensor, Tensor]: + r"""Prepares decoder outputs for output + + Args: + mel_specgram (Tensor): mel spectrogram with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``) + gate_outputs (Tensor): predicted stop token with shape (max of ``mel_specgram_lengths``, n_batch) + alignments (Tensor): sequence of attention weights from the decoder + with shape (max of ``mel_specgram_lengths``, n_batch, max of ``text_lengths``) + + Returns: + mel_specgram (Tensor): mel spectrogram with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``) + gate_outputs (Tensor): predicted stop token with shape (n_batch, max of ``mel_specgram_lengths``) + alignments (Tensor): sequence of attention weights from the decoder + with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``) + """ + # (mel_specgram_lengths.max(), n_batch, text_lengths.max()) + # -> (n_batch, mel_specgram_lengths.max(), text_lengths.max()) + alignments = alignments.transpose(0, 1).contiguous() + # (mel_specgram_lengths.max(), n_batch) -> (n_batch, mel_specgram_lengths.max()) + gate_outputs = gate_outputs.transpose(0, 1).contiguous() + # (mel_specgram_lengths.max(), n_batch, n_mels) -> (n_batch, mel_specgram_lengths.max(), n_mels) + mel_specgram = mel_specgram.transpose(0, 1).contiguous() + # decouple frames per step + shape = (mel_specgram.shape[0], -1, self.n_mels) + mel_specgram = mel_specgram.view(*shape) + # (n_batch, mel_specgram_lengths.max(), n_mels) -> (n_batch, n_mels, T_out) + mel_specgram = mel_specgram.transpose(1, 2) + + return mel_specgram, gate_outputs, alignments + + def decode( + self, + decoder_input: Tensor, + attention_hidden: Tensor, + attention_cell: Tensor, + decoder_hidden: Tensor, + decoder_cell: Tensor, + attention_weights: Tensor, + attention_weights_cum: Tensor, + attention_context: Tensor, + memory: Tensor, + processed_memory: Tensor, + mask: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + r"""Decoder step using stored states, attention and memory + + Args: + decoder_input (Tensor): Output of the Prenet with shape (n_batch, ``prenet_dim``). + attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``). + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + memory (Tensor): Encoder output with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). 
+ processed_memory (Tensor): Processed Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``). + mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames). + + Returns: + decoder_output: Predicted mel spectrogram for the current frame with shape (n_batch, ``n_mels``). + gate_prediction (Tensor): Prediction of the stop token with shape (n_batch, ``1``). + attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``). + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + """ + cell_input = torch.cat((decoder_input, attention_context), -1) + + attention_hidden, attention_cell = self.attention_rnn(cell_input, (attention_hidden, attention_cell)) + attention_hidden = F.dropout(attention_hidden, self.attention_dropout, self.training) + + attention_weights_cat = torch.cat((attention_weights.unsqueeze(1), attention_weights_cum.unsqueeze(1)), dim=1) + attention_context, attention_weights = self.attention_layer( + attention_hidden, memory, processed_memory, attention_weights_cat, mask + ) + + attention_weights_cum += attention_weights + decoder_input = torch.cat((attention_hidden, attention_context), -1) + + decoder_hidden, decoder_cell = self.decoder_rnn(decoder_input, (decoder_hidden, decoder_cell)) + decoder_hidden = F.dropout(decoder_hidden, self.decoder_dropout, self.training) + + decoder_hidden_attention_context = torch.cat((decoder_hidden, attention_context), dim=1) + decoder_output = self.linear_projection(decoder_hidden_attention_context) + + gate_prediction = self.gate_layer(decoder_hidden_attention_context) + + return ( + decoder_output, + gate_prediction, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + ) + + def forward( + self, memory: Tensor, mel_specgram_truth: Tensor, memory_lengths: Tensor + ) -> Tuple[Tensor, Tensor, Tensor]: + r"""Decoder forward pass for training. + + Args: + memory (Tensor): Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + mel_specgram_truth (Tensor): Decoder ground-truth mel-specs for teacher forcing + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + memory_lengths (Tensor): Encoder output lengths for attention masking + (the same as ``text_lengths``) with shape (n_batch, ). + + Returns: + mel_specgram (Tensor): Predicted mel spectrogram + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + gate_outputs (Tensor): Predicted stop token for each timestep + with shape (n_batch, max of ``mel_specgram_lengths``). + alignments (Tensor): Sequence of attention weights from the decoder + with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``). 
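+
+        Example
+            An illustrative sketch of the teacher-forced call; every constructor value
+            and tensor shape below is an arbitrary example, not a default of this class.
+
+            >>> decoder = _Decoder(n_mels=80, n_frames_per_step=1, encoder_embedding_dim=512,
+            ...                    decoder_rnn_dim=1024, decoder_max_step=2000, decoder_dropout=0.1,
+            ...                    decoder_early_stopping=True, attention_rnn_dim=1024,
+            ...                    attention_hidden_dim=128, attention_location_n_filter=32,
+            ...                    attention_location_kernel_size=31, attention_dropout=0.1,
+            ...                    prenet_dim=256, gate_threshold=0.5)
+            >>> memory = torch.rand(2, 50, 512)      # encoder outputs
+            >>> mel_truth = torch.rand(2, 80, 120)   # ground-truth mel spectrograms
+            >>> memory_lengths = torch.tensor([50, 50])
+            >>> mel, gate, align = decoder(memory, mel_truth, memory_lengths)
+            >>> # mel: (2, 80, 120); gate: (2, 120); align: (2, 120, 50)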
+ """ + + decoder_input = self._get_initial_frame(memory).unsqueeze(0) + decoder_inputs = self._parse_decoder_inputs(mel_specgram_truth) + decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0) + decoder_inputs = self.prenet(decoder_inputs) + + mask = _get_mask_from_lengths(memory_lengths) + ( + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + processed_memory, + ) = self._initialize_decoder_states(memory) + + mel_outputs, gate_outputs, alignments = [], [], [] + while len(mel_outputs) < decoder_inputs.size(0) - 1: + decoder_input = decoder_inputs[len(mel_outputs)] + ( + mel_output, + gate_output, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + ) = self.decode( + decoder_input, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + memory, + processed_memory, + mask, + ) + + mel_outputs += [mel_output.squeeze(1)] + gate_outputs += [gate_output.squeeze(1)] + alignments += [attention_weights] + + mel_specgram, gate_outputs, alignments = self._parse_decoder_outputs( + torch.stack(mel_outputs), torch.stack(gate_outputs), torch.stack(alignments) + ) + + return mel_specgram, gate_outputs, alignments + + def _get_go_frame(self, memory: Tensor) -> Tensor: + """Gets all zeros frames to use as the first decoder input + + args: + memory (Tensor): Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + + returns: + decoder_input (Tensor): All zeros frames with shape(n_batch, ``n_mels`` * ``n_frame_per_step``). + """ + + n_batch = memory.size(0) + dtype = memory.dtype + device = memory.device + decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device) + return decoder_input + + @torch.jit.export + def infer(self, memory: Tensor, memory_lengths: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Decoder inference + + Args: + memory (Tensor): Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + memory_lengths (Tensor): Encoder output lengths for attention masking + (the same as ``text_lengths``) with shape (n_batch, ). + + Returns: + mel_specgram (Tensor): Predicted mel spectrogram + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + mel_specgram_lengths (Tensor): the length of the predicted mel spectrogram (n_batch, )) + gate_outputs (Tensor): Predicted stop token for each timestep + with shape (n_batch, max of ``mel_specgram_lengths``). + alignments (Tensor): Sequence of attention weights from the decoder + with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``). 
+ """ + batch_size, device = memory.size(0), memory.device + + decoder_input = self._get_go_frame(memory) + + mask = _get_mask_from_lengths(memory_lengths) + ( + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + processed_memory, + ) = self._initialize_decoder_states(memory) + + mel_specgram_lengths = torch.zeros([batch_size], dtype=torch.int32, device=device) + finished = torch.zeros([batch_size], dtype=torch.bool, device=device) + mel_specgrams: List[Tensor] = [] + gate_outputs: List[Tensor] = [] + alignments: List[Tensor] = [] + for _ in range(self.decoder_max_step): + decoder_input = self.prenet(decoder_input) + ( + mel_specgram, + gate_output, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + ) = self.decode( + decoder_input, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + memory, + processed_memory, + mask, + ) + + mel_specgrams.append(mel_specgram.unsqueeze(0)) + gate_outputs.append(gate_output.transpose(0, 1)) + alignments.append(attention_weights) + mel_specgram_lengths[~finished] += 1 + + finished |= torch.sigmoid(gate_output.squeeze(1)) > self.gate_threshold + if self.decoder_early_stopping and torch.all(finished): + break + + decoder_input = mel_specgram + + if len(mel_specgrams) == self.decoder_max_step: + warnings.warn( + "Reached max decoder steps. The generated spectrogram might not cover " "the whole transcript." + ) + + mel_specgrams = torch.cat(mel_specgrams, dim=0) + gate_outputs = torch.cat(gate_outputs, dim=0) + alignments = torch.cat(alignments, dim=0) + + mel_specgrams, gate_outputs, alignments = self._parse_decoder_outputs(mel_specgrams, gate_outputs, alignments) + + return mel_specgrams, mel_specgram_lengths, gate_outputs, alignments + + +class Tacotron2(nn.Module): + r"""Tacotron2 model based on the implementation from + `Nvidia `_. + + The original implementation was introduced in + *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions* + [:footcite:`shen2018natural`]. + + Args: + mask_padding (bool, optional): Use mask padding (Default: ``False``). + n_mels (int, optional): Number of mel bins (Default: ``80``). + n_symbol (int, optional): Number of symbols for the input text (Default: ``148``). + n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``). + symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``). + encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``). + encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``). + encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``). + decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``). + decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``). + decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``). + decoder_early_stopping (bool, optional): Continue decoding after all samples are finished (Default: ``True``). + attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``). + attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``). 
+ attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``). + attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``). + attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``). + prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``). + postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``). + postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``). + postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``). + gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``). + """ + + def __init__( + self, + mask_padding: bool = False, + n_mels: int = 80, + n_symbol: int = 148, + n_frames_per_step: int = 1, + symbol_embedding_dim: int = 512, + encoder_embedding_dim: int = 512, + encoder_n_convolution: int = 3, + encoder_kernel_size: int = 5, + decoder_rnn_dim: int = 1024, + decoder_max_step: int = 2000, + decoder_dropout: float = 0.1, + decoder_early_stopping: bool = True, + attention_rnn_dim: int = 1024, + attention_hidden_dim: int = 128, + attention_location_n_filter: int = 32, + attention_location_kernel_size: int = 31, + attention_dropout: float = 0.1, + prenet_dim: int = 256, + postnet_n_convolution: int = 5, + postnet_kernel_size: int = 5, + postnet_embedding_dim: int = 512, + gate_threshold: float = 0.5, + ) -> None: + super().__init__() + + self.mask_padding = mask_padding + self.n_mels = n_mels + self.n_frames_per_step = n_frames_per_step + self.embedding = nn.Embedding(n_symbol, symbol_embedding_dim) + torch.nn.init.xavier_uniform_(self.embedding.weight) + self.encoder = _Encoder(encoder_embedding_dim, encoder_n_convolution, encoder_kernel_size) + self.decoder = _Decoder( + n_mels, + n_frames_per_step, + encoder_embedding_dim, + decoder_rnn_dim, + decoder_max_step, + decoder_dropout, + decoder_early_stopping, + attention_rnn_dim, + attention_hidden_dim, + attention_location_n_filter, + attention_location_kernel_size, + attention_dropout, + prenet_dim, + gate_threshold, + ) + self.postnet = _Postnet(n_mels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolution) + + def forward( + self, + tokens: Tensor, + token_lengths: Tensor, + mel_specgram: Tensor, + mel_specgram_lengths: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + r"""Pass the input through the Tacotron2 model. This is in teacher + forcing mode, which is generally used for training. + + The input ``tokens`` should be padded with zeros to length max of ``token_lengths``. + The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``. + + Args: + tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`. + token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`. + mel_specgram (Tensor): The target mel spectrogram + with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`. + + Returns: + [Tensor, Tensor, Tensor, Tensor]: + Tensor + Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + Tensor + Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + Tensor + The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`. 
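# A teacher-forcing sketch (not part of the patch) for the Tacotron2 module defined above,
# with random tensors in the shapes documented by forward(). It assumes the class is
# re-exported as torchaudio.models.Tacotron2, as in upstream torchaudio; the weights are
# untrained, so only the output shapes are meaningful.
import torch
from torchaudio.models import Tacotron2

model = Tacotron2()
tokens = torch.randint(0, 148, (2, 20))        # (n_batch, max of token_lengths)
token_lengths = torch.tensor([20, 20])
mel = torch.rand(2, 80, 120)                   # (n_batch, n_mels, max of mel_specgram_lengths)
mel_lengths = torch.tensor([120, 120])

mel_before, mel_after, gate, alignments = model(tokens, token_lengths, mel, mel_lengths)
print(mel_after.shape, gate.shape, alignments.shape)
# torch.Size([2, 80, 120]) torch.Size([2, 120]) torch.Size([2, 120, 20])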
+ Tensor + Sequence of attention weights from the decoder with + shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`. + """ + + embedded_inputs = self.embedding(tokens).transpose(1, 2) + + encoder_outputs = self.encoder(embedded_inputs, token_lengths) + mel_specgram, gate_outputs, alignments = self.decoder( + encoder_outputs, mel_specgram, memory_lengths=token_lengths + ) + + mel_specgram_postnet = self.postnet(mel_specgram) + mel_specgram_postnet = mel_specgram + mel_specgram_postnet + + if self.mask_padding: + mask = _get_mask_from_lengths(mel_specgram_lengths) + mask = mask.expand(self.n_mels, mask.size(0), mask.size(1)) + mask = mask.permute(1, 0, 2) + + mel_specgram.masked_fill_(mask, 0.0) + mel_specgram_postnet.masked_fill_(mask, 0.0) + gate_outputs.masked_fill_(mask[:, 0, :], 1e3) + + return mel_specgram, mel_specgram_postnet, gate_outputs, alignments + + @torch.jit.export + def infer(self, tokens: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Tensor, Tensor]: + r"""Using Tacotron2 for inference. The input is a batch of encoded + sentences (``tokens``) and its corresponding lengths (``lengths``). The + output is the generated mel spectrograms, its corresponding lengths, and + the attention weights from the decoder. + + The input `tokens` should be padded with zeros to length max of ``lengths``. + + Args: + tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`. + lengths (Tensor or None, optional): + The valid length of each sample in ``tokens`` with shape `(n_batch, )`. + If ``None``, it is assumed that the all the tokens are valid. Default: ``None`` + + Returns: + (Tensor, Tensor, Tensor): + Tensor + The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + Tensor + The length of the predicted mel spectrogram with shape `(n_batch, )`. + Tensor + Sequence of attention weights from the decoder with shape + `(n_batch, max of mel_specgram_lengths, max of lengths)`. + """ + n_batch, max_length = tokens.shape + if lengths is None: + lengths = torch.tensor([max_length]).expand(n_batch).to(tokens.device, tokens.dtype) + + assert lengths is not None # For TorchScript compiler + + embedded_inputs = self.embedding(tokens).transpose(1, 2) + encoder_outputs = self.encoder(embedded_inputs, lengths) + mel_specgram, mel_specgram_lengths, _, alignments = self.decoder.infer(encoder_outputs, lengths) + + mel_outputs_postnet = self.postnet(mel_specgram) + mel_outputs_postnet = mel_specgram + mel_outputs_postnet + + alignments = alignments.unfold(1, n_batch, n_batch).transpose(0, 2) + + return mel_outputs_postnet, mel_specgram_lengths, alignments diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wav2letter.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wav2letter.py new file mode 100644 index 0000000000000000000000000000000000000000..922287002388b2869c77a11adf9d9b18deb8e5bd --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wav2letter.py @@ -0,0 +1,71 @@ +from torch import nn, Tensor + +__all__ = [ + "Wav2Letter", +] + + +class Wav2Letter(nn.Module): + r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech + Recognition System* [:footcite:`collobert2016wav2letter`]. + + :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}` + + Args: + num_classes (int, optional): Number of classes to be classified. 
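# An inference sketch (not part of the patch) for Tacotron2.infer above. The model is
# untrained, so the generated spectrogram is noise; the point is the returned shapes:
# mel (n_batch, n_mels, T), its lengths, and alignments (n_batch, T, max of lengths).
# decoder_max_step is lowered so the sketch terminates quickly even if no stop token fires.
import torch
from torchaudio.models import Tacotron2

model = Tacotron2(decoder_max_step=50).eval()
tokens = torch.randint(0, 148, (1, 30))
lengths = torch.tensor([30])
with torch.no_grad():
    mel, mel_lengths, alignments = model.infer(tokens, lengths)
print(mel.shape, mel_lengths, alignments.shape)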
(Default: ``40``) + input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum`` + or ``mfcc`` (Default: ``waveform``). + num_features (int, optional): Number of input features that the network will receive (Default: ``1``). + """ + + def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None: + super(Wav2Letter, self).__init__() + + acoustic_num_features = 250 if input_type == "waveform" else num_features + acoustic_model = nn.Sequential( + nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0), + nn.ReLU(inplace=True), + ) + + if input_type == "waveform": + waveform_model = nn.Sequential( + nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45), + nn.ReLU(inplace=True), + ) + self.acoustic_model = nn.Sequential(waveform_model, acoustic_model) + + if input_type in ["power_spectrum", "mfcc"]: + self.acoustic_model = acoustic_model + + def forward(self, x: Tensor) -> Tensor: + r""" + Args: + x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length). + + Returns: + Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length). + """ + + x = self.acoustic_model(x) + x = nn.functional.log_softmax(x, dim=1) + return x diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wavernn.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wavernn.py new file mode 100644 index 0000000000000000000000000000000000000000..aafdfaea15e39660abbbeac214eb903ecfd21190 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wavernn.py @@ -0,0 +1,402 @@ +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +__all__ = [ + "ResBlock", + "MelResNet", + "Stretch2d", + "UpsampleNetwork", + "WaveRNN", +] + + +class ResBlock(nn.Module): + r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`]. + + Args: + n_freq: the number of bins in a spectrogram. 
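# A usage sketch (not part of the patch) for the Wav2Letter model defined above, following
# the shapes in its forward() docstring. With input_type="waveform", the strided front-end
# convolution shortens the time axis (here 16000 samples -> 51 output frames).
import torch
from torchaudio.models.wav2letter import Wav2Letter

model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
waveform = torch.rand(3, 1, 16000)   # (batch_size, num_features, input_length)
log_probs = model(waveform)          # log-probabilities over classes per output frame
print(log_probs.shape)               # torch.Size([3, 40, 51])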
(Default: ``128``) + + Examples + >>> resblock = ResBlock() + >>> input = torch.rand(10, 128, 512) # a random spectrogram + >>> output = resblock(input) # shape: (10, 128, 512) + """ + + def __init__(self, n_freq: int = 128) -> None: + super().__init__() + + self.resblock_model = nn.Sequential( + nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False), + nn.BatchNorm1d(n_freq), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False), + nn.BatchNorm1d(n_freq), + ) + + def forward(self, specgram: Tensor) -> Tensor: + r"""Pass the input through the ResBlock layer. + Args: + specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time). + + Return: + Tensor shape: (n_batch, n_freq, n_time) + """ + + return self.resblock_model(specgram) + specgram + + +class MelResNet(nn.Module): + r"""MelResNet layer uses a stack of ResBlocks on spectrogram. + + Args: + n_res_block: the number of ResBlock in stack. (Default: ``10``) + n_freq: the number of bins in a spectrogram. (Default: ``128``) + n_hidden: the number of hidden dimensions of resblock. (Default: ``128``) + n_output: the number of output dimensions of melresnet. (Default: ``128``) + kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``) + + Examples + >>> melresnet = MelResNet() + >>> input = torch.rand(10, 128, 512) # a random spectrogram + >>> output = melresnet(input) # shape: (10, 128, 508) + """ + + def __init__( + self, n_res_block: int = 10, n_freq: int = 128, n_hidden: int = 128, n_output: int = 128, kernel_size: int = 5 + ) -> None: + super().__init__() + + ResBlocks = [ResBlock(n_hidden) for _ in range(n_res_block)] + + self.melresnet_model = nn.Sequential( + nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False), + nn.BatchNorm1d(n_hidden), + nn.ReLU(inplace=True), + *ResBlocks, + nn.Conv1d(in_channels=n_hidden, out_channels=n_output, kernel_size=1), + ) + + def forward(self, specgram: Tensor) -> Tensor: + r"""Pass the input through the MelResNet layer. + Args: + specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time). + + Return: + Tensor shape: (n_batch, n_output, n_time - kernel_size + 1) + """ + + return self.melresnet_model(specgram) + + +class Stretch2d(nn.Module): + r"""Upscale the frequency and time dimensions of a spectrogram. + + Args: + time_scale: the scale factor in time dimension + freq_scale: the scale factor in frequency dimension + + Examples + >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5) + + >>> input = torch.rand(10, 100, 512) # a random spectrogram + >>> output = stretch2d(input) # shape: (10, 500, 5120) + """ + + def __init__(self, time_scale: int, freq_scale: int) -> None: + super().__init__() + + self.freq_scale = freq_scale + self.time_scale = time_scale + + def forward(self, specgram: Tensor) -> Tensor: + r"""Pass the input through the Stretch2d layer. + + Args: + specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time). + + Return: + Tensor shape: (..., n_freq * freq_scale, n_time * time_scale) + """ + + return specgram.repeat_interleave(self.freq_scale, -2).repeat_interleave(self.time_scale, -1) + + +class UpsampleNetwork(nn.Module): + r"""Upscale the dimensions of a spectrogram. + + Args: + upsample_scales: the list of upsample scales. + n_res_block: the number of ResBlock in stack. (Default: ``10``) + n_freq: the number of bins in a spectrogram. 
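# A small shape sketch (not part of the patch) tying together MelResNet and Stretch2d as
# UpsampleNetwork below uses them: MelResNet trims kernel_size - 1 frames, and Stretch2d
# repeats frequency bins and time frames by the given scale factors.
import torch
from torchaudio.models.wavernn import MelResNet, Stretch2d

specgram = torch.rand(2, 128, 20)                   # (n_batch, n_freq, n_time)
resnet_out = MelResNet()(specgram)                  # default kernel_size=5 -> 20 - 4 frames
print(resnet_out.shape)                             # torch.Size([2, 128, 16])

stretched = Stretch2d(time_scale=4, freq_scale=1)(resnet_out)
print(stretched.shape)                              # torch.Size([2, 128, 64])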
(Default: ``128``) + n_hidden: the number of hidden dimensions of resblock. (Default: ``128``) + n_output: the number of output dimensions of melresnet. (Default: ``128``) + kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``) + + Examples + >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16]) + >>> input = torch.rand(10, 128, 10) # a random spectrogram + >>> output = upsamplenetwork(input) # shape: (10, 128, 1536), (10, 128, 1536) + """ + + def __init__( + self, + upsample_scales: List[int], + n_res_block: int = 10, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + kernel_size: int = 5, + ) -> None: + super().__init__() + + total_scale = 1 + for upsample_scale in upsample_scales: + total_scale *= upsample_scale + self.total_scale: int = total_scale + + self.indent = (kernel_size - 1) // 2 * total_scale + self.resnet = MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.resnet_stretch = Stretch2d(total_scale, 1) + + up_layers = [] + for scale in upsample_scales: + stretch = Stretch2d(scale, 1) + conv = nn.Conv2d( + in_channels=1, out_channels=1, kernel_size=(1, scale * 2 + 1), padding=(0, scale), bias=False + ) + torch.nn.init.constant_(conv.weight, 1.0 / (scale * 2 + 1)) + up_layers.append(stretch) + up_layers.append(conv) + self.upsample_layers = nn.Sequential(*up_layers) + + def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]: + r"""Pass the input through the UpsampleNetwork layer. + + Args: + specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time) + + Return: + Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale), + (n_batch, n_output, (n_time - kernel_size + 1) * total_scale) + where total_scale is the product of all elements in upsample_scales. + """ + + resnet_output = self.resnet(specgram).unsqueeze(1) + resnet_output = self.resnet_stretch(resnet_output) + resnet_output = resnet_output.squeeze(1) + + specgram = specgram.unsqueeze(1) + upsampling_output = self.upsample_layers(specgram) + upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent : -self.indent] + + return upsampling_output, resnet_output + + +class WaveRNN(nn.Module): + r"""WaveRNN model based on the implementation from `fatchord `_. + + The original implementation was introduced in *Efficient Neural Audio Synthesis* + [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1. + The product of `upsample_scales` must equal `hop_length`. + + Args: + upsample_scales: the list of upsample scales. + n_classes: the number of output classes. + hop_length: the number of samples between the starts of consecutive frames. + n_res_block: the number of ResBlock in stack. (Default: ``10``) + n_rnn: the dimension of RNN layer. (Default: ``512``) + n_fc: the dimension of fully connected layer. (Default: ``512``) + kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``) + n_freq: the number of bins in a spectrogram. (Default: ``128``) + n_hidden: the number of hidden dimensions of resblock. (Default: ``128``) + n_output: the number of output dimensions of melresnet. 
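# A shape-check sketch (not part of the patch) for UpsampleNetwork above: both outputs span
# (n_time - kernel_size + 1) * total_scale frames, where total_scale is the product of
# upsample_scales (4 * 4 * 16 = 256 here). WaveRNN below requires this product to equal
# hop_length.
import torch
from torchaudio.models.wavernn import UpsampleNetwork

upsample = UpsampleNetwork(upsample_scales=[4, 4, 16])
specgram = torch.rand(2, 128, 10)                   # (n_batch, n_freq, n_time)
upsampled, aux = upsample(specgram)
print(upsampled.shape, aux.shape)                   # both torch.Size([2, 128, 1536])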
(Default: ``128``) + + Example + >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200) + >>> waveform, sample_rate = torchaudio.load(file) + >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) + >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) + >>> output = wavernn(waveform, specgram) + >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes) + """ + + def __init__( + self, + upsample_scales: List[int], + n_classes: int, + hop_length: int, + n_res_block: int = 10, + n_rnn: int = 512, + n_fc: int = 512, + kernel_size: int = 5, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + ) -> None: + super().__init__() + + self.kernel_size = kernel_size + self._pad = (kernel_size - 1 if kernel_size % 2 else kernel_size) // 2 + self.n_rnn = n_rnn + self.n_aux = n_output // 4 + self.hop_length = hop_length + self.n_classes = n_classes + self.n_bits: int = int(math.log2(self.n_classes)) + + total_scale = 1 + for upsample_scale in upsample_scales: + total_scale *= upsample_scale + if total_scale != self.hop_length: + raise ValueError(f"Expected: total_scale == hop_length, but found {total_scale} != {hop_length}") + + self.upsample = UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) + + self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True) + self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True) + + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + + self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc) + self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) + self.fc3 = nn.Linear(n_fc, self.n_classes) + + def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: + r"""Pass the input through the WaveRNN model. 
+ + Args: + waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length) + specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time) + + Return: + Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes) + """ + + assert waveform.size(1) == 1, "Require the input channel of waveform is 1" + assert specgram.size(1) == 1, "Require the input channel of specgram is 1" + # remove channel dimension until the end + waveform, specgram = waveform.squeeze(1), specgram.squeeze(1) + + batch_size = waveform.size(0) + h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + # output of upsample: + # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale) + # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale) + specgram, aux = self.upsample(specgram) + specgram = specgram.transpose(1, 2) + aux = aux.transpose(1, 2) + + aux_idx = [self.n_aux * i for i in range(5)] + a1 = aux[:, :, aux_idx[0] : aux_idx[1]] + a2 = aux[:, :, aux_idx[1] : aux_idx[2]] + a3 = aux[:, :, aux_idx[2] : aux_idx[3]] + a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + + x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1) + x = self.fc(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=-1) + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=-1) + x = self.fc1(x) + x = self.relu1(x) + + x = torch.cat([x, a4], dim=-1) + x = self.fc2(x) + x = self.relu2(x) + x = self.fc3(x) + + # bring back channel dimension + return x.unsqueeze(1) + + @torch.jit.export + def infer(self, specgram: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + r"""Inference method of WaveRNN. + + This function currently only supports multinomial sampling, which assumes the + network is trained on cross entropy loss. + + Args: + specgram (Tensor): + Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`. + lengths (Tensor or None, optional): + Indicates the valid length of each audio in the batch. + Shape: `(batch, )`. + When the ``specgram`` contains spectrograms with different durations, + by providing ``lengths`` argument, the model will compute + the corresponding valid output lengths. + If ``None``, it is assumed that all the audio in ``waveforms`` + have valid length. Default: ``None``. + + Returns: + (Tensor, Optional[Tensor]): + Tensor + The inferred waveform of size `(n_batch, 1, n_time)`. + 1 stands for a single channel. + Tensor or None + If ``lengths`` argument was provided, a Tensor of shape `(batch, )` + is returned. + It indicates the valid length in time axis of the output Tensor. 
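# A minimal shape sketch (not part of the patch) for WaveRNN.forward above, with random
# tensors standing in for torchaudio.load / MelSpectrogram outputs. The product of
# upsample_scales (5 * 5 * 8 = 200) equals hop_length, and both inputs carry one channel.
import torch
from torchaudio.models.wavernn import WaveRNN

wavernn = WaveRNN(upsample_scales=[5, 5, 8], n_classes=512, hop_length=200)
n_time = 10                                           # spectrogram frames
specgram = torch.rand(2, 1, 128, n_time)              # (n_batch, 1, n_freq, n_time)
waveform = torch.rand(2, 1, (n_time - 5 + 1) * 200)   # kernel_size defaults to 5
output = wavernn(waveform, specgram)
print(output.shape)                                   # torch.Size([2, 1, 1200, 512])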
+ """ + + device = specgram.device + dtype = specgram.dtype + + specgram = torch.nn.functional.pad(specgram, (self._pad, self._pad)) + specgram, aux = self.upsample(specgram) + if lengths is not None: + lengths = lengths * self.upsample.total_scale + + output: List[Tensor] = [] + b_size, _, seq_len = specgram.size() + + h1 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype) + h2 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype) + x = torch.zeros((b_size, 1), device=device, dtype=dtype) + + aux_split = [aux[:, self.n_aux * i : self.n_aux * (i + 1), :] for i in range(4)] + + for i in range(seq_len): + + m_t = specgram[:, :, i] + + a1_t, a2_t, a3_t, a4_t = [a[:, :, i] for a in aux_split] + + x = torch.cat([x, m_t, a1_t], dim=1) + x = self.fc(x) + _, h1 = self.rnn1(x.unsqueeze(1), h1) + + x = x + h1[0] + inp = torch.cat([x, a2_t], dim=1) + _, h2 = self.rnn2(inp.unsqueeze(1), h2) + + x = x + h2[0] + x = torch.cat([x, a3_t], dim=1) + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4_t], dim=1) + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + posterior = F.softmax(logits, dim=1) + + x = torch.multinomial(posterior, 1).float() + # Transform label [0, 2 ** n_bits - 1] to waveform [-1, 1] + x = 2 * x / (2**self.n_bits - 1.0) - 1.0 + + output.append(x) + + return torch.stack(output).permute(1, 2, 0), lengths diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d96bf37e4ab10a10463f871e5702982cd6b6da --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/__init__.py @@ -0,0 +1,71 @@ +from ._tts import ( + TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, + TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, + TACOTRON2_WAVERNN_CHAR_LJSPEECH, + TACOTRON2_WAVERNN_PHONE_LJSPEECH, + Tacotron2TTSBundle, +) +from ._wav2vec2.impl import ( + HUBERT_ASR_LARGE, + HUBERT_ASR_XLARGE, + HUBERT_BASE, + HUBERT_LARGE, + HUBERT_XLARGE, + VOXPOPULI_ASR_BASE_10K_DE, + VOXPOPULI_ASR_BASE_10K_EN, + VOXPOPULI_ASR_BASE_10K_ES, + VOXPOPULI_ASR_BASE_10K_FR, + VOXPOPULI_ASR_BASE_10K_IT, + WAV2VEC2_ASR_BASE_100H, + WAV2VEC2_ASR_BASE_10M, + WAV2VEC2_ASR_BASE_960H, + WAV2VEC2_ASR_LARGE_100H, + WAV2VEC2_ASR_LARGE_10M, + WAV2VEC2_ASR_LARGE_960H, + WAV2VEC2_ASR_LARGE_LV60K_100H, + WAV2VEC2_ASR_LARGE_LV60K_10M, + WAV2VEC2_ASR_LARGE_LV60K_960H, + WAV2VEC2_BASE, + WAV2VEC2_LARGE, + WAV2VEC2_LARGE_LV60K, + WAV2VEC2_XLSR53, + Wav2Vec2ASRBundle, + Wav2Vec2Bundle, +) +from .rnnt_pipeline import EMFORMER_RNNT_BASE_LIBRISPEECH, RNNTBundle + + +__all__ = [ + "Wav2Vec2Bundle", + "Wav2Vec2ASRBundle", + "WAV2VEC2_BASE", + "WAV2VEC2_LARGE", + "WAV2VEC2_LARGE_LV60K", + "WAV2VEC2_ASR_BASE_10M", + "WAV2VEC2_ASR_BASE_100H", + "WAV2VEC2_ASR_BASE_960H", + "WAV2VEC2_ASR_LARGE_10M", + "WAV2VEC2_ASR_LARGE_100H", + "WAV2VEC2_ASR_LARGE_960H", + "WAV2VEC2_ASR_LARGE_LV60K_10M", + "WAV2VEC2_ASR_LARGE_LV60K_100H", + "WAV2VEC2_ASR_LARGE_LV60K_960H", + "WAV2VEC2_XLSR53", + "VOXPOPULI_ASR_BASE_10K_EN", + "VOXPOPULI_ASR_BASE_10K_ES", + "VOXPOPULI_ASR_BASE_10K_DE", + "VOXPOPULI_ASR_BASE_10K_FR", + "VOXPOPULI_ASR_BASE_10K_IT", + "HUBERT_BASE", + "HUBERT_LARGE", + "HUBERT_XLARGE", + "HUBERT_ASR_LARGE", + "HUBERT_ASR_XLARGE", + "Tacotron2TTSBundle", + "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH", + "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH", + "TACOTRON2_WAVERNN_CHAR_LJSPEECH", + 
"TACOTRON2_WAVERNN_PHONE_LJSPEECH", + "RNNTBundle", + "EMFORMER_RNNT_BASE_LIBRISPEECH", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/rnnt_pipeline.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/rnnt_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..d6cb403eb65c0dbae77fcea6ef2b280b30c0565d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/rnnt_pipeline.py @@ -0,0 +1,380 @@ +import json +import math +from abc import ABC, abstractmethod +from dataclasses import dataclass +from functools import partial +from typing import Callable, List, Tuple + +import torch +import torchaudio +from torchaudio._internal import module_utils +from torchaudio.models import emformer_rnnt_base, RNNT, RNNTBeamSearch + + +__all__ = [] + +_decibel = 2 * 20 * math.log10(torch.iinfo(torch.int16).max) +_gain = pow(10, 0.05 * _decibel) + + +def _piecewise_linear_log(x): + x[x > math.e] = torch.log(x[x > math.e]) + x[x <= math.e] = x[x <= math.e] / math.e + return x + + +class _FunctionalModule(torch.nn.Module): + def __init__(self, functional): + super().__init__() + self.functional = functional + + def forward(self, input): + return self.functional(input) + + +class _GlobalStatsNormalization(torch.nn.Module): + def __init__(self, global_stats_path): + super().__init__() + + with open(global_stats_path) as f: + blob = json.loads(f.read()) + + self.register_buffer("mean", torch.tensor(blob["mean"])) + self.register_buffer("invstddev", torch.tensor(blob["invstddev"])) + + def forward(self, input): + return (input - self.mean) * self.invstddev + + +class _FeatureExtractor(ABC): + @abstractmethod + def __call__(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Generates features and length output from the given input tensor. + + Args: + input (torch.Tensor): input tensor. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor: + Features, with shape `(length, *)`. + torch.Tensor: + Length, with shape `(1,)`. + """ + + +class _TokenProcessor(ABC): + @abstractmethod + def __call__(self, tokens: List[int], **kwargs) -> str: + """Decodes given list of tokens to text sequence. + + Args: + tokens (List[int]): list of tokens to decode. + + Returns: + str: + Decoded text sequence. + """ + + +class _ModuleFeatureExtractor(torch.nn.Module, _FeatureExtractor): + """``torch.nn.Module``-based feature extraction pipeline. + + Args: + pipeline (torch.nn.Module): module that implements feature extraction logic. + """ + + def __init__(self, pipeline: torch.nn.Module) -> None: + super().__init__() + self.pipeline = pipeline + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Generates features and length output from the given input tensor. + + Args: + input (torch.Tensor): input tensor. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor: + Features, with shape `(length, *)`. + torch.Tensor: + Length, with shape `(1,)`. + """ + features = self.pipeline(input) + length = torch.tensor([features.shape[0]]) + return features, length + + +class _SentencePieceTokenProcessor(_TokenProcessor): + """SentencePiece-model-based token processor. + + Args: + sp_model_path (str): path to SentencePiece model. + """ + + def __init__(self, sp_model_path: str) -> None: + if not module_utils.is_module_available("sentencepiece"): + raise RuntimeError("SentencePiece is not available. 
Please install it.") + + import sentencepiece as spm + + self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path) + self.post_process_remove_list = { + self.sp_model.unk_id(), + self.sp_model.eos_id(), + self.sp_model.pad_id(), + } + + def __call__(self, tokens: List[int], lstrip: bool = True) -> str: + """Decodes given list of tokens to text sequence. + + Args: + tokens (List[int]): list of tokens to decode. + lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace + removed. (Default: ``True``). + + Returns: + str: + Decoded text sequence. + """ + filtered_hypo_tokens = [ + token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list + ] + output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ") + + if lstrip: + return output_string.lstrip() + else: + return output_string + + +@dataclass +class RNNTBundle: + """torchaudio.pipelines.RNNTBundle() + + Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text) + inference with an RNN-T model. + + More specifically, the class provides methods that produce the featurization pipeline, + decoder wrapping the specified RNN-T model, and output token post-processor that together + constitute a complete end-to-end ASR inference pipeline that produces a text sequence + given a raw waveform. + + It can support non-streaming (full-context) inference as well as streaming inference. + + Users should not directly instantiate objects of this class; rather, users should use the + instances (representing pre-trained models) that exist within the module, + e.g. :py:obj:`EMFORMER_RNNT_BASE_LIBRISPEECH`. + + Example + >>> import torchaudio + >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH + >>> import torch + >>> + >>> # Non-streaming inference. + >>> # Build feature extractor, decoder with RNN-T model, and token processor. + >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor() + 100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s] + >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder() + Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt" + 100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s] + >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor() + 100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s] + >>> + >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample. + >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean") + >>> waveform = next(iter(dataset))[0].squeeze() + >>> + >>> with torch.no_grad(): + >>> # Produce mel-scale spectrogram features. + >>> features, length = feature_extractor(waveform) + >>> + >>> # Generate top-10 hypotheses. + >>> hypotheses = decoder(features, length, 10) + >>> + >>> # For top hypothesis, convert predicted tokens to text. + >>> text = token_processor(hypotheses[0][0]) + >>> print(text) + he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...] + >>> + >>> + >>> # Streaming inference. 
+ >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length + >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length + >>> num_samples_segment_right_context = ( + >>> num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length + >>> ) + >>> + >>> # Build streaming inference feature extractor. + >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor() + >>> + >>> # Process same waveform as before, this time sequentially across overlapping segments + >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``. + >>> state, hypothesis = None, None + >>> for idx in range(0, len(waveform), num_samples_segment): + >>> segment = waveform[idx: idx + num_samples_segment_right_context] + >>> segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment))) + >>> with torch.no_grad(): + >>> features, length = streaming_feature_extractor(segment) + >>> hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis) + >>> hypothesis = hypotheses[0] + >>> transcript = token_processor(hypothesis[0]) + >>> if transcript: + >>> print(transcript, end=" ", flush=True) + he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...] + """ + + class FeatureExtractor(_FeatureExtractor): + pass + + class TokenProcessor(_TokenProcessor): + pass + + _rnnt_path: str + _rnnt_factory_func: Callable[[], RNNT] + _global_stats_path: str + _sp_model_path: str + _right_padding: int + _blank: int + _sample_rate: int + _n_fft: int + _n_mels: int + _hop_length: int + _segment_length: int + _right_context_length: int + + def _get_model(self) -> RNNT: + model = self._rnnt_factory_func() + path = torchaudio.utils.download_asset(self._rnnt_path) + state_dict = torch.load(path) + model.load_state_dict(state_dict) + model.eval() + return model + + @property + def sample_rate(self) -> int: + """Sample rate (in cycles per second) of input waveforms. + + :type: int + """ + return self._sample_rate + + @property + def n_fft(self) -> int: + """Size of FFT window to use. + + :type: int + """ + return self._n_fft + + @property + def n_mels(self) -> int: + """Number of mel spectrogram features to extract from input waveforms. + + :type: int + """ + return self._n_mels + + @property + def hop_length(self) -> int: + """Number of samples between successive frames in input expected by model. + + :type: int + """ + return self._hop_length + + @property + def segment_length(self) -> int: + """Number of frames in segment in input expected by model. + + :type: int + """ + return self._segment_length + + @property + def right_context_length(self) -> int: + """Number of frames in right contextual block in input expected by model. + + :type: int + """ + return self._right_context_length + + def get_decoder(self) -> RNNTBeamSearch: + """Constructs RNN-T decoder. + + Returns: + RNNTBeamSearch + """ + model = self._get_model() + return RNNTBeamSearch(model, self._blank) + + def get_feature_extractor(self) -> FeatureExtractor: + """Constructs feature extractor for non-streaming (full-context) ASR. 
+ + Returns: + FeatureExtractor + """ + local_path = torchaudio.utils.download_asset(self._global_stats_path) + return _ModuleFeatureExtractor( + torch.nn.Sequential( + torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length + ), + _FunctionalModule(lambda x: x.transpose(1, 0)), + _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)), + _GlobalStatsNormalization(local_path), + _FunctionalModule(lambda x: torch.nn.functional.pad(x, (0, 0, 0, self._right_padding))), + ) + ) + + def get_streaming_feature_extractor(self) -> FeatureExtractor: + """Constructs feature extractor for streaming (simultaneous) ASR. + + Returns: + FeatureExtractor + """ + local_path = torchaudio.utils.download_asset(self._global_stats_path) + return _ModuleFeatureExtractor( + torch.nn.Sequential( + torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length + ), + _FunctionalModule(lambda x: x.transpose(1, 0)), + _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)), + _GlobalStatsNormalization(local_path), + ) + ) + + def get_token_processor(self) -> TokenProcessor: + """Constructs token processor. + + Returns: + TokenProcessor + """ + local_path = torchaudio.utils.download_asset(self._sp_model_path) + return _SentencePieceTokenProcessor(local_path) + + +EMFORMER_RNNT_BASE_LIBRISPEECH = RNNTBundle( + _rnnt_path="models/emformer_rnnt_base_librispeech.pt", + _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=4097), + _global_stats_path="pipeline-assets/global_stats_rnnt_librispeech.json", + _sp_model_path="pipeline-assets/spm_bpe_4096_librispeech.model", + _right_padding=4, + _blank=4096, + _sample_rate=16000, + _n_fft=400, + _n_mels=80, + _hop_length=160, + _segment_length=16, + _right_context_length=4, +) +EMFORMER_RNNT_BASE_LIBRISPEECH.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference. + + The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base` + and utilizes weights trained on LibriSpeech using training script ``train.py`` + `here `__ with default arguments. + + Please refer to :py:class:`RNNTBundle` for usage instructions. 
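# A tiny numeric sketch (not part of the patch) of the _piecewise_linear_log compression
# used by the feature extractors above: values greater than e are log-compressed, values
# at or below e are scaled linearly, and the two pieces meet at 1.0 when x == e.
import math
import torch

x = torch.tensor([0.0, math.e / 2, math.e, 10.0, 1000.0])
y = x.clone()                       # _piecewise_linear_log modifies its input in place
y[y > math.e] = torch.log(y[y > math.e])
y[y <= math.e] = y[y <= math.e] / math.e
print(y)                            # tensor([0.0000, 0.5000, 1.0000, 2.3026, 6.9078])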
+ """ diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..527da5c7d2859c74b4207863f2c47edda0a62a05 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/__init__.py @@ -0,0 +1,57 @@ +from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR +from ._transforms import ( + AmplitudeToDB, + ComputeDeltas, + Fade, + FrequencyMasking, + GriffinLim, + InverseMelScale, + InverseSpectrogram, + LFCC, + MelScale, + MelSpectrogram, + MFCC, + MuLawDecoding, + MuLawEncoding, + PitchShift, + Resample, + RNNTLoss, + SlidingWindowCmn, + SpectralCentroid, + Spectrogram, + TimeMasking, + TimeStretch, + Vad, + Vol, +) + + +__all__ = [ + "AmplitudeToDB", + "ComputeDeltas", + "Fade", + "FrequencyMasking", + "GriffinLim", + "InverseMelScale", + "InverseSpectrogram", + "LFCC", + "MFCC", + "MVDR", + "MelScale", + "MelSpectrogram", + "MuLawDecoding", + "MuLawEncoding", + "PSD", + "PitchShift", + "RNNTLoss", + "RTFMVDR", + "Resample", + "SlidingWindowCmn", + "SoudenMVDR", + "SpectralCentroid", + "Spectrogram", + "TimeMasking", + "TimeStretch", + "Vad", + "Vol", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_multi_channel.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_multi_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..1a97dbc27345c3345612f2b18efbb7b1e049bc4e --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_multi_channel.py @@ -0,0 +1,464 @@ +# -*- coding: utf-8 -*- + +import warnings +from typing import Optional, Union + +import torch +from torch import Tensor +from torchaudio import functional as F + + +__all__ = [] + + +def _get_mvdr_vector( + psd_s: torch.Tensor, + psd_n: torch.Tensor, + reference_vector: torch.Tensor, + solution: str = "ref_channel", + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> torch.Tensor: + r"""Compute the MVDR beamforming weights with ``solution`` argument. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_vector (torch.Tensor): one-hot reference channel matrix. + solution (str, optional): Solution to compute the MVDR beamforming weights. + Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``) + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. 
+ (Default: ``1e-8``) + + Returns: + torch.Tensor: the mvdr beamforming weight matrix + """ + if solution == "ref_channel": + beamform_vector = F.mvdr_weights_souden(psd_s, psd_n, reference_vector, diagonal_loading, diag_eps, eps) + else: + if solution == "stv_evd": + stv = F.rtf_evd(psd_s) + else: + stv = F.rtf_power(psd_s, psd_n, reference_vector, diagonal_loading=diagonal_loading, diag_eps=diag_eps) + beamform_vector = F.mvdr_weights_rtf(stv, psd_n, reference_vector, diagonal_loading, diag_eps, eps) + + return beamform_vector + + +class PSD(torch.nn.Module): + r"""Compute cross-channel power spectral density (PSD) matrix. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``) + normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``) + eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``) + """ + + def __init__(self, multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15): + super().__init__() + self.multi_mask = multi_mask + self.normalize = normalize + self.eps = eps + + def forward(self, specgram: torch.Tensor, mask: Optional[torch.Tensor] = None): + """ + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + mask (torch.Tensor or None, optional): Time-Frequency mask for normalization. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` or + with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + (Default: ``None``) + + Returns: + torch.Tensor: The complex-valued PSD matrix of the input spectrum. + Tensor with dimensions `(..., freq, channel, channel)` + """ + if mask is not None: + if self.multi_mask: + # Averaging mask along channel dimension + mask = mask.mean(dim=-3) # (..., freq, time) + psd = F.psd(specgram, mask, self.normalize, self.eps) + + return psd + + +class MVDR(torch.nn.Module): + """Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py + + We provide three solutions of MVDR beamforming. One is based on *reference channel selection* + [:footcite:`souden2009optimal`] (``solution=ref_channel``). + + .. math:: + \\textbf{w}_{\\text{MVDR}}(f) =\ + \\frac{{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bf{\\Phi}_{\\textbf{SS}}}}(f)}\ + {\\text{Trace}({{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f) \\bf{\\Phi}_{\\textbf{SS}}}(f))}}\\bm{u} + + where :math:`\\bf{\\Phi}_{\\textbf{SS}}` and :math:`\\bf{\\Phi}_{\\textbf{NN}}` are the covariance\ + matrices of speech and noise, respectively. :math:`\\bf{u}` is an one-hot vector to determine the\ + reference channel. + + The other two solutions are based on the steering vector (``solution=stv_evd`` or ``solution=stv_power``). + + .. math:: + \\textbf{w}_{\\text{MVDR}}(f) =\ + \\frac{{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bm{v}}(f)}}\ + {{\\bm{v}^{\\mathsf{H}}}(f){\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bm{v}}(f)} + + where :math:`\\bm{v}` is the acoustic transfer function or the steering vector.\ + :math:`.^{\\mathsf{H}}` denotes the Hermitian Conjugate operation. 
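# A usage sketch (not part of the patch) for the PSD module above: a multi-channel complex
# spectrogram and a Time-Frequency mask produce a (..., freq, channel, channel) covariance
# matrix. Random tensors stand in for an STFT and a mask estimate.
import torch
from torchaudio.transforms import PSD

psd_transform = PSD()
specgram = torch.complex(torch.rand(6, 201, 100), torch.rand(6, 201, 100))  # (channel, freq, time)
mask = torch.rand(201, 100)                                                 # (freq, time)
psd = psd_transform(specgram, mask)
print(psd.shape)  # torch.Size([201, 6, 6])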
+ + We apply either *eigenvalue decomposition* + [:footcite:`higuchi2016robust`] or the *power method* [:footcite:`mises1929praktische`] to get the + steering vector from the PSD matrix of speech. + + After estimating the beamforming weight, the enhanced Short-time Fourier Transform (STFT) is obtained by + + .. math:: + \\hat{\\bf{S}} = {\\bf{w}^\\mathsf{H}}{\\bf{Y}}, {\\bf{w}} \\in \\mathbb{C}^{M \\times F} + + where :math:`\\bf{Y}` and :math:`\\hat{\\bf{S}}` are the STFT of the multi-channel noisy speech and\ + the single-channel enhanced speech, respectively. + + For online streaming audio, we provide a *recursive method* [:footcite:`higuchi2017online`] to update the + PSD matrices of speech and noise, respectively. + + Args: + ref_channel (int, optional): Reference channel for beamforming. (Default: ``0``) + solution (str, optional): Solution to compute the MVDR beamforming weights. + Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``) + multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``) + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to the covariance matrix + of the noise. (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + online (bool, optional): If ``True``, updates the MVDR beamforming weights based on + the previous covarience matrices. (Default: ``False``) + + Note: + To improve the numerical stability, the input spectrogram will be converted to double precision + (``torch.complex128`` or ``torch.cdouble``) dtype for internal computation. The output spectrogram + is converted to the dtype of the input spectrogram to be compatible with other modules. + + Note: + If you use ``stv_evd`` solution, the gradient of the same input may not be identical if the + eigenvalues of the PSD matrix are not distinct (i.e. some eigenvalues are close or identical). + """ + + def __init__( + self, + ref_channel: int = 0, + solution: str = "ref_channel", + multi_mask: bool = False, + diag_loading: bool = True, + diag_eps: float = 1e-7, + online: bool = False, + ): + super().__init__() + assert solution in [ + "ref_channel", + "stv_evd", + "stv_power", + ], "Unknown solution provided. Must be one of [``ref_channel``, ``stv_evd``, ``stv_power``]." + self.ref_channel = ref_channel + self.solution = solution + self.multi_mask = multi_mask + self.diag_loading = diag_loading + self.diag_eps = diag_eps + self.online = online + self.psd = PSD(multi_mask) + + psd_s: torch.Tensor = torch.zeros(1) + psd_n: torch.Tensor = torch.zeros(1) + mask_sum_s: torch.Tensor = torch.zeros(1) + mask_sum_n: torch.Tensor = torch.zeros(1) + self.register_buffer("psd_s", psd_s) + self.register_buffer("psd_n", psd_n) + self.register_buffer("mask_sum_s", mask_sum_s) + self.register_buffer("mask_sum_n", mask_sum_n) + + def _get_updated_mvdr_vector( + self, + psd_s: torch.Tensor, + psd_n: torch.Tensor, + mask_s: torch.Tensor, + mask_n: torch.Tensor, + reference_vector: torch.Tensor, + solution: str = "ref_channel", + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, + ) -> torch.Tensor: + r"""Recursively update the MVDR beamforming vector. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. 
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + mask_s (torch.Tensor): Time-Frequency mask of the target speech. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + reference_vector (torch.Tensor): One-hot reference channel matrix. + solution (str, optional): Solution to compute the MVDR beamforming weights. + Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``) + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The MVDR beamforming weight matrix. + """ + if self.multi_mask: + # Averaging mask along channel dimension + mask_s = mask_s.mean(dim=-3) # (..., freq, time) + mask_n = mask_n.mean(dim=-3) # (..., freq, time) + if self.psd_s.ndim == 1: + self.psd_s = psd_s + self.psd_n = psd_n + self.mask_sum_s = mask_s.sum(dim=-1) + self.mask_sum_n = mask_n.sum(dim=-1) + return _get_mvdr_vector(psd_s, psd_n, reference_vector, solution, diagonal_loading, diag_eps, eps) + else: + psd_s = self._get_updated_psd_speech(psd_s, mask_s) + psd_n = self._get_updated_psd_noise(psd_n, mask_n) + self.psd_s = psd_s + self.psd_n = psd_n + self.mask_sum_s = self.mask_sum_s + mask_s.sum(dim=-1) + self.mask_sum_n = self.mask_sum_n + mask_n.sum(dim=-1) + return _get_mvdr_vector(psd_s, psd_n, reference_vector, solution, diagonal_loading, diag_eps, eps) + + def _get_updated_psd_speech(self, psd_s: torch.Tensor, mask_s: torch.Tensor) -> torch.Tensor: + r"""Update psd of speech recursively. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + mask_s (torch.Tensor): Time-Frequency mask of the target speech. + Tensor with dimensions `(..., freq, time)`. + + Returns: + torch.Tensor: The updated PSD matrix of target speech. + """ + numerator = self.mask_sum_s / (self.mask_sum_s + mask_s.sum(dim=-1)) + denominator = 1 / (self.mask_sum_s + mask_s.sum(dim=-1)) + psd_s = self.psd_s * numerator[..., None, None] + psd_s * denominator[..., None, None] + return psd_s + + def _get_updated_psd_noise(self, psd_n: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor: + r"""Update psd of noise recursively. + + Args: + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise. + Tensor with dimensions `(..., freq, time)`. + + Returns: + torch.Tensor: The updated PSD matrix of noise. 
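# A usage sketch (not part of the patch) for the MVDR module described above, with random
# tensors standing in for an STFT and speech/noise Time-Frequency masks. Shapes follow the
# module's documentation: specgram is (..., channel, freq, time), each mask is
# (..., freq, time), and the result is the single-channel enhanced spectrum.
import torch
from torchaudio.transforms import MVDR

mvdr = MVDR(ref_channel=0, solution="ref_channel")
specgram = torch.complex(torch.rand(6, 201, 100), torch.rand(6, 201, 100))  # (channel, freq, time)
mask_s = torch.rand(201, 100)   # mask for target speech
mask_n = torch.rand(201, 100)   # mask for noise
enhanced = mvdr(specgram, mask_s, mask_n)
print(enhanced.shape)  # torch.Size([201, 100])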
+ """ + numerator = self.mask_sum_n / (self.mask_sum_n + mask_n.sum(dim=-1)) + denominator = 1 / (self.mask_sum_n + mask_n.sum(dim=-1)) + psd_n = self.psd_n * numerator[..., None, None] + psd_n * denominator[..., None, None] + return psd_n + + def forward( + self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Perform MVDR beamforming. + + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)` + mask_s (torch.Tensor): Time-Frequency mask of target speech. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + mask_n (torch.Tensor or None, optional): Time-Frequency mask of noise. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + (Default: None) + + Returns: + torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`. + """ + dtype = specgram.dtype + if specgram.ndim < 3: + raise ValueError(f"Expected at least 3D tensor (..., channel, freq, time). Found: {specgram.shape}") + if not specgram.is_complex(): + raise ValueError( + f"The type of ``specgram`` tensor must be ``torch.cfloat`` or ``torch.cdouble``.\ + Found: {specgram.dtype}" + ) + if specgram.dtype == torch.cfloat: + specgram = specgram.cdouble() # Convert specgram to ``torch.cdouble``. + + if mask_n is None: + warnings.warn("``mask_n`` is not provided, use ``1 - mask_s`` as ``mask_n``.") + mask_n = 1 - mask_s + + psd_s = self.psd(specgram, mask_s) # (..., freq, time, channel, channel) + psd_n = self.psd(specgram, mask_n) # (..., freq, time, channel, channel) + + u = torch.zeros(specgram.size()[:-2], device=specgram.device, dtype=torch.cdouble) # (..., channel) + u[..., self.ref_channel].fill_(1) + + if self.online: + w_mvdr = self._get_updated_mvdr_vector( + psd_s, psd_n, mask_s, mask_n, u, self.solution, self.diag_loading, self.diag_eps + ) + else: + w_mvdr = _get_mvdr_vector(psd_s, psd_n, u, self.solution, self.diag_loading, self.diag_eps) + + specgram_enhanced = F.apply_beamforming(w_mvdr, specgram) + + return specgram_enhanced.to(dtype) + + +class RTFMVDR(torch.nn.Module): + r"""Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) module + based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the relative transfer function (RTF) matrix + or the steering vector of target speech :math:`\bm{v}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and + a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel + complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as: + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + + where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin, + :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation. + + The beamforming weight is computed by: + + .. 
math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}} + {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)} + """ + + def forward( + self, + specgram: Tensor, + rtf: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, + ) -> Tensor: + """ + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)` + rtf (torch.Tensor): The complex-valued RTF vector of target speech. + Tensor with dimensions `(..., freq, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`. + """ + w_mvdr = F.mvdr_weights_rtf(rtf, psd_n, reference_channel, diagonal_loading, diag_eps, eps) + spectrum_enhanced = F.apply_beamforming(w_mvdr, specgram) + return spectrum_enhanced + + +class SoudenMVDR(torch.nn.Module): + r"""Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) module + based on the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the power spectral density (PSD) matrix + of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and + a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel + complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as: + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + + where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin. + + The beamforming weight is computed by: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)} + {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u} + """ + + def forward( + self, + specgram: Tensor, + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, + ) -> torch.Tensor: + """ + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. 
+ Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`. + """ + w_mvdr = F.mvdr_weights_souden(psd_s, psd_n, reference_channel, diagonal_loading, diag_eps, eps) + spectrum_enhanced = F.apply_beamforming(w_mvdr, specgram) + return spectrum_enhanced diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_transforms.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..489901f9eabfb774bac5adf539a7345785dc264a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_transforms.py @@ -0,0 +1,1693 @@ +# -*- coding: utf-8 -*- + +import math +import warnings +from typing import Callable, Optional + +import torch +from torch import Tensor +from torch.nn.modules.lazy import LazyModuleMixin +from torch.nn.parameter import UninitializedParameter + +from torchaudio import functional as F +from torchaudio.functional.functional import ( + _apply_sinc_resample_kernel, + _get_sinc_resample_kernel, + _stretch_waveform, + _fix_waveform_shape, +) + +__all__ = [] + + +class Spectrogram(torch.nn.Module): + r"""Create a spectrogram from a audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float or None, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. + If None, then the complex spectrum is returned instead. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided (bool, optional): controls whether to return half of results to + avoid redundancy (Default: ``True``) + return_complex (bool, optional): + Deprecated and not used. 
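# --- Editor's note: minimal usage sketch, not part of the diff above. ---
# It illustrates the Souden-MVDR path defined earlier (PSD -> SoudenMVDR).
# The random spectrum and the complementary masks below are placeholders,
# not a recommended way to obtain Time-Frequency masks.
import torch
import torchaudio.transforms as T

batch, channel, freq, time = 1, 4, 257, 100
specgram = torch.randn(batch, channel, freq, time, dtype=torch.cdouble)
mask_s = torch.rand(batch, freq, time)   # mask of target speech (placeholder)
mask_n = 1.0 - mask_s                    # mask of noise

psd = T.PSD()
psd_s = psd(specgram, mask_s)            # (..., freq, channel, channel)
psd_n = psd(specgram, mask_n)

mvdr = T.SoudenMVDR()
enhanced = mvdr(specgram, psd_s, psd_n, reference_channel=0)  # (..., freq, time)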
+ + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = torchaudio.transforms.Spectrogram(n_fft=800) + >>> spectrogram = transform(waveform) + + """ + __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"] + + def __init__( + self, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: Optional[float] = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, + ) -> None: + super(Spectrogram, self).__init__() + self.n_fft = n_fft + # number of FFT bins. the returned STFT result will have n_fft // 2 + 1 + # number of frequencies due to onesided=True in torch.stft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + self.power = power + self.normalized = normalized + self.center = center + self.pad_mode = pad_mode + self.onesided = onesided + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Dimension (..., freq, time), where freq is + ``n_fft // 2 + 1`` where ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + return F.spectrogram( + waveform, + self.pad, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.power, + self.normalized, + self.center, + self.pad_mode, + self.onesided, + ) + + +class InverseSpectrogram(torch.nn.Module): + r"""Create an inverse spectrogram to recover an audio signal from a spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + normalized (bool, optional): Whether the spectrogram was normalized by magnitude after stft. + (Default: ``False``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether the signal in spectrogram was padded on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. 
(Default: ``"reflect"``) + onesided (bool, optional): controls whether spectrogram was used to return half of results to + avoid redundancy (Default: ``True``) + + Example + >>> batch, freq, time = 2, 257, 100 + >>> length = 25344 + >>> spectrogram = torch.randn(batch, freq, time, dtype=torch.cdouble) + >>> transform = transforms.InverseSpectrogram(n_fft=512) + >>> waveform = transform(spectrogram, length) + """ + __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"] + + def __init__( + self, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + ) -> None: + super(InverseSpectrogram, self).__init__() + self.n_fft = n_fft + # number of FFT bins. the returned STFT result will have n_fft // 2 + 1 + # number of frequencies due to onesided=True in torch.stft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + self.normalized = normalized + self.center = center + self.pad_mode = pad_mode + self.onesided = onesided + + def forward(self, spectrogram: Tensor, length: Optional[int] = None) -> Tensor: + r""" + Args: + spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time). + length (int or None, optional): The output length of the waveform. + + Returns: + Tensor: Dimension (..., time), Least squares estimation of the original signal. + """ + return F.inverse_spectrogram( + spectrogram, + length, + self.pad, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.normalized, + self.center, + self.pad_mode, + self.onesided, + ) + + +class GriffinLim(torch.nn.Module): + r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Implementation ported from + *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`] + and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`]. + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + n_iter (int, optional): Number of iteration for phase recovery process. (Default: ``32``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + momentum (float, optional): The momentum parameter for fast Griffin-Lim. + Setting this to 0 recovers the original Griffin-Lim method. + Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``) + length (int, optional): Array length of the expected output. 
(Default: ``None``) + rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``) + + Example + >>> batch, freq, time = 2, 257, 100 + >>> spectrogram = torch.randn(batch, freq, time) + >>> transform = transforms.GriffinLim(n_fft=512) + >>> waveform = transform(spectrogram) + """ + __constants__ = ["n_fft", "n_iter", "win_length", "hop_length", "power", "length", "momentum", "rand_init"] + + def __init__( + self, + n_fft: int = 400, + n_iter: int = 32, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: float = 2.0, + wkwargs: Optional[dict] = None, + momentum: float = 0.99, + length: Optional[int] = None, + rand_init: bool = True, + ) -> None: + super(GriffinLim, self).__init__() + + assert momentum < 1, "momentum={} > 1 can be unstable".format(momentum) + assert momentum >= 0, "momentum={} < 0".format(momentum) + + self.n_fft = n_fft + self.n_iter = n_iter + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.length = length + self.power = power + self.momentum = momentum / (1 + momentum) + self.rand_init = rand_init + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): + A magnitude-only STFT spectrogram of dimension (..., freq, frames) + where freq is ``n_fft // 2 + 1``. + + Returns: + Tensor: waveform of (..., time), where time equals the ``length`` parameter if given. + """ + return F.griffinlim( + specgram, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.power, + self.n_iter, + self.momentum, + self.length, + self.rand_init, + ) + + +class AmplitudeToDB(torch.nn.Module): + r"""Turn a tensor from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This output depends on the maximum value in the input tensor, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + stype (str, optional): scale of input tensor (``'power'`` or ``'magnitude'``). The + power being the elementwise square of the magnitude. (Default: ``'power'``) + top_db (float or None, optional): minimum negative cut-off in decibels. A reasonable + number is 80. (Default: ``None``) + """ + __constants__ = ["multiplier", "amin", "ref_value", "db_multiplier"] + + def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None: + super(AmplitudeToDB, self).__init__() + self.stype = stype + if top_db is not None and top_db < 0: + raise ValueError("top_db must be positive value") + self.top_db = top_db + self.multiplier = 10.0 if stype == "power" else 20.0 + self.amin = 1e-10 + self.ref_value = 1.0 + self.db_multiplier = math.log10(max(self.amin, self.ref_value)) + + def forward(self, x: Tensor) -> Tensor: + r"""Numerically stable implementation from Librosa. + + https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html + + Args: + x (Tensor): Input tensor before being converted to decibel scale. + + Returns: + Tensor: Output tensor in decibel scale. 
+ """ + return F.amplitude_to_DB(x, self.multiplier, self.amin, self.db_multiplier, self.top_db) + + +class MelScale(torch.nn.Module): + r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) + n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``) + norm (str or None, optional): If ``'slaney'``, divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"] + + def __init__( + self, + n_mels: int = 128, + sample_rate: int = 16000, + f_min: float = 0.0, + f_max: Optional[float] = None, + n_stft: int = 201, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelScale, self).__init__() + self.n_mels = n_mels + self.sample_rate = sample_rate + self.f_max = f_max if f_max is not None else float(sample_rate // 2) + self.f_min = f_min + self.norm = norm + self.mel_scale = mel_scale + + assert f_min <= self.f_max, "Require f_min: {} < f_max: {}".format(f_min, self.f_max) + fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm, self.mel_scale) + self.register_buffer("fb", fb) + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + + # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time) + mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2) + + return mel_specgram + + +class InverseMelScale(torch.nn.Module): + r"""Estimate a STFT in normal frequency domain from mel frequency domain. + + .. devices:: CPU CUDA + + It minimizes the euclidian norm between the input mel-spectrogram and the product between + the estimated spectrogram and the filter banks using SGD. + + Args: + n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) + max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``) + tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``) + tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``) + sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``) + norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. 
(Default: ``htk``) + """ + __constants__ = [ + "n_stft", + "n_mels", + "sample_rate", + "f_min", + "f_max", + "max_iter", + "tolerance_loss", + "tolerance_change", + "sgdargs", + ] + + def __init__( + self, + n_stft: int, + n_mels: int = 128, + sample_rate: int = 16000, + f_min: float = 0.0, + f_max: Optional[float] = None, + max_iter: int = 100000, + tolerance_loss: float = 1e-5, + tolerance_change: float = 1e-8, + sgdargs: Optional[dict] = None, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(InverseMelScale, self).__init__() + self.n_mels = n_mels + self.sample_rate = sample_rate + self.f_max = f_max or float(sample_rate // 2) + self.f_min = f_min + self.max_iter = max_iter + self.tolerance_loss = tolerance_loss + self.tolerance_change = tolerance_change + self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9} + + assert f_min <= self.f_max, "Require f_min: {} < f_max: {}".format(f_min, self.f_max) + + fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale) + self.register_buffer("fb", fb) + + def forward(self, melspec: Tensor) -> Tensor: + r""" + Args: + melspec (Tensor): A Mel frequency spectrogram of dimension (..., ``n_mels``, time) + + Returns: + Tensor: Linear scale spectrogram of size (..., freq, time) + """ + # pack batch + shape = melspec.size() + melspec = melspec.view(-1, shape[-2], shape[-1]) + + n_mels, time = shape[-2], shape[-1] + freq, _ = self.fb.size() # (freq, n_mels) + melspec = melspec.transpose(-1, -2) + assert self.n_mels == n_mels + + specgram = torch.rand( + melspec.size()[0], time, freq, requires_grad=True, dtype=melspec.dtype, device=melspec.device + ) + + optim = torch.optim.SGD([specgram], **self.sgdargs) + + loss = float("inf") + for _ in range(self.max_iter): + optim.zero_grad() + diff = melspec - specgram.matmul(self.fb) + new_loss = diff.pow(2).sum(axis=-1).mean() + # take sum over mel-frequency then average over other dimensions + # so that loss threshold is applied par unit timeframe + new_loss.backward() + optim.step() + specgram.data = specgram.data.clamp(min=0) + + new_loss = new_loss.item() + if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change: + break + loss = new_loss + + specgram.requires_grad_(False) + specgram = specgram.clamp(min=0).transpose(-1, -2) + + # unpack batch + specgram = specgram.view(shape[:-2] + (freq, time)) + return specgram + + +class MelSpectrogram(torch.nn.Module): + r"""Create MelSpectrogram for a raw audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and + and :py:func:`torchaudio.transforms.MelScale`. + + Sources + * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe + * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html + * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``None``) + pad (int, optional): Two sided padding of signal. 
(Default: ``0``) + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. (Default: ``True``) + norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.MelSpectrogram(sample_rate) + >>> mel_specgram = transform(waveform) # (channel, n_mels, time) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_mels", "f_min"] + + def __init__( + self, + sample_rate: int = 16000, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + f_min: float = 0.0, + f_max: Optional[float] = None, + pad: int = 0, + n_mels: int = 128, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: float = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelSpectrogram, self).__init__() + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + self.pad = pad + self.power = power + self.normalized = normalized + self.n_mels = n_mels # number of mel frequency bins + self.f_max = f_max + self.f_min = f_min + self.spectrogram = Spectrogram( + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length, + pad=self.pad, + window_fn=window_fn, + power=self.power, + normalized=self.normalized, + wkwargs=wkwargs, + center=center, + pad_mode=pad_mode, + onesided=onesided, + ) + self.mel_scale = MelScale( + self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, norm, mel_scale + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + specgram = self.spectrogram(waveform) + mel_specgram = self.mel_scale(specgram) + return mel_specgram + + +class MFCC(torch.nn.Module): + r"""Create the Mel-frequency cepstrum coefficients from an audio signal. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + By default, this calculates the MFCC on the DB-scaled Mel spectrogram. + This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``) + dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``) + norm (str, optional): norm to use. (Default: ``'ortho'``) + log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``) + melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["sample_rate", "n_mfcc", "dct_type", "top_db", "log_mels"] + + def __init__( + self, + sample_rate: int = 16000, + n_mfcc: int = 40, + dct_type: int = 2, + norm: str = "ortho", + log_mels: bool = False, + melkwargs: Optional[dict] = None, + ) -> None: + super(MFCC, self).__init__() + supported_dct_types = [2] + if dct_type not in supported_dct_types: + raise ValueError("DCT type not supported: {}".format(dct_type)) + self.sample_rate = sample_rate + self.n_mfcc = n_mfcc + self.dct_type = dct_type + self.norm = norm + self.top_db = 80.0 + self.amplitude_to_DB = AmplitudeToDB("power", self.top_db) + + melkwargs = melkwargs or {} + self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs) + + if self.n_mfcc > self.MelSpectrogram.n_mels: + raise ValueError("Cannot select more MFCC coefficients than # mel bins") + dct_mat = F.create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm) + self.register_buffer("dct_mat", dct_mat) + self.log_mels = log_mels + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: specgram_mel_db of size (..., ``n_mfcc``, time). + """ + mel_specgram = self.MelSpectrogram(waveform) + if self.log_mels: + log_offset = 1e-6 + mel_specgram = torch.log(mel_specgram + log_offset) + else: + mel_specgram = self.amplitude_to_DB(mel_specgram) + + # (..., time, n_mels) dot (n_mels, n_mfcc) -> (..., n_nfcc, time) + mfcc = torch.matmul(mel_specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2) + return mfcc + + +class LFCC(torch.nn.Module): + r"""Create the linear-frequency cepstrum coefficients from an audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram. + This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_filter (int, optional): Number of linear filters to apply. (Default: ``128``) + n_lfcc (int, optional): Number of lfc coefficients to retain. (Default: ``40``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. 
(Default: ``None``) + dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``) + norm (str, optional): norm to use. (Default: ``'ortho'``) + log_lf (bool, optional): whether to use log-lf spectrograms instead of db-scaled. (Default: ``False``) + speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``) + + + See also: + :py:func:`torchaudio.functional.linear_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["sample_rate", "n_filter", "n_lfcc", "dct_type", "top_db", "log_lf"] + + def __init__( + self, + sample_rate: int = 16000, + n_filter: int = 128, + f_min: float = 0.0, + f_max: Optional[float] = None, + n_lfcc: int = 40, + dct_type: int = 2, + norm: str = "ortho", + log_lf: bool = False, + speckwargs: Optional[dict] = None, + ) -> None: + super(LFCC, self).__init__() + supported_dct_types = [2] + if dct_type not in supported_dct_types: + raise ValueError("DCT type not supported: {}".format(dct_type)) + self.sample_rate = sample_rate + self.f_min = f_min + self.f_max = f_max if f_max is not None else float(sample_rate // 2) + self.n_filter = n_filter + self.n_lfcc = n_lfcc + self.dct_type = dct_type + self.norm = norm + self.top_db = 80.0 + self.amplitude_to_DB = AmplitudeToDB("power", self.top_db) + + speckwargs = speckwargs or {} + self.Spectrogram = Spectrogram(**speckwargs) + + if self.n_lfcc > self.Spectrogram.n_fft: + raise ValueError("Cannot select more LFCC coefficients than # fft bins") + + filter_mat = F.linear_fbanks( + n_freqs=self.Spectrogram.n_fft // 2 + 1, + f_min=self.f_min, + f_max=self.f_max, + n_filter=self.n_filter, + sample_rate=self.sample_rate, + ) + self.register_buffer("filter_mat", filter_mat) + + dct_mat = F.create_dct(self.n_lfcc, self.n_filter, self.norm) + self.register_buffer("dct_mat", dct_mat) + self.log_lf = log_lf + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Linear Frequency Cepstral Coefficients of size (..., ``n_lfcc``, time). + """ + specgram = self.Spectrogram(waveform) + + # (..., time, freq) dot (freq, n_filter) -> (..., n_filter, time) + specgram = torch.matmul(specgram.transpose(-1, -2), self.filter_mat).transpose(-1, -2) + + if self.log_lf: + log_offset = 1e-6 + specgram = torch.log(specgram + log_offset) + else: + specgram = self.amplitude_to_DB(specgram) + + # (..., time, n_filter) dot (n_filter, n_lfcc) -> (..., n_lfcc, time) + lfcc = torch.matmul(specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2) + return lfcc + + +class MuLawEncoding(torch.nn.Module): + r"""Encode signal based on mu-law companding. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This algorithm assumes the signal has been scaled to between -1 and 1 and + returns a signal encoded with values from 0 to quantization_channels - 1 + + Args: + quantization_channels (int, optional): Number of channels. 
(Default: ``256``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512) + >>> mulawtrans = transform(waveform) + + """ + __constants__ = ["quantization_channels"] + + def __init__(self, quantization_channels: int = 256) -> None: + super(MuLawEncoding, self).__init__() + self.quantization_channels = quantization_channels + + def forward(self, x: Tensor) -> Tensor: + r""" + Args: + x (Tensor): A signal to be encoded. + + Returns: + Tensor: An encoded signal. + """ + return F.mu_law_encoding(x, self.quantization_channels) + + +class MuLawDecoding(torch.nn.Module): + r"""Decode mu-law encoded signal. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This expects an input with values between 0 and ``quantization_channels - 1`` + and returns a signal scaled between -1 and 1. + + Args: + quantization_channels (int, optional): Number of channels. (Default: ``256``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512) + >>> mulawtrans = transform(waveform) + """ + __constants__ = ["quantization_channels"] + + def __init__(self, quantization_channels: int = 256) -> None: + super(MuLawDecoding, self).__init__() + self.quantization_channels = quantization_channels + + def forward(self, x_mu: Tensor) -> Tensor: + r""" + Args: + x_mu (Tensor): A mu-law encoded signal which needs to be decoded. + + Returns: + Tensor: The signal decoded. + """ + return F.mu_law_decoding(x_mu, self.quantization_channels) + + +class Resample(torch.nn.Module): + r"""Resample a signal from one frequency to another. A resampling method can be given. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + If resampling on waveforms of higher precision than float32, there may be a small loss of precision + because the kernel is cached once as float32. If high precision resampling is important for your application, + the functional form will retain higher precision, but run slower because it does not cache the kernel. + Alternatively, you could rewrite a transform that caches a higher precision kernel. + + Args: + orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``) + new_freq (int, optional): The desired frequency. (Default: ``16000``) + resampling_method (str, optional): The resampling method to use. + Options: [``sinc_interpolation``, ``kaiser_window``] (Default: ``'sinc_interpolation'``) + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + beta (float or None, optional): The shape parameter used for kaiser window. + dtype (torch.device, optional): + Determnines the precision that resampling kernel is pre-computed and cached. If not provided, + kernel is computed with ``torch.float64`` then cached as ``torch.float32``. + If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and + cached as ``torch.float64``. 
If you use resample with lower precision, then instead of providing this + providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still + carried out on ``torch.float64``. + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.Resample(sample_rate, sample_rate/10) + >>> waveform = transform(waveform) + """ + + def __init__( + self, + orig_freq: int = 16000, + new_freq: int = 16000, + resampling_method: str = "sinc_interpolation", + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + beta: Optional[float] = None, + *, + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__() + + self.orig_freq = orig_freq + self.new_freq = new_freq + self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq)) + self.resampling_method = resampling_method + self.lowpass_filter_width = lowpass_filter_width + self.rolloff = rolloff + self.beta = beta + + if self.orig_freq != self.new_freq: + kernel, self.width = _get_sinc_resample_kernel( + self.orig_freq, + self.new_freq, + self.gcd, + self.lowpass_filter_width, + self.rolloff, + self.resampling_method, + beta, + dtype=dtype, + ) + self.register_buffer("kernel", kernel) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Output signal of dimension (..., time). + """ + if self.orig_freq == self.new_freq: + return waveform + return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width) + + +class ComputeDeltas(torch.nn.Module): + r"""Compute delta coefficients of a tensor, usually a spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + See `torchaudio.functional.compute_deltas` for more details. + + Args: + win_length (int, optional): The window length used for computing delta. (Default: ``5``) + mode (str, optional): Mode parameter passed to padding. (Default: ``'replicate'``) + """ + __constants__ = ["win_length"] + + def __init__(self, win_length: int = 5, mode: str = "replicate") -> None: + super(ComputeDeltas, self).__init__() + self.win_length = win_length + self.mode = mode + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): Tensor of audio of dimension (..., freq, time). + + Returns: + Tensor: Tensor of deltas of dimension (..., freq, time). + """ + return F.compute_deltas(specgram, win_length=self.win_length, mode=self.mode) + + +class TimeStretch(torch.nn.Module): + r"""Stretch stft in time without modifying pitch for a given rate. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Proposed in *SpecAugment* [:footcite:`specaugment`]. + + Args: + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + n_freq (int, optional): number of filter banks from stft. (Default: ``201``) + fixed_rate (float or None, optional): rate to speed up or slow down by. + If None is provided, rate must be passed to the forward method. (Default: ``None``) + + Example + >>> spectrogram = torchaudio.transforms.Spectrogram() + >>> stretch = torchaudio.transforms.TimeStretch() + >>> + >>> original = spectrogram(waveform) + >>> streched_1_2 = stretch(original, 1.2) + >>> streched_0_9 = stretch(original, 0.9) + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_1.png + :width: 600 + :alt: Spectrogram streched by 1.2 + + .. 
image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_2.png + :width: 600 + :alt: The original spectrogram + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_3.png + :width: 600 + :alt: Spectrogram streched by 0.9 + + """ + __constants__ = ["fixed_rate"] + + def __init__(self, hop_length: Optional[int] = None, n_freq: int = 201, fixed_rate: Optional[float] = None) -> None: + super(TimeStretch, self).__init__() + + self.fixed_rate = fixed_rate + + n_fft = (n_freq - 1) * 2 + hop_length = hop_length if hop_length is not None else n_fft // 2 + self.register_buffer("phase_advance", torch.linspace(0, math.pi * hop_length, n_freq)[..., None]) + + def forward(self, complex_specgrams: Tensor, overriding_rate: Optional[float] = None) -> Tensor: + r""" + Args: + complex_specgrams (Tensor): + A tensor of dimension `(..., freq, num_frame)` with complex dtype. + overriding_rate (float or None, optional): speed up to apply to this batch. + If no rate is passed, use ``self.fixed_rate``. (Default: ``None``) + + Returns: + Tensor: + Stretched spectrogram. The resulting tensor is of the same dtype as the input + spectrogram, but the number of frames is changed to ``ceil(num_frame / rate)``. + """ + if overriding_rate is None: + if self.fixed_rate is None: + raise ValueError("If no fixed_rate is specified, must pass a valid rate to the forward method.") + rate = self.fixed_rate + else: + rate = overriding_rate + return F.phase_vocoder(complex_specgrams, rate, self.phase_advance) + + +class Fade(torch.nn.Module): + r"""Add a fade in and/or fade out to an waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``) + fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``) + fade_shape (str, optional): Shape of fade. Must be one of: "quarter_sine", + ``"half_sine"``, ``"linear"``, ``"logarithmic"``, ``"exponential"``. + (Default: ``"linear"``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape='linear') + >>> faded_waveform = transform(waveform) + """ + + def __init__(self, fade_in_len: int = 0, fade_out_len: int = 0, fade_shape: str = "linear") -> None: + super(Fade, self).__init__() + self.fade_in_len = fade_in_len + self.fade_out_len = fade_out_len + self.fade_shape = fade_shape + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`. 
+ """ + waveform_length = waveform.size()[-1] + device = waveform.device + return self._fade_in(waveform_length, device) * self._fade_out(waveform_length, device) * waveform + + def _fade_in(self, waveform_length: int, device: torch.device) -> Tensor: + fade = torch.linspace(0, 1, self.fade_in_len, device=device) + ones = torch.ones(waveform_length - self.fade_in_len, device=device) + + if self.fade_shape == "linear": + fade = fade + + if self.fade_shape == "exponential": + fade = torch.pow(2, (fade - 1)) * fade + + if self.fade_shape == "logarithmic": + fade = torch.log10(0.1 + fade) + 1 + + if self.fade_shape == "quarter_sine": + fade = torch.sin(fade * math.pi / 2) + + if self.fade_shape == "half_sine": + fade = torch.sin(fade * math.pi - math.pi / 2) / 2 + 0.5 + + return torch.cat((fade, ones)).clamp_(0, 1) + + def _fade_out(self, waveform_length: int, device: torch.device) -> Tensor: + fade = torch.linspace(0, 1, self.fade_out_len, device=device) + ones = torch.ones(waveform_length - self.fade_out_len, device=device) + + if self.fade_shape == "linear": + fade = -fade + 1 + + if self.fade_shape == "exponential": + fade = torch.pow(2, -fade) * (1 - fade) + + if self.fade_shape == "logarithmic": + fade = torch.log10(1.1 - fade) + 1 + + if self.fade_shape == "quarter_sine": + fade = torch.sin(fade * math.pi / 2 + math.pi / 2) + + if self.fade_shape == "half_sine": + fade = torch.sin(fade * math.pi + math.pi / 2) / 2 + 0.5 + + return torch.cat((ones, fade)).clamp_(0, 1) + + +class _AxisMasking(torch.nn.Module): + r"""Apply masking to a spectrogram. + + Args: + mask_param (int): Maximum possible length of the mask. + axis (int): What dimension the mask is applied on. + iid_masks (bool): Applies iid masks to each of the examples in the batch dimension. + This option is applicable only when the input tensor is 4D. + p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0) + """ + __constants__ = ["mask_param", "axis", "iid_masks", "p"] + + def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None: + + super(_AxisMasking, self).__init__() + self.mask_param = mask_param + self.axis = axis + self.iid_masks = iid_masks + self.p = p + + def forward(self, specgram: Tensor, mask_value: float = 0.0) -> Tensor: + r""" + Args: + specgram (Tensor): Tensor of dimension `(..., freq, time)`. + mask_value (float): Value to assign to the masked columns. + + Returns: + Tensor: Masked spectrogram of dimensions `(..., freq, time)`. + """ + # if iid_masks flag marked and specgram has a batch dimension + if self.iid_masks and specgram.dim() == 4: + return F.mask_along_axis_iid(specgram, self.mask_param, mask_value, self.axis + 1, p=self.p) + else: + return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis, p=self.p) + + +class FrequencyMasking(_AxisMasking): + r"""Apply masking to a spectrogram in the frequency domain. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Proposed in *SpecAugment* [:footcite:`specaugment`]. + + Args: + freq_mask_param (int): maximum possible length of the mask. + Indices uniformly sampled from [0, freq_mask_param). + iid_masks (bool, optional): whether to apply different masks to each + example/channel in the batch. (Default: ``False``) + This option is applicable only when the input tensor is 4D. 
+ + Example + >>> spectrogram = torchaudio.transforms.Spectrogram() + >>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80) + >>> + >>> original = spectrogram(waveform) + >>> masked = masking(original) + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png + :alt: The original spectrogram + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png + :alt: The spectrogram masked along frequency axis + """ + + def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None: + super(FrequencyMasking, self).__init__(freq_mask_param, 1, iid_masks) + + +class TimeMasking(_AxisMasking): + r"""Apply masking to a spectrogram in the time domain. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Proposed in *SpecAugment* [:footcite:`specaugment`]. + + Args: + time_mask_param (int): maximum possible length of the mask. + Indices uniformly sampled from [0, time_mask_param). + iid_masks (bool, optional): whether to apply different masks to each + example/channel in the batch. (Default: ``False``) + This option is applicable only when the input tensor is 4D. + p (float, optional): maximum proportion of time steps that can be masked. + Must be within range [0.0, 1.0]. (Default: 1.0) + + Example + >>> spectrogram = torchaudio.transforms.Spectrogram() + >>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80) + >>> + >>> original = spectrogram(waveform) + >>> masked = masking(original) + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png + :alt: The original spectrogram + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png + :alt: The spectrogram masked along time axis + """ + + def __init__(self, time_mask_param: int, iid_masks: bool = False, p: float = 1.0) -> None: + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p) + + +class Vol(torch.nn.Module): + r"""Add a volume to an waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + gain (float): Interpreted according to the given gain_type: + If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio. + If ``gain_type`` = ``power``, ``gain`` is a power (voltage squared). + If ``gain_type`` = ``db``, ``gain`` is in decibels. + gain_type (str, optional): Type of gain. One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``) + """ + + def __init__(self, gain: float, gain_type: str = "amplitude"): + super(Vol, self).__init__() + self.gain = gain + self.gain_type = gain_type + + if gain_type in ["amplitude", "power"] and gain < 0: + raise ValueError("If gain_type = amplitude or power, gain must be positive.") + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`. + """ + if self.gain_type == "amplitude": + waveform = waveform * self.gain + + if self.gain_type == "db": + waveform = F.gain(waveform, self.gain) + + if self.gain_type == "power": + waveform = F.gain(waveform, 10 * math.log10(self.gain)) + + return torch.clamp(waveform, -1, 1) + + +class SlidingWindowCmn(torch.nn.Module): + r""" + Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Args: + cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) + min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start). + Only applicable if center == false, ignored if center==true (int, default = 100) + center (bool, optional): If true, use a window centered on the current frame + (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) + norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false) + """ + + def __init__( + self, cmn_window: int = 600, min_cmn_window: int = 100, center: bool = False, norm_vars: bool = False + ) -> None: + super().__init__() + self.cmn_window = cmn_window + self.min_cmn_window = min_cmn_window + self.center = center + self.norm_vars = norm_vars + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`. + + Returns: + Tensor: Tensor of spectrogram of dimension `(..., time, freq)`. + """ + cmn_specgram = F.sliding_window_cmn(specgram, self.cmn_window, self.min_cmn_window, self.center, self.norm_vars) + return cmn_specgram + + +class Vad(torch.nn.Module): + r"""Voice Activity Detector. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Attempts to trim silence and quiet background sounds from the ends of recordings of speech. + The algorithm currently uses a simple cepstral power measurement to detect voice, + so may be fooled by other things, especially music. + + The effect can trim only from the front of the audio, + so in order to trim from the back, the reverse effect must also be used. + + Args: + sample_rate (int): Sample rate of audio signal. + trigger_level (float, optional): The measurement level used to trigger activity detection. + This may need to be cahnged depending on the noise level, signal level, + and other characteristics of the input audio. (Default: 7.0) + trigger_time (float, optional): The time constant (in seconds) + used to help ignore short bursts of sound. (Default: 0.25) + search_time (float, optional): The amount of audio (in seconds) + to search for quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 1.0) + allowed_gap (float, optional): The allowed gap (in seconds) between + quiteter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 0.25) + pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve + before the trigger point and any found quieter/shorter bursts. (Default: 0.0) + boot_time (float, optional) The algorithm (internally) uses adaptive noise + estimation/reduction in order to detect the start of the wanted audio. + This option sets the time for the initial noise estimate. (Default: 0.35) + noise_up_time (float, optional) Time constant used by the adaptive noise estimator + for when the noise level is increasing. (Default: 0.1) + noise_down_time (float, optional) Time constant used by the adaptive noise estimator + for when the noise level is decreasing. (Default: 0.01) + noise_reduction_amount (float, optional) Amount of noise reduction to use in + the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35) + measure_freq (float, optional) Frequency of the algorithm’s + processing/measurements. (Default: 20.0) + measure_duration: (float or None, optional) Measurement duration. 
+ (Default: Twice the measurement period; i.e. with overlap.) + measure_smooth_time (float, optional) Time constant used to smooth + spectral measurements. (Default: 0.4) + hp_filter_freq (float, optional) "Brick-wall" frequency of high-pass filter applied + at the input to the detector algorithm. (Default: 50.0) + lp_filter_freq (float, optional) "Brick-wall" frequency of low-pass filter applied + at the input to the detector algorithm. (Default: 6000.0) + hp_lifter_freq (float, optional) "Brick-wall" frequency of high-pass lifter used + in the detector algorithm. (Default: 150.0) + lp_lifter_freq (float, optional) "Brick-wall" frequency of low-pass lifter used + in the detector algorithm. (Default: 2000.0) + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + def __init__( + self, + sample_rate: int, + trigger_level: float = 7.0, + trigger_time: float = 0.25, + search_time: float = 1.0, + allowed_gap: float = 0.25, + pre_trigger_time: float = 0.0, + boot_time: float = 0.35, + noise_up_time: float = 0.1, + noise_down_time: float = 0.01, + noise_reduction_amount: float = 1.35, + measure_freq: float = 20.0, + measure_duration: Optional[float] = None, + measure_smooth_time: float = 0.4, + hp_filter_freq: float = 50.0, + lp_filter_freq: float = 6000.0, + hp_lifter_freq: float = 150.0, + lp_lifter_freq: float = 2000.0, + ) -> None: + super().__init__() + + self.sample_rate = sample_rate + self.trigger_level = trigger_level + self.trigger_time = trigger_time + self.search_time = search_time + self.allowed_gap = allowed_gap + self.pre_trigger_time = pre_trigger_time + self.boot_time = boot_time + self.noise_up_time = noise_up_time + self.noise_down_time = noise_down_time + self.noise_reduction_amount = noise_reduction_amount + self.measure_freq = measure_freq + self.measure_duration = measure_duration + self.measure_smooth_time = measure_smooth_time + self.hp_filter_freq = hp_filter_freq + self.lp_filter_freq = lp_filter_freq + self.hp_lifter_freq = hp_lifter_freq + self.lp_lifter_freq = lp_lifter_freq + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)` + Tensor of shape `(channels, time)` is treated as a multi-channel recording + of the same event and the resulting output will be trimmed to the earliest + voice activity in any channel. + """ + return F.vad( + waveform=waveform, + sample_rate=self.sample_rate, + trigger_level=self.trigger_level, + trigger_time=self.trigger_time, + search_time=self.search_time, + allowed_gap=self.allowed_gap, + pre_trigger_time=self.pre_trigger_time, + boot_time=self.boot_time, + noise_up_time=self.noise_up_time, + noise_down_time=self.noise_down_time, + noise_reduction_amount=self.noise_reduction_amount, + measure_freq=self.measure_freq, + measure_duration=self.measure_duration, + measure_smooth_time=self.measure_smooth_time, + hp_filter_freq=self.hp_filter_freq, + lp_filter_freq=self.lp_filter_freq, + hp_lifter_freq=self.hp_lifter_freq, + lp_lifter_freq=self.lp_lifter_freq, + ) + + +class SpectralCentroid(torch.nn.Module): + r"""Compute the spectral centroid for each channel along the time axis. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The spectral centroid is defined as the weighted average of the + frequency values, weighted by their magnitude. + + Args: + sample_rate (int): Sample rate of audio signal. + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. 
(Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.SpectralCentroid(sample_rate) + >>> spectral_centroid = transform(waveform) # (channel, time) + """ + __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad"] + + def __init__( + self, + sample_rate: int, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + wkwargs: Optional[dict] = None, + ) -> None: + super(SpectralCentroid, self).__init__() + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: Spectral Centroid of size `(..., time)`. + """ + + return F.spectral_centroid( + waveform, self.sample_rate, self.pad, self.window, self.n_fft, self.hop_length, self.win_length + ) + + +class PitchShift(LazyModuleMixin, torch.nn.Module): + r"""Shift the pitch of a waveform by ``n_steps`` steps. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + sample_rate (int): Sample rate of `waveform`. + n_steps (int): The (fractional) steps to shift `waveform`. + bins_per_octave (int, optional): The number of steps per octave (Default: ``12``). + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``). + win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``). + hop_length (int or None, optional): Length of hop between STFT windows. If None, then ``win_length // 4`` + is used (Default: ``None``). + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+ + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.PitchShift(sample_rate, 4) + >>> waveform_shift = transform(waveform) # (channel, time) + """ + __constants__ = ["sample_rate", "n_steps", "bins_per_octave", "n_fft", "win_length", "hop_length"] + + kernel: UninitializedParameter + width: int + + def __init__( + self, + sample_rate: int, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window_fn: Callable[..., Tensor] = torch.hann_window, + wkwargs: Optional[dict] = None, + ) -> None: + super().__init__() + self.n_steps = n_steps + self.bins_per_octave = bins_per_octave + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 4 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + self.orig_freq = int(sample_rate / rate) + self.gcd = math.gcd(int(self.orig_freq), int(sample_rate)) + + if self.orig_freq != sample_rate: + self.width = -1 + self.kernel = UninitializedParameter(device=None, dtype=None) + + def initialize_parameters(self, input): + if self.has_uninitialized_params(): + if self.orig_freq != self.sample_rate: + with torch.no_grad(): + kernel, self.width = _get_sinc_resample_kernel( + self.orig_freq, + self.sample_rate, + self.gcd, + dtype=input.dtype, + device=input.device, + ) + self.kernel.materialize(kernel.shape) + self.kernel.copy_(kernel) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: The pitch-shifted audio of shape `(..., time)`. + """ + shape = waveform.size() + + waveform_stretch = _stretch_waveform( + waveform, + self.n_steps, + self.bins_per_octave, + self.n_fft, + self.win_length, + self.hop_length, + self.window, + ) + + if self.orig_freq != self.sample_rate: + waveform_shift = _apply_sinc_resample_kernel( + waveform_stretch, + self.orig_freq, + self.sample_rate, + self.gcd, + self.kernel, + self.width, + ) + else: + waveform_shift = waveform_stretch + + return _fix_waveform_shape( + waveform_shift, + shape, + ) + + +class RNNTLoss(torch.nn.Module): + """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* + [:footcite:`graves2012sequence`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output + dependencies. + + Args: + blank (int, optional): blank label (Default: ``-1``) + clamp (float, optional): clamp for gradients (Default: ``-1``) + reduction (string, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. 
(Default: ``'mean'``) + + Example + >>> # Hypothetical values + >>> logits = torch.tensor([[[[0.1, 0.6, 0.1, 0.1, 0.1], + >>> [0.1, 0.1, 0.6, 0.1, 0.1], + >>> [0.1, 0.1, 0.2, 0.8, 0.1]], + >>> [[0.1, 0.6, 0.1, 0.1, 0.1], + >>> [0.1, 0.1, 0.2, 0.1, 0.1], + >>> [0.7, 0.1, 0.2, 0.1, 0.1]]]], + >>> dtype=torch.float32, + >>> requires_grad=True) + >>> targets = torch.tensor([[1, 2]], dtype=torch.int) + >>> logit_lengths = torch.tensor([2], dtype=torch.int) + >>> target_lengths = torch.tensor([2], dtype=torch.int) + >>> transform = transforms.RNNTLoss(blank=0) + >>> loss = transform(logits, targets, logit_lengths, target_lengths) + >>> loss.backward() + """ + + def __init__( + self, + blank: int = -1, + clamp: float = -1.0, + reduction: str = "mean", + ): + super().__init__() + self.blank = blank + self.clamp = clamp + self.reduction = reduction + + def forward( + self, + logits: Tensor, + targets: Tensor, + logit_lengths: Tensor, + target_lengths: Tensor, + ): + """ + Args: + logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)` + containing output from joiner + targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padded + logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder + target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence + Returns: + Tensor: Loss with the reduction option applied. If ``reduction`` is ``'none'``, then size (batch), + otherwise scalar. + """ + return F.rnnt_loss(logits, targets, logit_lengths, target_lengths, self.blank, self.clamp, self.reduction) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/utils/download.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..c4a3a062ddb96d7988bef4a5c8bbdf886f51a92d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/utils/download.py @@ -0,0 +1,89 @@ +import hashlib +import logging +from os import PathLike +from pathlib import Path +from typing import Union + +import torch + + +_LG = logging.getLogger(__name__) + + +def _get_local_path(key): + path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key) + path.parent.mkdir(parents=True, exist_ok=True) + return path + + +def _download(key, path, progress): + url = f"https://download.pytorch.org/torchaudio/{key}" + torch.hub.download_url_to_file(url, path, progress=progress) + + +def _get_hash(path, hash, chunk_size=1028): + m = hashlib.sha256() + with open(path, "rb") as file: + data = file.read(chunk_size) + while data: + m.update(data) + data = file.read(chunk_size) + return m.hexdigest() + + +def download_asset( + key: str, + hash: str = "", + path: Union[str, PathLike] = "", + *, + progress: bool = True, +) -> str: + """Download and store torchaudio assets to local file system. + + If a file exists at the download path, then that path is returned with or without + hash validation. + + Args: + key (str): The asset identifier. + hash (str, optional): + The value of SHA256 hash of the asset. If provided, it is used to verify + the downloaded / cached object. If not provided, then no hash validation + is performed. This means if a file exists at the download path, then the path + is returned as-is without verifying the identity of the file. 
+ path (path-like object, optional): + By default, the downloaded asset is saved in a directory under + :py:func:`torch.hub.get_dir` and intermediate directories based on the given `key` + are created. + This argument can be used to override the target location. + When this argument is provided, all the intermediate directories have to be + created beforehand. + progress (bool): Whether to show progress bar for downloading. Default: ``True``. + + Note: + Currently the valid key values are the routes on ``download.pytorch.org/torchaudio``, + but this is an implementation detail. + + Returns: + str: The path to the asset on the local file system. + """ + # Normalize a user-provided str / PathLike to Path so that path.exists() works below. + path = Path(path) if path else _get_local_path(key) + + if path.exists(): + _LG.info("The local file (%s) exists. Skipping the download.", path) + else: + _LG.info("Downloading %s to %s", key, path) + _download(key, path, progress=progress) + + if hash: + _LG.info("Verifying the hash value.") + digest = _get_hash(path, hash) + + if digest != hash: + raise ValueError( + f"The hash value of the downloaded file ({path}), '{digest}' does not match " + f"the provided hash value, '{hash}'." + ) + + _LG.info("Hash validated.") + + return str(path)
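
Editor's note: ``download_asset`` is the only public entry point in the new ``torchaudio/utils/download.py`` module shown above. The following minimal sketch illustrates how it is meant to be called, based solely on the signature in the diff; the asset key is a hypothetical placeholder, and passing ``hash`` triggers SHA256 verification of the cached or downloaded file.

# Minimal usage sketch for download_asset (hypothetical asset key).
import torchaudio
from torchaudio.utils.download import download_asset

# With no explicit path, the file is cached under torch.hub.get_dir()/torchaudio/<key>.
local_path = download_asset("some/example-asset.wav")  # placeholder key

waveform, sample_rate = torchaudio.load(local_path)

# Supplying a known SHA256 digest makes the helper verify the file and
# raise ValueError if the digest does not match:
# local_path = download_asset("some/example-asset.wav", hash="<sha256 hex digest>")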
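Editor's note: the transforms documented in this diff (``Vol``, ``Vad``, ``SlidingWindowCmn``, ``SpectralCentroid``) are ordinary ``torch.nn.Module`` objects, so they compose directly on waveforms. The sketch below is an illustrative pipeline, not part of the diff: the input file name is a placeholder, only constructor arguments that appear above are used, and trailing silence is trimmed by reversing the signal, as the ``Vad`` docstring suggests.

# Illustrative pipeline over the transforms defined above; "speech.wav" is a placeholder path.
import torch
import torchaudio
import torchaudio.transforms as T

waveform, sample_rate = torchaudio.load("speech.wav")

# Boost the level by 6 dB; Vol clamps the result to [-1, 1].
waveform = T.Vol(gain=6.0, gain_type="db")(waveform)

# Vad trims only leading silence, so trailing silence is removed by
# reversing the signal, applying Vad again, and reversing back.
vad = T.Vad(sample_rate=sample_rate)
waveform = vad(waveform)
waveform = torch.flip(vad(torch.flip(waveform, [-1])), [-1])

# Per-frame spectral centroid, shape (..., time).
centroid = T.SpectralCentroid(sample_rate)(waveform)

# SlidingWindowCmn expects (..., time, freq), so the (..., freq, time)
# spectrogram is transposed before normalization.
specgram = T.Spectrogram()(waveform)
normalized = T.SlidingWindowCmn(cmn_window=600, norm_vars=True)(specgram.transpose(-1, -2))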