diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bff3e84c92533c81f8656b0ec6e02e28c2eb2ac8
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab2b84f781a5805ca2a9617e4f28512a4e7762a2
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b0b60c89c44732b8c5df488636203517cc7e059
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3057100b71db367709ab45d20946a4b5b469b45
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30e518b4ddd4c0c5bff7c9ca7a11dcf926f8d497
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27b60e55d23a1e168aa42e5ad206cb37a8ebd241
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab641f63f9a4da8127853556ef9f76e2de0f6f74
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15381271e2e329d524bb660fdc3c7c5b45381b73
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..774ca125462d17c27e0b26513e27711223aec7a1
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..686640240c2e2ea967859cafaae1d886db7aeb67
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4c8f63a1b058cc2e6443955f40db394808d9593
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e245cfcbe7ba5a83692bbd61ab0ef1b5bde04776
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab2a3b414508d7deb429e2cc43ef35e129c7c6f1
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76ea1aa70f35faac6b84b170a70a89fbb0a3f34a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b0ee8536e03db74fef621c929e34c203db55d82
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..703a3c9b8d5ac82e3d4f0e96eff54cf01f8e0324
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ab4677275f9453ed463c14de16103fa59611637
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b22cd5ca6461eeb9d7bcacbaaa839689b0daf30a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2fc2f2b796a4a23edf837db0474ba087683a864c
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c0a4e0f8b09e3c7e003d928a918479591186392
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b25e662f2159370fdbbeef584846a6652000f338
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b39b81d99e78e62b90b957e970c20daf394668f5
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd371dd65f59c160b442ac4dbb7a7df2e9a28dbe
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f04efb03c684f1890a276cbbd46277ff2588a07a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97a6c10d0016dc7528f07d83b15b3a538b9dbb22
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20f227b581f4d62f01a8f7f48ce9c3d0bcda6c12
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..027984b0bc83ed5b36ed28d5b7d8fcd7785212f0
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..421c65dc1f589b47a6d9811f2d45ae02e2beadc4
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..39070d91afe6bcdb6fc31412369dae6964d4b8ba
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..519877130354fe2d7e9476ad24ec698492d0d3d9
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c091329dfebd8011d45c9f8c67c06b3cb1d0d512
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b87d2ac7969aaaf6ac406a3e4e81442e6befef4
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff886c81a72601acf45558dfe8b26281a534d173
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bd82e71bf7d72da7db030381c31f769a3d9736bb
--- /dev/null
+++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:400917cf30e6b664f7b0da93d7c745860d3aa9008da8b7f160d2dd12e6a318b1
+size 22845
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..67fc6c403643c5b4e0624005b7bd99ac59e856fd
--- /dev/null
+++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0244e03291702df45024dcb5cacbc4f3d4cb30d72dfa7fd371c4ac61c42b4fbf
+size 45224
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e5ca977bd8c96a361a162b772ba69d431fc711a
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c09334a52431c675f071cda8cb80e9c2bc29d58
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3822dfcbe87f6193460c276007eaabb78c75af5
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8113682b5a49357301713fee035101fd6c96a4bd
Binary files /dev/null and b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc differ
diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65579b4f01ba09695860717f1e6cd90d6e42b631
--- /dev/null
+++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py
@@ -0,0 +1,5 @@
+from .
import kaldi + +__all__ = [ + "kaldi", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py new file mode 100644 index 0000000000000000000000000000000000000000..12092d90d1dcbd634ece7dc5a0693b9a2aaf0c5f --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py @@ -0,0 +1,815 @@ +import math +from typing import Tuple + +import torch +import torchaudio +from torch import Tensor + +__all__ = [ + "get_mel_banks", + "inverse_mel_scale", + "inverse_mel_scale_scalar", + "mel_scale", + "mel_scale_scalar", + "spectrogram", + "fbank", + "mfcc", + "vtln_warp_freq", + "vtln_warp_mel_freq", +] + +# numeric_limits::epsilon() 1.1920928955078125e-07 +EPSILON = torch.tensor(torch.finfo(torch.float).eps) +# 1 milliseconds = 0.001 seconds +MILLISECONDS_TO_SECONDS = 0.001 + +# window types +HAMMING = "hamming" +HANNING = "hanning" +POVEY = "povey" +RECTANGULAR = "rectangular" +BLACKMAN = "blackman" +WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN] + + +def _get_epsilon(device, dtype): + return EPSILON.to(device=device, dtype=dtype) + + +def _next_power_of_2(x: int) -> int: + r"""Returns the smallest power of 2 that is greater than x""" + return 1 if x == 0 else 2 ** (x - 1).bit_length() + + +def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor: + r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) + representing how the window is shifted along the waveform. Each row is a frame. + + Args: + waveform (Tensor): Tensor of size ``num_samples`` + window_size (int): Frame length + window_shift (int): Frame shift + snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. 
+ + Returns: + Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame + """ + assert waveform.dim() == 1 + num_samples = waveform.size(0) + strides = (window_shift * waveform.stride(0), waveform.stride(0)) + + if snip_edges: + if num_samples < window_size: + return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device) + else: + m = 1 + (num_samples - window_size) // window_shift + else: + reversed_waveform = torch.flip(waveform, [0]) + m = (num_samples + (window_shift // 2)) // window_shift + pad = window_size // 2 - window_shift // 2 + pad_right = reversed_waveform + if pad > 0: + # torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect' + # but we want [2, 1, 0, 0, 1, 2] + pad_left = reversed_waveform[-pad:] + waveform = torch.cat((pad_left, waveform, pad_right), dim=0) + else: + # pad is negative so we want to trim the waveform at the front + waveform = torch.cat((waveform[-pad:], pad_right), dim=0) + + sizes = (m, window_size) + return waveform.as_strided(sizes, strides) + + +def _feature_window_function( + window_type: str, + window_size: int, + blackman_coeff: float, + device: torch.device, + dtype: int, +) -> Tensor: + r"""Returns a window function with the given type and size""" + if window_type == HANNING: + return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype) + elif window_type == HAMMING: + return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype) + elif window_type == POVEY: + # like hanning but goes to zero at edges + return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85) + elif window_type == RECTANGULAR: + return torch.ones(window_size, device=device, dtype=dtype) + elif window_type == BLACKMAN: + a = 2 * math.pi / (window_size - 1) + window_function = torch.arange(window_size, device=device, dtype=dtype) + # can't use torch.blackman_window as they use different coefficients + return ( + blackman_coeff + - 0.5 * torch.cos(a * window_function) + + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function) + ).to(device=device, dtype=dtype) + else: + raise Exception("Invalid window type " + window_type) + + +def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor: + r"""Returns the log energy of size (m) for a strided_input (m,*)""" + device, dtype = strided_input.device, strided_input.dtype + log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m) + if energy_floor == 0.0: + return log_energy + return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype)) + + +def _get_waveform_and_window_properties( + waveform: Tensor, + channel: int, + sample_frequency: float, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float, +) -> Tuple[Tensor, int, int, int]: + r"""Gets the waveform and window properties""" + channel = max(channel, 0) + assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0)) + waveform = waveform[channel, :] # size (n) + window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS) + window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS) + padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size + + assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format( + window_size, len(waveform) + ) + assert 0 < window_shift, 
"`window_shift` must be greater than 0" + assert padded_window_size % 2 == 0, ( + "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`" + ) + assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]" + assert sample_frequency > 0, "`sample_frequency` must be greater than zero" + return waveform, window_shift, window_size, padded_window_size + + +def _get_window( + waveform: Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float, +) -> Tuple[Tensor, Tensor]: + r"""Gets a window and its log energy + + Returns: + (Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m) + """ + device, dtype = waveform.device, waveform.dtype + epsilon = _get_epsilon(device, dtype) + + # size (m, window_size) + strided_input = _get_strided(waveform, window_size, window_shift, snip_edges) + + if dither != 0.0: + # Returns a random number strictly between 0 and 1 + x = torch.max(epsilon, torch.rand(strided_input.shape, device=device, dtype=dtype)) + rand_gauss = torch.sqrt(-2 * x.log()) * torch.cos(2 * math.pi * x) + strided_input = strided_input + rand_gauss * dither + + if remove_dc_offset: + # Subtract each row/frame by its mean + row_means = torch.mean(strided_input, dim=1).unsqueeze(1) # size (m, 1) + strided_input = strided_input - row_means + + if raw_energy: + # Compute the log energy of each row/frame before applying preemphasis and + # window function + signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m) + + if preemphasis_coefficient != 0.0: + # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j + offset_strided_input = torch.nn.functional.pad(strided_input.unsqueeze(0), (1, 0), mode="replicate").squeeze( + 0 + ) # size (m, window_size + 1) + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1] + + # Apply window_function to each row/frame + window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze( + 0 + ) # size (1, window_size) + strided_input = strided_input * window_function # size (m, window_size) + + # Pad columns with zero until we reach size (m, padded_window_size) + if padded_window_size != window_size: + padding_right = padded_window_size - window_size + strided_input = torch.nn.functional.pad( + strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0 + ).squeeze(0) + + # Compute energy after window function (not the raw one) + if not raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m) + + return strided_input, signal_log_energy + + +def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: + # subtracts the column mean of the tensor size (m, n) if subtract_mean=True + # it returns size (m, n) + if subtract_mean: + col_means = torch.mean(tensor, dim=0).unsqueeze(0) + tensor = tensor - col_means + return tensor + + +def spectrogram( + waveform: Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + min_duration: float = 0.0, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + 
remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + window_type: str = POVEY, +) -> Tensor: + r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's + compute-spectrogram-feats. + + Args: + waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``) + channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``) + energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits::epsilon(). (Default: ``1.0``) + frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``) + min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') + (Default: ``'povey'``) + + Returns: + Tensor: A spectrogram identical to what Kaldi would output. 
The shape is + (m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided + """ + device, dtype = waveform.device, waveform.dtype + epsilon = _get_epsilon(device, dtype) + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient + ) + + if len(waveform) < min_duration * sample_frequency: + # signal is too short + return torch.empty(0) + + strided_input, signal_log_energy = _get_window( + waveform, + padded_window_size, + window_size, + window_shift, + window_type, + blackman_coeff, + snip_edges, + raw_energy, + energy_floor, + dither, + remove_dc_offset, + preemphasis_coefficient, + ) + + # size (m, padded_window_size // 2 + 1, 2) + fft = torch.fft.rfft(strided_input) + + # Convert the FFT into a power spectrum + power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log() # size (m, padded_window_size // 2 + 1) + power_spectrum[:, 0] = signal_log_energy + + power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) + return power_spectrum + + +def inverse_mel_scale_scalar(mel_freq: float) -> float: + return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) + + +def inverse_mel_scale(mel_freq: Tensor) -> Tensor: + return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) + + +def mel_scale_scalar(freq: float) -> float: + return 1127.0 * math.log(1.0 + freq / 700.0) + + +def mel_scale(freq: Tensor) -> Tensor: + return 1127.0 * (1.0 + freq / 700.0).log() + + +def vtln_warp_freq( + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: Tensor, +) -> Tensor: + r"""This computes a VTLN warping function that is not the same as HTK's one, + but has similar inputs (this function has the advantage of never producing + empty bins). + + This function computes a warp function F(freq), defined between low_freq + and high_freq inclusive, with the following properties: + F(low_freq) == low_freq + F(high_freq) == high_freq + The function is continuous and piecewise linear with two inflection + points. + The lower inflection point (measured in terms of the unwarped + frequency) is at frequency l, determined as described below. + The higher inflection point is at a frequency h, determined as + described below. + If l <= f <= h, then F(f) = f/vtln_warp_factor. + If the higher inflection point (measured in terms of the unwarped + frequency) is at h, then max(h, F(h)) == vtln_high_cutoff. + Since (by the last point) F(h) == h/vtln_warp_factor, then + max(h, h/vtln_warp_factor) == vtln_high_cutoff, so + h = vtln_high_cutoff / max(1, 1/vtln_warp_factor). + = vtln_high_cutoff * min(1, vtln_warp_factor). 
+ If the lower inflection point (measured in terms of the unwarped + frequency) is at l, then min(l, F(l)) == vtln_low_cutoff + This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor) + = vtln_low_cutoff * max(1, vtln_warp_factor) + Args: + vtln_low_cutoff (float): Lower frequency cutoffs for VTLN + vtln_high_cutoff (float): Upper frequency cutoffs for VTLN + low_freq (float): Lower frequency cutoffs in mel computation + high_freq (float): Upper frequency cutoffs in mel computation + vtln_warp_factor (float): Vtln warp factor + freq (Tensor): given frequency in Hz + + Returns: + Tensor: Freq after vtln warp + """ + assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq" + assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]" + l = vtln_low_cutoff * max(1.0, vtln_warp_factor) + h = vtln_high_cutoff * min(1.0, vtln_warp_factor) + scale = 1.0 / vtln_warp_factor + Fl = scale * l # F(l) + Fh = scale * h # F(h) + assert l > low_freq and h < high_freq + # slope of left part of the 3-piece linear function + scale_left = (Fl - low_freq) / (l - low_freq) + # [slope of center part is just "scale"] + + # slope of right part of the 3-piece linear function + scale_right = (high_freq - Fh) / (high_freq - h) + + res = torch.empty_like(freq) + + outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq + before_l = torch.lt(freq, l) # freq < l + before_h = torch.lt(freq, h) # freq < h + after_h = torch.ge(freq, h) # freq >= h + + # order of operations matter here (since there is overlapping frequency regions) + res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) + res[before_h] = scale * freq[before_h] + res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) + res[outside_low_high_freq] = freq[outside_low_high_freq] + + return res + + +def vtln_warp_mel_freq( + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: Tensor, +) -> Tensor: + r""" + Args: + vtln_low_cutoff (float): Lower frequency cutoffs for VTLN + vtln_high_cutoff (float): Upper frequency cutoffs for VTLN + low_freq (float): Lower frequency cutoffs in mel computation + high_freq (float): Upper frequency cutoffs in mel computation + vtln_warp_factor (float): Vtln warp factor + mel_freq (Tensor): Given frequency in Mel + + Returns: + Tensor: ``mel_freq`` after vtln warp + """ + return mel_scale( + vtln_warp_freq( + vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq) + ) + ) + + +def get_mel_banks( + num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float, +) -> Tuple[Tensor, Tensor]: + """ + Returns: + (Tensor, Tensor): The tuple consists of ``bins`` (which is + melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is + center frequencies of bins of size (``num_bins``)). + """ + assert num_bins > 3, "Must have at least 3 mel bins" + assert window_length_padded % 2 == 0 + num_fft_bins = window_length_padded / 2 + nyquist = 0.5 * sample_freq + + if high_freq <= 0.0: + high_freq += nyquist + + assert ( + (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq) + ), "Bad values in options: low-freq {} and high-freq {} vs. 
nyquist {}".format(low_freq, high_freq, nyquist) + + # fft-bin width [think of it as Nyquist-freq / half-window-length] + fft_bin_width = sample_freq / window_length_padded + mel_low_freq = mel_scale_scalar(low_freq) + mel_high_freq = mel_scale_scalar(high_freq) + + # divide by num_bins+1 in next line because of end-effects where the bins + # spread out to the sides. + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + if vtln_high < 0.0: + vtln_high += nyquist + + assert vtln_warp_factor == 1.0 or ( + (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high) + ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format( + vtln_low, vtln_high, low_freq, high_freq + ) + + bin = torch.arange(num_bins).unsqueeze(1) + left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # size(num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1) + + if vtln_warp_factor != 1.0: + left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel) + center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel) + right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel) + + center_freqs = inverse_mel_scale(center_mel) # size (num_bins) + # size(1, num_fft_bins) + mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0) + + # size (num_bins, num_fft_bins) + up_slope = (mel - left_mel) / (center_mel - left_mel) + down_slope = (right_mel - mel) / (right_mel - center_mel) + + if vtln_warp_factor == 1.0: + # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values + bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope)) + else: + # warping can move the order of left_mel, center_mel, right_mel anywhere + bins = torch.zeros_like(up_slope) + up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel) # left_mel < mel <= center_mel + down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel) # center_mel < mel < right_mel + bins[up_idx] = up_slope[up_idx] + bins[down_idx] = down_slope[down_idx] + + return bins, center_freqs + + +def fbank( + waveform: Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + use_log_fbank: bool = True, + use_power: bool = True, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> Tensor: + r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's + compute-fbank-feats. + + Args: + waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. 
(Default: ``0.42``) + channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``) + energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits::epsilon(). (Default: ``1.0``) + frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``) + high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) + (Default: ``0.0``) + htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features + (need to change other parameters). (Default: ``False``) + low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``) + min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``) + preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``) + use_log_fbank (bool, optional):If true, produce log-filterbank, else produce linear. (Default: ``True``) + use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``) + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if + negative, offset from high-mel-freq (Default: ``-500.0``) + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``) + vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``) + window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') + (Default: ``'povey'``) + + Returns: + Tensor: A fbank identical to what Kaldi would output. 
The shape is (m, ``num_mel_bins + use_energy``) + where m is calculated in _get_strided + """ + device, dtype = waveform.device, waveform.dtype + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient + ) + + if len(waveform) < min_duration * sample_frequency: + # signal is too short + return torch.empty(0, device=device, dtype=dtype) + + # strided_input, size (m, padded_window_size) and signal_log_energy, size (m) + strided_input, signal_log_energy = _get_window( + waveform, + padded_window_size, + window_size, + window_shift, + window_type, + blackman_coeff, + snip_edges, + raw_energy, + energy_floor, + dither, + remove_dc_offset, + preemphasis_coefficient, + ) + + # size (m, padded_window_size // 2 + 1) + spectrum = torch.fft.rfft(strided_input).abs() + if use_power: + spectrum = spectrum.pow(2.0) + + # size (num_mel_bins, padded_window_size // 2) + mel_energies, _ = get_mel_banks( + num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp + ) + mel_energies = mel_energies.to(device=device, dtype=dtype) + + # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1) + mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0) + + # sum with mel fiterbanks over the power spectrum, size (m, num_mel_bins) + mel_energies = torch.mm(spectrum, mel_energies.T) + if use_log_fbank: + # avoid log of zero (which should be prevented anyway by dithering) + mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log() + + # if use_energy then add it as the last column for htk_compat == true else first column + if use_energy: + signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1) + # returns size (m, num_mel_bins + 1) + if htk_compat: + mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1) + else: + mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1) + + mel_energies = _subtract_column_mean(mel_energies, subtract_mean) + return mel_energies + + +def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor: + # returns a dct matrix of size (num_mel_bins, num_ceps) + # size (num_mel_bins, num_mel_bins) + dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho") + # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins) + # this would be the first column in the dct_matrix for torchaudio as it expects a + # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi + # expects a left multiply e.g. dct_matrix * vector). + dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins)) + dct_matrix = dct_matrix[:, :num_ceps] + return dct_matrix + + +def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor: + # returns size (num_ceps) + # Compute liftering coefficients (scaling on cepstral coeffs) + # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected. 
+ i = torch.arange(num_ceps) + return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter) + + +def mfcc( + waveform: Tensor, + blackman_coeff: float = 0.42, + cepstral_lifter: float = 22.0, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + num_ceps: int = 13, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> Tensor: + r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's + compute-mfcc-feats. + + Args: + waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``) + cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``) + channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``) + energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits::epsilon(). (Default: ``1.0``) + frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``) + high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) + (Default: ``0.0``) + htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible + features (need to change other parameters). (Default: ``False``) + low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``) + num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``) + min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``) + preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. 
If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``) + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if + negative, offset from high-mel-freq (Default: ``-500.0``) + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``) + vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``) + window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') + (Default: ``"povey"``) + + Returns: + Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``) + where m is calculated in _get_strided + """ + assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins) + + device, dtype = waveform.device, waveform.dtype + + # The mel_energies should not be squared (use_power=True), not have mean subtracted + # (subtract_mean=False), and use log (use_log_fbank=True). + # size (m, num_mel_bins + use_energy) + feature = fbank( + waveform=waveform, + blackman_coeff=blackman_coeff, + channel=channel, + dither=dither, + energy_floor=energy_floor, + frame_length=frame_length, + frame_shift=frame_shift, + high_freq=high_freq, + htk_compat=htk_compat, + low_freq=low_freq, + min_duration=min_duration, + num_mel_bins=num_mel_bins, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + round_to_power_of_two=round_to_power_of_two, + sample_frequency=sample_frequency, + snip_edges=snip_edges, + subtract_mean=False, + use_energy=use_energy, + use_log_fbank=True, + use_power=True, + vtln_high=vtln_high, + vtln_low=vtln_low, + vtln_warp=vtln_warp, + window_type=window_type, + ) + + if use_energy: + # size (m) + signal_log_energy = feature[:, num_mel_bins if htk_compat else 0] + # offset is 0 if htk_compat==True else 1 + mel_offset = int(not htk_compat) + feature = feature[:, mel_offset : (num_mel_bins + mel_offset)] + + # size (num_mel_bins, num_ceps) + dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device) + + # size (m, num_ceps) + feature = feature.matmul(dct_matrix) + + if cepstral_lifter != 0.0: + # size (1, num_ceps) + lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0) + feature *= lifter_coeffs.to(device=device, dtype=dtype) + + # if use_energy then replace the last column for htk_compat == true else first column + if use_energy: + feature[:, 0] = signal_log_energy + + if htk_compat: + energy = feature[:, 0].unsqueeze(1) # size (m, 1) + feature = feature[:, 1:] # size (m, num_ceps - 1) + if not use_energy: + # scale on C0 (actually removing a scale we previously added that's + # part of one common definition of the cosine transform.) 
+ energy *= math.sqrt(2) + + feature = torch.cat((feature, energy), dim=1) + + feature = _subtract_column_mean(feature, subtract_mean) + return feature diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5946e809653c916bbee7cfad330ed50cefe3447 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py @@ -0,0 +1,34 @@ +from .cmuarctic import CMUARCTIC +from .cmudict import CMUDict +from .commonvoice import COMMONVOICE +from .dr_vctk import DR_VCTK +from .gtzan import GTZAN +from .librilight_limited import LibriLightLimited +from .librimix import LibriMix +from .librispeech import LIBRISPEECH +from .libritts import LIBRITTS +from .ljspeech import LJSPEECH +from .quesst14 import QUESST14 +from .speechcommands import SPEECHCOMMANDS +from .tedlium import TEDLIUM +from .vctk import VCTK_092 +from .yesno import YESNO + + +__all__ = [ + "COMMONVOICE", + "LIBRISPEECH", + "LibriLightLimited", + "SPEECHCOMMANDS", + "VCTK_092", + "DR_VCTK", + "YESNO", + "LJSPEECH", + "GTZAN", + "CMUARCTIC", + "CMUDict", + "LibriMix", + "LIBRITTS", + "TEDLIUM", + "QUESST14", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py new file mode 100644 index 0000000000000000000000000000000000000000..6a1227b0151f28fe3cf82e54883ffef5e15a21b6 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py @@ -0,0 +1,148 @@ +import csv +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "aew" +FOLDER_IN_ARCHIVE = "ARCTIC" +_CHECKSUMS = { + "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2": "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2": "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2": "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2": "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2": "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2": "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2": "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2": "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2": "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2", # noqa: E501 + 
"http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2": "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2": "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2": "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2": "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2": "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2": "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2": "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2": "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1", # noqa: E501 + "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2": "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea", # noqa: E501 +} + + +def load_cmuarctic_item(line: str, path: str, folder_audio: str, ext_audio: str) -> Tuple[Tensor, int, str, str]: + + utterance_id, transcript = line[0].strip().split(" ", 2)[1:] + + # Remove space, double quote, and single parenthesis from transcript + transcript = transcript[1:-3] + + file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio) + + # Load audio + waveform, sample_rate = torchaudio.load(file_audio) + + return (waveform, sample_rate, transcript, utterance_id.split("_")[1]) + + +class CMUARCTIC(Dataset): + """Create a Dataset for *CMU ARCTIC* [:footcite:`Kominek03cmuarctic`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): + The URL to download the dataset from or the type of the dataset to download. + (default: ``"aew"``) + Allowed type values are ``"aew"``, ``"ahw"``, ``"aup"``, ``"awb"``, ``"axb"``, ``"bdl"``, + ``"clb"``, ``"eey"``, ``"fem"``, ``"gka"``, ``"jmk"``, ``"ksp"``, ``"ljm"``, ``"lnh"``, + ``"rms"``, ``"rxr"``, ``"slp"`` or ``"slt"``. + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"ARCTIC"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ """ + + _file_text = "txt.done.data" + _folder_text = "etc" + _ext_audio = ".wav" + _folder_audio = "wav" + + def __init__( + self, root: Union[str, Path], url: str = URL, folder_in_archive: str = FOLDER_IN_ARCHIVE, download: bool = False + ) -> None: + + if url in [ + "aew", + "ahw", + "aup", + "awb", + "axb", + "bdl", + "clb", + "eey", + "fem", + "gka", + "jmk", + "ksp", + "ljm", + "lnh", + "rms", + "rxr", + "slp", + "slt", + ]: + + url = "cmu_us_" + url + "_arctic" + ext_archive = ".tar.bz2" + base_url = "http://www.festvox.org/cmu_arctic/packed/" + + url = os.path.join(base_url, url + ext_archive) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + root = os.path.join(root, folder_in_archive) + if not os.path.isdir(root): + os.mkdir(root) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + + self._path = os.path.join(root, basename) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + self._text = os.path.join(self._path, self._folder_text, self._file_text) + + with open(self._text, "r") as text: + walker = csv.reader(text, delimiter="\n") + self._walker = list(walker) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str): ``(waveform, sample_rate, transcript, utterance_id)`` + """ + line = self._walker[n] + return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py new file mode 100644 index 0000000000000000000000000000000000000000..cd17153409f813997a63eef42aea0f61165c72cc --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py @@ -0,0 +1,183 @@ +import os +import re +from pathlib import Path +from typing import Iterable, List, Tuple, Union + +from torch.hub import download_url_to_file +from torch.utils.data import Dataset + +_CHECKSUMS = { + "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4", # noqa: E501 + "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027", # noqa: E501 +} +_PUNCTUATIONS = set( + [ + "!EXCLAMATION-POINT", + '"CLOSE-QUOTE', + '"DOUBLE-QUOTE', + '"END-OF-QUOTE', + '"END-QUOTE', + '"IN-QUOTES', + '"QUOTE', + '"UNQUOTE', + "#HASH-MARK", + "#POUND-SIGN", + "#SHARP-SIGN", + "%PERCENT", + "&ERSAND", + "'END-INNER-QUOTE", + "'END-QUOTE", + "'INNER-QUOTE", + "'QUOTE", + "'SINGLE-QUOTE", + "(BEGIN-PARENS", + "(IN-PARENTHESES", + "(LEFT-PAREN", + "(OPEN-PARENTHESES", + "(PAREN", + "(PARENS", + "(PARENTHESES", + ")CLOSE-PAREN", + ")CLOSE-PARENTHESES", + ")END-PAREN", + ")END-PARENS", + ")END-PARENTHESES", + ")END-THE-PAREN", + ")PAREN", + 
")PARENS", + ")RIGHT-PAREN", + ")UN-PARENTHESES", + "+PLUS", + ",COMMA", + "--DASH", + "-DASH", + "-HYPHEN", + "...ELLIPSIS", + ".DECIMAL", + ".DOT", + ".FULL-STOP", + ".PERIOD", + ".POINT", + "/SLASH", + ":COLON", + ";SEMI-COLON", + ";SEMI-COLON(1)", + "?QUESTION-MARK", + "{BRACE", + "{LEFT-BRACE", + "{OPEN-BRACE", + "}CLOSE-BRACE", + "}RIGHT-BRACE", + ] +) + + +def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[str]: + _alt_re = re.compile(r"\([0-9]+\)") + cmudict: List[Tuple[str, List[str]]] = list() + for line in lines: + if not line or line.startswith(";;;"): # ignore comments + continue + + word, phones = line.strip().split(" ") + if word in _PUNCTUATIONS: + if exclude_punctuations: + continue + # !EXCLAMATION-POINT -> ! + # --DASH -> -- + # ...ELLIPSIS -> ... + if word.startswith("..."): + word = "..." + elif word.startswith("--"): + word = "--" + else: + word = word[0] + + # if a word have multiple pronunciations, there will be (number) appended to it + # for example, DATAPOINTS and DATAPOINTS(1), + # the regular expression `_alt_re` removes the '(1)' and change the word DATAPOINTS(1) to DATAPOINTS + word = re.sub(_alt_re, "", word) + phones = phones.split(" ") + cmudict.append((word, phones)) + + return cmudict + + +class CMUDict(Dataset): + """Create a Dataset for *CMU Pronouncing Dictionary* [:footcite:`cmudict`] (CMUDict). + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + exclude_punctuations (bool, optional): + When enabled, exclude the pronounciation of punctuations, such as + `!EXCLAMATION-POINT` and `#HASH-MARK`. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + url (str, optional): + The URL to download the dictionary from. + (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b"``) + url_symbols (str, optional): + The URL to download the list of symbols from. + (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols"``) + """ + + def __init__( + self, + root: Union[str, Path], + exclude_punctuations: bool = True, + *, + download: bool = False, + url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b", + url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols", + ) -> None: + + self.exclude_punctuations = exclude_punctuations + + self._root_path = Path(root) + if not os.path.isdir(self._root_path): + raise RuntimeError(f"The root directory does not exist; {root}") + + dict_file = self._root_path / os.path.basename(url) + symbol_file = self._root_path / os.path.basename(url_symbols) + if not os.path.exists(dict_file): + if not download: + raise RuntimeError( + "The dictionary file is not found in the following location. " + f"Set `download=True` to download it. {dict_file}" + ) + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, dict_file, checksum) + if not os.path.exists(symbol_file): + if not download: + raise RuntimeError( + "The symbol file is not found in the following location. " + f"Set `download=True` to download it. 
{symbol_file}" + ) + checksum = _CHECKSUMS.get(url_symbols, None) + download_url_to_file(url_symbols, symbol_file, checksum) + + with open(symbol_file, "r") as text: + self._symbols = [line.strip() for line in text.readlines()] + + with open(dict_file, "r", encoding="latin-1") as text: + self._dictionary = _parse_dictionary(text.readlines(), exclude_punctuations=self.exclude_punctuations) + + def __getitem__(self, n: int) -> Tuple[str, List[str]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded. + + Returns: + (str, List[str]): The corresponding word and phonemes ``(word, [phonemes])``. + + """ + return self._dictionary[n] + + def __len__(self) -> int: + return len(self._dictionary) + + @property + def symbols(self) -> List[str]: + """list[str]: A list of phonemes symbols, such as `AA`, `AE`, `AH`.""" + return self._symbols.copy() diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py new file mode 100644 index 0000000000000000000000000000000000000000..29ad5b7e8d09a110a8a957c38c04bfba180bbce4 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py @@ -0,0 +1,71 @@ +import csv +import os +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.utils.data import Dataset + + +def load_commonvoice_item( + line: List[str], header: List[str], path: str, folder_audio: str, ext_audio: str +) -> Tuple[Tensor, int, Dict[str, str]]: + # Each line as the following data: + # client_id, path, sentence, up_votes, down_votes, age, gender, accent + + assert header[1] == "path" + fileid = line[1] + filename = os.path.join(path, folder_audio, fileid) + if not filename.endswith(ext_audio): + filename += ext_audio + waveform, sample_rate = torchaudio.load(filename) + + dic = dict(zip(header, line)) + + return waveform, sample_rate, dic + + +class COMMONVOICE(Dataset): + """Create a Dataset for *CommonVoice* [:footcite:`ardila2020common`]. + + Args: + root (str or Path): Path to the directory where the dataset is located. + (Where the ``tsv`` file is present.) + tsv (str, optional): + The name of the tsv file used to construct the metadata, such as + ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``, + ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``) + """ + + _ext_txt = ".txt" + _ext_audio = ".mp3" + _folder_audio = "clips" + + def __init__(self, root: Union[str, Path], tsv: str = "train.tsv") -> None: + + # Get string representation of 'root' in case Path object is passed + self._path = os.fspath(root) + self._tsv = os.path.join(self._path, tsv) + + with open(self._tsv, "r") as tsv_: + walker = csv.reader(tsv_, delimiter="\t") + self._header = next(walker) + self._walker = list(walker) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, Dict[str, str]): ``(waveform, sample_rate, dictionary)``, where dictionary + is built from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``, + ``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``. 
+ """ + line = self._walker[n] + return load_commonvoice_item(line, self._header, self._path, self._folder_audio, self._ext_audio) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py new file mode 100644 index 0000000000000000000000000000000000000000..be865b61b2528f46aa2a014d4a9ab54672756fe3 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py @@ -0,0 +1,106 @@ +from pathlib import Path +from typing import Dict, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip" +_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769" +_SUPPORTED_SUBSETS = {"train", "test"} + + +class DR_VCTK(Dataset): + """Create a dataset for *Device Recorded VCTK (Small subset version)* [:footcite:`Sarfjoo2018DeviceRV`]. + + Args: + root (str or Path): Root directory where the dataset's top level directory is found. + subset (str): The subset to use. Can be one of ``"train"`` and ``"test"``. (default: ``"train"``). + download (bool): + Whether to download the dataset if it is not found at root path. (default: ``False``). + url (str): The URL to download the dataset from. + (default: ``"https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"``) + """ + + def __init__( + self, + root: Union[str, Path], + subset: str = "train", + *, + download: bool = False, + url: str = _URL, + ) -> None: + if subset not in _SUPPORTED_SUBSETS: + raise RuntimeError( + f"The subset '{subset}' does not match any of the supported subsets: {_SUPPORTED_SUBSETS}" + ) + + root = Path(root).expanduser() + archive = root / "DR-VCTK.zip" + + self._subset = subset + self._path = root / "DR-VCTK" / "DR-VCTK" + self._clean_audio_dir = self._path / f"clean_{self._subset}set_wav_16k" + self._noisy_audio_dir = self._path / f"device-recorded_{self._subset}set_wav_16k" + self._config_filepath = self._path / "configurations" / f"{self._subset}_ch_log.txt" + + if not self._path.is_dir(): + if not archive.is_file(): + if not download: + raise RuntimeError("Dataset not found. 
Please use `download=True` to download it.") + download_url_to_file(url, archive, hash_prefix=_CHECKSUM) + extract_archive(archive, root) + + self._config = self._load_config(self._config_filepath) + self._filename_list = sorted(self._config) + + def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]: + # Skip header + skip_rows = 2 if self._subset == "train" else 1 + + config = {} + with open(filepath) as f: + for i, line in enumerate(f): + if i < skip_rows or not line: + continue + filename, source, channel_id = line.strip().split("\t") + config[filename] = (source, int(channel_id)) + return config + + def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]: + speaker_id, utterance_id = filename.split(".")[0].split("_") + source, channel_id = self._config[filename] + file_clean_audio = self._clean_audio_dir / filename + file_noisy_audio = self._noisy_audio_dir / filename + waveform_clean, sample_rate_clean = torchaudio.load(file_clean_audio) + waveform_noisy, sample_rate_noisy = torchaudio.load(file_noisy_audio) + return ( + waveform_clean, + sample_rate_clean, + waveform_noisy, + sample_rate_noisy, + speaker_id, + utterance_id, + source, + channel_id, + ) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, Tensor, int, str, str, str, int): + ``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id,\ + utterance_id, source, channel_id)`` + """ + filename = self._filename_list[n] + return self._load_dr_vctk_item(filename) + + def __len__(self) -> int: + return len(self._filename_list) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py new file mode 100644 index 0000000000000000000000000000000000000000..6d087ea5ec401eabe80a1a9605def5a7a96bc961 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py @@ -0,0 +1,1108 @@ +import os +from pathlib import Path +from typing import Optional, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +# The following lists prefixed with `filtered_` provide a filtered split +# that: +# +# a. Mitigate a known issue with GTZAN (duplication) +# +# b. Provide a standard split for testing it against other +# methods (e.g. the one in jordipons/sklearn-audio-transfer-learning). +# +# Those are used when GTZAN is initialised with the `filtered` keyword. +# The split was taken from (github) jordipons/sklearn-audio-transfer-learning. 
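+#
+# A minimal usage sketch of the GTZAN class defined below (paths are placeholders;
+# the filtered training split is selected via the `subset` keyword):
+#
+#   dataset = GTZAN("./data", download=True, subset="training")
+#   waveform, sample_rate, genre = dataset[0]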
+ +gtzan_genres = [ + "blues", + "classical", + "country", + "disco", + "hiphop", + "jazz", + "metal", + "pop", + "reggae", + "rock", +] + +filtered_test = [ + "blues.00012", + "blues.00013", + "blues.00014", + "blues.00015", + "blues.00016", + "blues.00017", + "blues.00018", + "blues.00019", + "blues.00020", + "blues.00021", + "blues.00022", + "blues.00023", + "blues.00024", + "blues.00025", + "blues.00026", + "blues.00027", + "blues.00028", + "blues.00061", + "blues.00062", + "blues.00063", + "blues.00064", + "blues.00065", + "blues.00066", + "blues.00067", + "blues.00068", + "blues.00069", + "blues.00070", + "blues.00071", + "blues.00072", + "blues.00098", + "blues.00099", + "classical.00011", + "classical.00012", + "classical.00013", + "classical.00014", + "classical.00015", + "classical.00016", + "classical.00017", + "classical.00018", + "classical.00019", + "classical.00020", + "classical.00021", + "classical.00022", + "classical.00023", + "classical.00024", + "classical.00025", + "classical.00026", + "classical.00027", + "classical.00028", + "classical.00029", + "classical.00034", + "classical.00035", + "classical.00036", + "classical.00037", + "classical.00038", + "classical.00039", + "classical.00040", + "classical.00041", + "classical.00049", + "classical.00077", + "classical.00078", + "classical.00079", + "country.00030", + "country.00031", + "country.00032", + "country.00033", + "country.00034", + "country.00035", + "country.00036", + "country.00037", + "country.00038", + "country.00039", + "country.00040", + "country.00043", + "country.00044", + "country.00046", + "country.00047", + "country.00048", + "country.00050", + "country.00051", + "country.00053", + "country.00054", + "country.00055", + "country.00056", + "country.00057", + "country.00058", + "country.00059", + "country.00060", + "country.00061", + "country.00062", + "country.00063", + "country.00064", + "disco.00001", + "disco.00021", + "disco.00058", + "disco.00062", + "disco.00063", + "disco.00064", + "disco.00065", + "disco.00066", + "disco.00069", + "disco.00076", + "disco.00077", + "disco.00078", + "disco.00079", + "disco.00080", + "disco.00081", + "disco.00082", + "disco.00083", + "disco.00084", + "disco.00085", + "disco.00086", + "disco.00087", + "disco.00088", + "disco.00091", + "disco.00092", + "disco.00093", + "disco.00094", + "disco.00096", + "disco.00097", + "disco.00099", + "hiphop.00000", + "hiphop.00026", + "hiphop.00027", + "hiphop.00030", + "hiphop.00040", + "hiphop.00043", + "hiphop.00044", + "hiphop.00045", + "hiphop.00051", + "hiphop.00052", + "hiphop.00053", + "hiphop.00054", + "hiphop.00062", + "hiphop.00063", + "hiphop.00064", + "hiphop.00065", + "hiphop.00066", + "hiphop.00067", + "hiphop.00068", + "hiphop.00069", + "hiphop.00070", + "hiphop.00071", + "hiphop.00072", + "hiphop.00073", + "hiphop.00074", + "hiphop.00075", + "hiphop.00099", + "jazz.00073", + "jazz.00074", + "jazz.00075", + "jazz.00076", + "jazz.00077", + "jazz.00078", + "jazz.00079", + "jazz.00080", + "jazz.00081", + "jazz.00082", + "jazz.00083", + "jazz.00084", + "jazz.00085", + "jazz.00086", + "jazz.00087", + "jazz.00088", + "jazz.00089", + "jazz.00090", + "jazz.00091", + "jazz.00092", + "jazz.00093", + "jazz.00094", + "jazz.00095", + "jazz.00096", + "jazz.00097", + "jazz.00098", + "jazz.00099", + "metal.00012", + "metal.00013", + "metal.00014", + "metal.00015", + "metal.00022", + "metal.00023", + "metal.00025", + "metal.00026", + "metal.00027", + "metal.00028", + "metal.00029", + "metal.00030", + "metal.00031", + "metal.00032", 
+ "metal.00033", + "metal.00038", + "metal.00039", + "metal.00067", + "metal.00070", + "metal.00073", + "metal.00074", + "metal.00075", + "metal.00078", + "metal.00083", + "metal.00085", + "metal.00087", + "metal.00088", + "pop.00000", + "pop.00001", + "pop.00013", + "pop.00014", + "pop.00043", + "pop.00063", + "pop.00064", + "pop.00065", + "pop.00066", + "pop.00069", + "pop.00070", + "pop.00071", + "pop.00072", + "pop.00073", + "pop.00074", + "pop.00075", + "pop.00076", + "pop.00077", + "pop.00078", + "pop.00079", + "pop.00082", + "pop.00088", + "pop.00089", + "pop.00090", + "pop.00091", + "pop.00092", + "pop.00093", + "pop.00094", + "pop.00095", + "pop.00096", + "reggae.00034", + "reggae.00035", + "reggae.00036", + "reggae.00037", + "reggae.00038", + "reggae.00039", + "reggae.00040", + "reggae.00046", + "reggae.00047", + "reggae.00048", + "reggae.00052", + "reggae.00053", + "reggae.00064", + "reggae.00065", + "reggae.00066", + "reggae.00067", + "reggae.00068", + "reggae.00071", + "reggae.00079", + "reggae.00082", + "reggae.00083", + "reggae.00084", + "reggae.00087", + "reggae.00088", + "reggae.00089", + "reggae.00090", + "rock.00010", + "rock.00011", + "rock.00012", + "rock.00013", + "rock.00014", + "rock.00015", + "rock.00027", + "rock.00028", + "rock.00029", + "rock.00030", + "rock.00031", + "rock.00032", + "rock.00033", + "rock.00034", + "rock.00035", + "rock.00036", + "rock.00037", + "rock.00039", + "rock.00040", + "rock.00041", + "rock.00042", + "rock.00043", + "rock.00044", + "rock.00045", + "rock.00046", + "rock.00047", + "rock.00048", + "rock.00086", + "rock.00087", + "rock.00088", + "rock.00089", + "rock.00090", +] + +filtered_train = [ + "blues.00029", + "blues.00030", + "blues.00031", + "blues.00032", + "blues.00033", + "blues.00034", + "blues.00035", + "blues.00036", + "blues.00037", + "blues.00038", + "blues.00039", + "blues.00040", + "blues.00041", + "blues.00042", + "blues.00043", + "blues.00044", + "blues.00045", + "blues.00046", + "blues.00047", + "blues.00048", + "blues.00049", + "blues.00073", + "blues.00074", + "blues.00075", + "blues.00076", + "blues.00077", + "blues.00078", + "blues.00079", + "blues.00080", + "blues.00081", + "blues.00082", + "blues.00083", + "blues.00084", + "blues.00085", + "blues.00086", + "blues.00087", + "blues.00088", + "blues.00089", + "blues.00090", + "blues.00091", + "blues.00092", + "blues.00093", + "blues.00094", + "blues.00095", + "blues.00096", + "blues.00097", + "classical.00030", + "classical.00031", + "classical.00032", + "classical.00033", + "classical.00043", + "classical.00044", + "classical.00045", + "classical.00046", + "classical.00047", + "classical.00048", + "classical.00050", + "classical.00051", + "classical.00052", + "classical.00053", + "classical.00054", + "classical.00055", + "classical.00056", + "classical.00057", + "classical.00058", + "classical.00059", + "classical.00060", + "classical.00061", + "classical.00062", + "classical.00063", + "classical.00064", + "classical.00065", + "classical.00066", + "classical.00067", + "classical.00080", + "classical.00081", + "classical.00082", + "classical.00083", + "classical.00084", + "classical.00085", + "classical.00086", + "classical.00087", + "classical.00088", + "classical.00089", + "classical.00090", + "classical.00091", + "classical.00092", + "classical.00093", + "classical.00094", + "classical.00095", + "classical.00096", + "classical.00097", + "classical.00098", + "classical.00099", + "country.00019", + "country.00020", + "country.00021", + "country.00022", + 
"country.00023", + "country.00024", + "country.00025", + "country.00026", + "country.00028", + "country.00029", + "country.00065", + "country.00066", + "country.00067", + "country.00068", + "country.00069", + "country.00070", + "country.00071", + "country.00072", + "country.00073", + "country.00074", + "country.00075", + "country.00076", + "country.00077", + "country.00078", + "country.00079", + "country.00080", + "country.00081", + "country.00082", + "country.00083", + "country.00084", + "country.00085", + "country.00086", + "country.00087", + "country.00088", + "country.00089", + "country.00090", + "country.00091", + "country.00092", + "country.00093", + "country.00094", + "country.00095", + "country.00096", + "country.00097", + "country.00098", + "country.00099", + "disco.00005", + "disco.00015", + "disco.00016", + "disco.00017", + "disco.00018", + "disco.00019", + "disco.00020", + "disco.00022", + "disco.00023", + "disco.00024", + "disco.00025", + "disco.00026", + "disco.00027", + "disco.00028", + "disco.00029", + "disco.00030", + "disco.00031", + "disco.00032", + "disco.00033", + "disco.00034", + "disco.00035", + "disco.00036", + "disco.00037", + "disco.00039", + "disco.00040", + "disco.00041", + "disco.00042", + "disco.00043", + "disco.00044", + "disco.00045", + "disco.00047", + "disco.00049", + "disco.00053", + "disco.00054", + "disco.00056", + "disco.00057", + "disco.00059", + "disco.00061", + "disco.00070", + "disco.00073", + "disco.00074", + "disco.00089", + "hiphop.00002", + "hiphop.00003", + "hiphop.00004", + "hiphop.00005", + "hiphop.00006", + "hiphop.00007", + "hiphop.00008", + "hiphop.00009", + "hiphop.00010", + "hiphop.00011", + "hiphop.00012", + "hiphop.00013", + "hiphop.00014", + "hiphop.00015", + "hiphop.00016", + "hiphop.00017", + "hiphop.00018", + "hiphop.00019", + "hiphop.00020", + "hiphop.00021", + "hiphop.00022", + "hiphop.00023", + "hiphop.00024", + "hiphop.00025", + "hiphop.00028", + "hiphop.00029", + "hiphop.00031", + "hiphop.00032", + "hiphop.00033", + "hiphop.00034", + "hiphop.00035", + "hiphop.00036", + "hiphop.00037", + "hiphop.00038", + "hiphop.00041", + "hiphop.00042", + "hiphop.00055", + "hiphop.00056", + "hiphop.00057", + "hiphop.00058", + "hiphop.00059", + "hiphop.00060", + "hiphop.00061", + "hiphop.00077", + "hiphop.00078", + "hiphop.00079", + "hiphop.00080", + "jazz.00000", + "jazz.00001", + "jazz.00011", + "jazz.00012", + "jazz.00013", + "jazz.00014", + "jazz.00015", + "jazz.00016", + "jazz.00017", + "jazz.00018", + "jazz.00019", + "jazz.00020", + "jazz.00021", + "jazz.00022", + "jazz.00023", + "jazz.00024", + "jazz.00041", + "jazz.00047", + "jazz.00048", + "jazz.00049", + "jazz.00050", + "jazz.00051", + "jazz.00052", + "jazz.00053", + "jazz.00054", + "jazz.00055", + "jazz.00056", + "jazz.00057", + "jazz.00058", + "jazz.00059", + "jazz.00060", + "jazz.00061", + "jazz.00062", + "jazz.00063", + "jazz.00064", + "jazz.00065", + "jazz.00066", + "jazz.00067", + "jazz.00068", + "jazz.00069", + "jazz.00070", + "jazz.00071", + "jazz.00072", + "metal.00002", + "metal.00003", + "metal.00005", + "metal.00021", + "metal.00024", + "metal.00035", + "metal.00046", + "metal.00047", + "metal.00048", + "metal.00049", + "metal.00050", + "metal.00051", + "metal.00052", + "metal.00053", + "metal.00054", + "metal.00055", + "metal.00056", + "metal.00057", + "metal.00059", + "metal.00060", + "metal.00061", + "metal.00062", + "metal.00063", + "metal.00064", + "metal.00065", + "metal.00066", + "metal.00069", + "metal.00071", + "metal.00072", + "metal.00079", + "metal.00080", + 
"metal.00084", + "metal.00086", + "metal.00089", + "metal.00090", + "metal.00091", + "metal.00092", + "metal.00093", + "metal.00094", + "metal.00095", + "metal.00096", + "metal.00097", + "metal.00098", + "metal.00099", + "pop.00002", + "pop.00003", + "pop.00004", + "pop.00005", + "pop.00006", + "pop.00007", + "pop.00008", + "pop.00009", + "pop.00011", + "pop.00012", + "pop.00016", + "pop.00017", + "pop.00018", + "pop.00019", + "pop.00020", + "pop.00023", + "pop.00024", + "pop.00025", + "pop.00026", + "pop.00027", + "pop.00028", + "pop.00029", + "pop.00031", + "pop.00032", + "pop.00033", + "pop.00034", + "pop.00035", + "pop.00036", + "pop.00038", + "pop.00039", + "pop.00040", + "pop.00041", + "pop.00042", + "pop.00044", + "pop.00046", + "pop.00049", + "pop.00050", + "pop.00080", + "pop.00097", + "pop.00098", + "pop.00099", + "reggae.00000", + "reggae.00001", + "reggae.00002", + "reggae.00004", + "reggae.00006", + "reggae.00009", + "reggae.00011", + "reggae.00012", + "reggae.00014", + "reggae.00015", + "reggae.00016", + "reggae.00017", + "reggae.00018", + "reggae.00019", + "reggae.00020", + "reggae.00021", + "reggae.00022", + "reggae.00023", + "reggae.00024", + "reggae.00025", + "reggae.00026", + "reggae.00027", + "reggae.00028", + "reggae.00029", + "reggae.00030", + "reggae.00031", + "reggae.00032", + "reggae.00042", + "reggae.00043", + "reggae.00044", + "reggae.00045", + "reggae.00049", + "reggae.00050", + "reggae.00051", + "reggae.00054", + "reggae.00055", + "reggae.00056", + "reggae.00057", + "reggae.00058", + "reggae.00059", + "reggae.00060", + "reggae.00063", + "reggae.00069", + "rock.00000", + "rock.00001", + "rock.00002", + "rock.00003", + "rock.00004", + "rock.00005", + "rock.00006", + "rock.00007", + "rock.00008", + "rock.00009", + "rock.00016", + "rock.00017", + "rock.00018", + "rock.00019", + "rock.00020", + "rock.00021", + "rock.00022", + "rock.00023", + "rock.00024", + "rock.00025", + "rock.00026", + "rock.00057", + "rock.00058", + "rock.00059", + "rock.00060", + "rock.00061", + "rock.00062", + "rock.00063", + "rock.00064", + "rock.00065", + "rock.00066", + "rock.00067", + "rock.00068", + "rock.00069", + "rock.00070", + "rock.00091", + "rock.00092", + "rock.00093", + "rock.00094", + "rock.00095", + "rock.00096", + "rock.00097", + "rock.00098", + "rock.00099", +] + +filtered_valid = [ + "blues.00000", + "blues.00001", + "blues.00002", + "blues.00003", + "blues.00004", + "blues.00005", + "blues.00006", + "blues.00007", + "blues.00008", + "blues.00009", + "blues.00010", + "blues.00011", + "blues.00050", + "blues.00051", + "blues.00052", + "blues.00053", + "blues.00054", + "blues.00055", + "blues.00056", + "blues.00057", + "blues.00058", + "blues.00059", + "blues.00060", + "classical.00000", + "classical.00001", + "classical.00002", + "classical.00003", + "classical.00004", + "classical.00005", + "classical.00006", + "classical.00007", + "classical.00008", + "classical.00009", + "classical.00010", + "classical.00068", + "classical.00069", + "classical.00070", + "classical.00071", + "classical.00072", + "classical.00073", + "classical.00074", + "classical.00075", + "classical.00076", + "country.00000", + "country.00001", + "country.00002", + "country.00003", + "country.00004", + "country.00005", + "country.00006", + "country.00007", + "country.00009", + "country.00010", + "country.00011", + "country.00012", + "country.00013", + "country.00014", + "country.00015", + "country.00016", + "country.00017", + "country.00018", + "country.00027", + "country.00041", + "country.00042", + 
"country.00045", + "country.00049", + "disco.00000", + "disco.00002", + "disco.00003", + "disco.00004", + "disco.00006", + "disco.00007", + "disco.00008", + "disco.00009", + "disco.00010", + "disco.00011", + "disco.00012", + "disco.00013", + "disco.00014", + "disco.00046", + "disco.00048", + "disco.00052", + "disco.00067", + "disco.00068", + "disco.00072", + "disco.00075", + "disco.00090", + "disco.00095", + "hiphop.00081", + "hiphop.00082", + "hiphop.00083", + "hiphop.00084", + "hiphop.00085", + "hiphop.00086", + "hiphop.00087", + "hiphop.00088", + "hiphop.00089", + "hiphop.00090", + "hiphop.00091", + "hiphop.00092", + "hiphop.00093", + "hiphop.00094", + "hiphop.00095", + "hiphop.00096", + "hiphop.00097", + "hiphop.00098", + "jazz.00002", + "jazz.00003", + "jazz.00004", + "jazz.00005", + "jazz.00006", + "jazz.00007", + "jazz.00008", + "jazz.00009", + "jazz.00010", + "jazz.00025", + "jazz.00026", + "jazz.00027", + "jazz.00028", + "jazz.00029", + "jazz.00030", + "jazz.00031", + "jazz.00032", + "metal.00000", + "metal.00001", + "metal.00006", + "metal.00007", + "metal.00008", + "metal.00009", + "metal.00010", + "metal.00011", + "metal.00016", + "metal.00017", + "metal.00018", + "metal.00019", + "metal.00020", + "metal.00036", + "metal.00037", + "metal.00068", + "metal.00076", + "metal.00077", + "metal.00081", + "metal.00082", + "pop.00010", + "pop.00053", + "pop.00055", + "pop.00058", + "pop.00059", + "pop.00060", + "pop.00061", + "pop.00062", + "pop.00081", + "pop.00083", + "pop.00084", + "pop.00085", + "pop.00086", + "reggae.00061", + "reggae.00062", + "reggae.00070", + "reggae.00072", + "reggae.00074", + "reggae.00076", + "reggae.00077", + "reggae.00078", + "reggae.00085", + "reggae.00092", + "reggae.00093", + "reggae.00094", + "reggae.00095", + "reggae.00096", + "reggae.00097", + "reggae.00098", + "reggae.00099", + "rock.00038", + "rock.00049", + "rock.00050", + "rock.00051", + "rock.00052", + "rock.00053", + "rock.00054", + "rock.00055", + "rock.00056", + "rock.00071", + "rock.00072", + "rock.00073", + "rock.00074", + "rock.00075", + "rock.00076", + "rock.00077", + "rock.00078", + "rock.00079", + "rock.00080", + "rock.00081", + "rock.00082", + "rock.00083", + "rock.00084", + "rock.00085", +] + + +URL = "http://opihi.cs.uvic.ca/sound/genres.tar.gz" +FOLDER_IN_ARCHIVE = "genres" +_CHECKSUMS = { + "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "24347e0223d2ba798e0a558c4c172d9d4a19c00bb7963fe055d183dadb4ef2c6" +} + + +def load_gtzan_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, str]: + """ + Loads a file from the dataset and returns the raw waveform + as a Torch Tensor, its sample rate as an integer, and its + genre as a string. + """ + # Filenames are of the form label.id, e.g. blues.00078 + label, _ = fileid.split(".") + + # Read wav + file_audio = os.path.join(path, label, fileid + ext_audio) + waveform, sample_rate = torchaudio.load(file_audio) + + return waveform, sample_rate, label + + +class GTZAN(Dataset): + """Create a Dataset for *GTZAN* [:footcite:`tzanetakis_essl_cook_2001`]. + + Note: + Please see http://marsyas.info/downloads/datasets.html if you are planning to use + this dataset to publish results. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"http://opihi.cs.uvic.ca/sound/genres.tar.gz"``) + folder_in_archive (str, optional): The top-level directory of the dataset. 
+ download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + subset (str or None, optional): Which subset of the dataset to use. + One of ``"training"``, ``"validation"``, ``"testing"`` or ``None``. + If ``None``, the entire dataset is used. (default: ``None``). + """ + + _ext_audio = ".wav" + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + subset: Optional[str] = None, + ) -> None: + + # super(GTZAN, self).__init__() + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + self.root = root + self.url = url + self.folder_in_archive = folder_in_archive + self.download = download + self.subset = subset + + assert subset is None or subset in ["training", "validation", "testing"], ( + "When `subset` not None, it must take a value from " + "{'training', 'validation', 'testing'}." + ) + + archive = os.path.basename(url) + archive = os.path.join(root, archive) + self._path = os.path.join(root, folder_in_archive) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + + if not os.path.isdir(self._path): + raise RuntimeError("Dataset not found. Please use `download=True` to download it.") + + if self.subset is None: + # Check every subdirectory under dataset root + # which has the same name as the genres in + # GTZAN (e.g. `root_dir'/blues/, `root_dir'/rock, etc.) + # This lets users remove or move around song files, + # useful when e.g. they want to use only some of the files + # in a genre or want to label other files with a different + # genre. + self._walker = [] + + root = os.path.expanduser(self._path) + + for directory in gtzan_genres: + fulldir = os.path.join(root, directory) + + if not os.path.exists(fulldir): + continue + + songs_in_genre = os.listdir(fulldir) + songs_in_genre.sort() + for fname in songs_in_genre: + name, ext = os.path.splitext(fname) + if ext.lower() == ".wav" and "." in name: + # Check whether the file is of the form + # `gtzan_genre`.`5 digit number`.wav + genre, num = name.split(".") + if genre in gtzan_genres and len(num) == 5 and num.isdigit(): + self._walker.append(name) + else: + if self.subset == "training": + self._walker = filtered_train + elif self.subset == "validation": + self._walker = filtered_valid + elif self.subset == "testing": + self._walker = filtered_test + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str): ``(waveform, sample_rate, label)`` + """ + fileid = self._walker[n] + item = load_gtzan_item(fileid, self._path, self._ext_audio) + waveform, sample_rate, label = item + return waveform, sample_rate, label + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py new file mode 100644 index 0000000000000000000000000000000000000000..947254479ec83e2bc96525fe824a2201afe1e6ae --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py @@ -0,0 +1,91 @@ +import os +from pathlib import Path +from typing import List, Tuple, Union + +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.librispeech import load_librispeech_item +from torchaudio.datasets.utils import extract_archive + + +_ARCHIVE_NAME = "librispeech_finetuning" +_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" +_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" + + +def _get_fileids_paths(path, subset, _ext_audio) -> List[Tuple[str, str]]: + """Get the file names and the corresponding file paths without `speaker_id` + and `chapter_id` directories. + The format of path is like: + {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or + {root}/{_ARCHIVE_NAME}/9h/[clean, other] + """ + if subset == "10min": + files_paths = [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio) + ] + elif subset in ["1h", "10h"]: + files_paths = [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio) + ] + if subset == "10h": + files_paths += [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("9h/*/*/*/*" + _ext_audio) + ] + else: + raise ValueError(f"Unsupported subset value. Found {subset}.") + files_paths = sorted(files_paths, key=lambda x: x[0] + x[1]) + return files_paths + + +class LibriLightLimited(Dataset): + """Create a Dataset for LibriLightLimited, which is the supervised subset of + LibriLight dataset. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + subset (str, optional): The subset to use. Options: [``10min``, ``1h``, ``10h``] + (Default: ``10min``). + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + _ext_txt = ".trans.txt" + _ext_audio = ".flac" + + def __init__( + self, + root: Union[str, Path], + subset: str = "10min", + download: bool = False, + ) -> None: + assert subset in ["10min", "1h", "10h"], "`subset` must be one of ['10min', '1h', '10h']" + + root = os.fspath(root) + self._path = os.path.join(root, _ARCHIVE_NAME) + archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz") + if not os.path.isdir(self._path): + if not download: + raise RuntimeError("Dataset not found. 
Please use `download=True` to download") + if not os.path.isfile(archive): + download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) + extract_archive(archive) + self._fileids_paths = _get_fileids_paths(self._path, subset, self._ext_audio) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. + Args: + n (int): The index of the sample to be loaded + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`` + """ + file_path, fileid = self._fileids_paths[n] + return load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt) + + def __len__(self) -> int: + return len(self._fileids_paths) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc9496d7732be787023129a33da59623d586693 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py @@ -0,0 +1,85 @@ +from pathlib import Path +from typing import List, Tuple, Union + +import torch +import torchaudio +from torch.utils.data import Dataset + +SampleType = Tuple[int, torch.Tensor, List[torch.Tensor]] + + +class LibriMix(Dataset): + r"""Create the *LibriMix* [:footcite:`cosentino2020librimix`] dataset. + + Args: + root (str or Path): The path to the directory where the directory ``Libri2Mix`` or + ``Libri3Mix`` is stored. + subset (str, optional): The subset to use. Options: [``train-360``, ``train-100``, + ``dev``, and ``test``] (Default: ``train-360``). + num_speakers (int, optional): The number of speakers, which determines the directories + to traverse. The Dataset will traverse ``s1`` to ``sN`` directories to collect + N source audios. (Default: 2) + sample_rate (int, optional): sample rate of audio files. The ``sample_rate`` determines + which subdirectory the audio are fetched. If any of the audio has a different sample + rate, raises ``ValueError``. Options: [8000, 16000] (Default: 8000) + task (str, optional): the task of LibriMix. + Options: [``enh_single``, ``enh_both``, ``sep_clean``, ``sep_noisy``] + (Default: ``sep_clean``) + + Note: + The LibriMix dataset needs to be manually generated. Please check https://github.com/JorisCos/LibriMix + """ + + def __init__( + self, + root: Union[str, Path], + subset: str = "train-360", + num_speakers: int = 2, + sample_rate: int = 8000, + task: str = "sep_clean", + ): + self.root = Path(root) / f"Libri{num_speakers}Mix" + if sample_rate == 8000: + self.root = self.root / "wav8k/min" / subset + elif sample_rate == 16000: + self.root = self.root / "wav16k/min" / subset + else: + raise ValueError(f"Unsupported sample rate. Found {sample_rate}.") + self.sample_rate = sample_rate + self.task = task + self.mix_dir = (self.root / f"mix_{task.split('_')[1]}").resolve() + self.src_dirs = [(self.root / f"s{i+1}").resolve() for i in range(num_speakers)] + + self.files = [p.name for p in self.mix_dir.glob("*wav")] + self.files.sort() + + def _load_audio(self, path) -> torch.Tensor: + waveform, sample_rate = torchaudio.load(path) + if sample_rate != self.sample_rate: + raise ValueError( + f"The dataset contains audio file of sample rate {sample_rate}, " + f"but the requested sample rate is {self.sample_rate}." 
+ ) + return waveform + + def _load_sample(self, filename) -> SampleType: + mixed = self._load_audio(str(self.mix_dir / filename)) + srcs = [] + for i, dir_ in enumerate(self.src_dirs): + src = self._load_audio(str(dir_ / filename)) + if mixed.shape != src.shape: + raise ValueError(f"Different waveform shapes. mixed: {mixed.shape}, src[{i}]: {src.shape}") + srcs.append(src) + return self.sample_rate, mixed, srcs + + def __len__(self) -> int: + return len(self.files) + + def __getitem__(self, key: int) -> SampleType: + """Load the n-th sample from the dataset. + Args: + key (int): The index of the sample to be loaded + Returns: + (int, Tensor, List[Tensor]): ``(sample_rate, mix_waveform, list_of_source_waveforms)`` + """ + return self._load_sample(self.files[key]) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..5cd06468303bd612a45aa286ed5af83c169cff40 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py @@ -0,0 +1,135 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "train-clean-100" +FOLDER_IN_ARCHIVE = "LibriSpeech" +_DATA_SUBSETS = [ + "dev-clean", + "dev-other", + "test-clean", + "test-other", + "train-clean-100", + "train-clean-360", + "train-other-500", +] +_CHECKSUMS = { + "http://www.openslr.org/resources/12/dev-clean.tar.gz": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3", # noqa: E501 + "http://www.openslr.org/resources/12/dev-other.tar.gz": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365", # noqa: E501 + "http://www.openslr.org/resources/12/test-clean.tar.gz": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23", # noqa: E501 + "http://www.openslr.org/resources/12/test-other.tar.gz": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29", # noqa: E501 + "http://www.openslr.org/resources/12/train-clean-100.tar.gz": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2", # noqa: E501 + "http://www.openslr.org/resources/12/train-clean-360.tar.gz": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf", # noqa: E501 + "http://www.openslr.org/resources/12/train-other-500.tar.gz": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2", # noqa: E501 +} + + +def download_librispeech(root, url): + base_url = "http://www.openslr.org/resources/12/" + ext_archive = ".tar.gz" + + filename = url + ext_archive + archive = os.path.join(root, filename) + download_url = os.path.join(base_url, filename) + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(download_url, None) + download_url_to_file(download_url, archive, hash_prefix=checksum) + extract_archive(archive) + + +def load_librispeech_item( + fileid: str, path: str, ext_audio: str, ext_txt: str +) -> Tuple[Tensor, int, str, int, int, int]: + speaker_id, chapter_id, utterance_id = fileid.split("-") + + # Load audio + fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}" + file_audio = fileid_audio + ext_audio + file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) + waveform, sample_rate = 
torchaudio.load(file_audio) + + # Load text + file_text = f"{speaker_id}-{chapter_id}{ext_txt}" + file_text = os.path.join(path, speaker_id, chapter_id, file_text) + with open(file_text) as ft: + for line in ft: + fileid_text, transcript = line.strip().split(" ", 1) + if fileid_audio == fileid_text: + break + else: + # Translation not found + raise FileNotFoundError(f"Translation not found for {fileid_audio}") + + return ( + waveform, + sample_rate, + transcript, + int(speaker_id), + int(chapter_id), + int(utterance_id), + ) + + +class LIBRISPEECH(Dataset): + """Create a Dataset for *LibriSpeech* [:footcite:`7178964`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to dowload. + Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, + ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and + ``"train-other-500"``. (default: ``"train-clean-100"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"LibriSpeech"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + _ext_txt = ".trans.txt" + _ext_audio = ".flac" + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + ) -> None: + if url not in _DATA_SUBSETS: + raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.") + + root = os.fspath(root) + self._path = os.path.join(root, folder_in_archive, url) + + if not os.path.isdir(self._path): + if download: + download_librispeech(root, url) + else: + raise RuntimeError( + f"Dataset not found at {self._path}. Please set `download=True` to download the dataset." + ) + + self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio)) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`` + """ + fileid = self._walker[n] + return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/libritts.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/libritts.py new file mode 100644 index 0000000000000000000000000000000000000000..f7e10cedc4be9f91e0dc839ea446aea984060447 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/libritts.py @@ -0,0 +1,154 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "train-clean-100" +FOLDER_IN_ARCHIVE = "LibriTTS" +_CHECKSUMS = { + "http://www.openslr.org/resources/60/dev-clean.tar.gz": "da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a", # noqa: E501 + "http://www.openslr.org/resources/60/dev-other.tar.gz": "d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c", # noqa: E501 + "http://www.openslr.org/resources/60/test-clean.tar.gz": "234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5", # noqa: E501 + "http://www.openslr.org/resources/60/test-other.tar.gz": "33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d", # noqa: E501 + "http://www.openslr.org/resources/60/train-clean-100.tar.gz": "c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b", # noqa: E501 + "http://www.openslr.org/resources/60/train-clean-360.tar.gz": "ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886", # noqa: E501 + "http://www.openslr.org/resources/60/train-other-500.tar.gz": "e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df", # noqa: E501 +} + + +def load_libritts_item( + fileid: str, + path: str, + ext_audio: str, + ext_original_txt: str, + ext_normalized_txt: str, +) -> Tuple[Tensor, int, str, str, int, int, str]: + speaker_id, chapter_id, segment_id, utterance_id = fileid.split("_") + utterance_id = fileid + + normalized_text = utterance_id + ext_normalized_txt + normalized_text = os.path.join(path, speaker_id, chapter_id, normalized_text) + + original_text = utterance_id + ext_original_txt + original_text = os.path.join(path, speaker_id, chapter_id, original_text) + + file_audio = utterance_id + ext_audio + file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) + + # Load audio + waveform, sample_rate = torchaudio.load(file_audio) + + # Load original text + with open(original_text) as ft: + original_text = ft.readline() + + # Load normalized text + with open(normalized_text, "r") as ft: + normalized_text = ft.readline() + + return ( + waveform, + sample_rate, + original_text, + normalized_text, + int(speaker_id), + int(chapter_id), + utterance_id, + ) + + +class LIBRITTS(Dataset): + """Create a Dataset for *LibriTTS* [:footcite:`Zen2019LibriTTSAC`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to dowload. 
+ Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, + ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and + ``"train-other-500"``. (default: ``"train-clean-100"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"LibriTTS"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + _ext_original_txt = ".original.txt" + _ext_normalized_txt = ".normalized.txt" + _ext_audio = ".wav" + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + ) -> None: + + if url in [ + "dev-clean", + "dev-other", + "test-clean", + "test-other", + "train-clean-100", + "train-clean-360", + "train-other-500", + ]: + + ext_archive = ".tar.gz" + base_url = "http://www.openslr.org/resources/60/" + + url = os.path.join(base_url, url + ext_archive) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + folder_in_archive = os.path.join(folder_in_archive, basename) + + self._path = os.path.join(root, folder_in_archive) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio)) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str, str, int, int, str): + ``(waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id)`` + """ + fileid = self._walker[n] + return load_libritts_item( + fileid, + self._path, + self._ext_audio, + self._ext_original_txt, + self._ext_normalized_txt, + ) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/ljspeech.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/ljspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..e8421b639f913a007838c2d01fed1181d83d5c9a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/ljspeech.py @@ -0,0 +1,99 @@ +import csv +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "wavs", + "url": "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", + "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5", + } +} + + +class LJSPEECH(Dataset): + """Create a Dataset for *LJSpeech-1.1* [:footcite:`ljspeech17`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. 
+ url (str, optional): The URL to download the dataset from. + (default: ``"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"wavs"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + """ + + def __init__( + self, + root: Union[str, Path], + url: str = _RELEASE_CONFIGS["release1"]["url"], + folder_in_archive: str = _RELEASE_CONFIGS["release1"]["folder_in_archive"], + download: bool = False, + ) -> None: + + self._parse_filesystem(root, url, folder_in_archive, download) + + def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None: + root = Path(root) + + basename = os.path.basename(url) + archive = root / basename + + basename = Path(basename.split(".tar.bz2")[0]) + folder_in_archive = basename / folder_in_archive + + self._path = root / folder_in_archive + self._metadata_path = root / basename / "metadata.csv" + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _RELEASE_CONFIGS["release1"]["checksum"] + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + with open(self._metadata_path, "r", newline="") as metadata: + flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE) + self._flist = list(flist) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str): + ``(waveform, sample_rate, transcript, normalized_transcript)`` + """ + line = self._flist[n] + fileid, transcript, normalized_transcript = line + fileid_audio = self._path / (fileid + ".wav") + + # Load audio + waveform, sample_rate = torchaudio.load(fileid_audio) + + return ( + waveform, + sample_rate, + transcript, + normalized_transcript, + ) + + def __len__(self) -> int: + return len(self._flist) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/quesst14.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/quesst14.py new file mode 100644 index 0000000000000000000000000000000000000000..68ddceeaf9d770a772de87d9709472d6c0dce455 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/quesst14.py @@ -0,0 +1,109 @@ +import os +import re +from pathlib import Path +from typing import Optional, Tuple, Union + +import torch +import torchaudio +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +URL = "https://speech.fit.vutbr.cz/files/quesst14Database.tgz" +_CHECKSUM = "4f869e06bc066bbe9c5dde31dbd3909a0870d70291110ebbb38878dcbc2fc5e4" +_LANGUAGES = [ + "albanian", + "basque", + "czech", + "nnenglish", + "romanian", + "slovak", +] + + +class QUESST14(Dataset): + """Create *QUESST14* [:footcite:`Mir2015QUESST2014EQ`] Dataset + + Args: + root (str or Path): Root directory where the dataset's top level directory is found + subset (str): Subset of the dataset to use. Options: [``"docs"``, ``"dev"``, ``"eval"``]. + language (str or None, optional): Language to get dataset for. 
+ Options: [``None``, ``albanian``, ``basque``, ``czech``, ``nnenglish``, ``romanian``, ``slovak``]. + If ``None``, dataset consists of all languages. (default: ``"nnenglish"``) + download (bool, optional): Whether to download the dataset if it is not found at root path. + (default: ``False``) + """ + + def __init__( + self, + root: Union[str, Path], + subset: str, + language: Optional[str] = "nnenglish", + download: bool = False, + ) -> None: + assert subset in ["docs", "dev", "eval"], "`subset` must be one of ['docs', 'dev', 'eval']" + + assert language is None or language in _LANGUAGES, f"`language` must be None or one of {str(_LANGUAGES)}" + + # Get string representation of 'root' + root = os.fspath(root) + + basename = os.path.basename(URL) + archive = os.path.join(root, basename) + + basename = basename.rsplit(".", 2)[0] + self._path = os.path.join(root, basename) + + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + if not download: + raise RuntimeError("Dataset not found. Please use `download=True` to download") + download_url_to_file(URL, archive, hash_prefix=_CHECKSUM) + extract_archive(archive, root) + + if subset == "docs": + self.data = filter_audio_paths(self._path, language, "language_key_utterances.lst") + elif subset == "dev": + self.data = filter_audio_paths(self._path, language, "language_key_dev.lst") + elif subset == "eval": + self.data = filter_audio_paths(self._path, language, "language_key_eval.lst") + + def _load_sample(self, n: int) -> Tuple[torch.Tensor, int, str]: + audio_path = self.data[n] + wav, sample_rate = torchaudio.load(audio_path) + return wav, sample_rate, audio_path.with_suffix("").name + + def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]: + """Load the n-th sample from the dataset. 
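+
+        A usage sketch, added for illustration (the root path here is a placeholder, not part of
+        the original docstring):
+
+            >>> dataset = QUESST14("./data", subset="docs", download=True)
+            >>> waveform, sample_rate, file_name = dataset[0]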
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str): ``(waveform, sample_rate, file_name)`` + """ + return self._load_sample(n) + + def __len__(self) -> int: + return len(self.data) + + +def filter_audio_paths( + path: str, + language: str, + lst_name: str, +): + """Extract audio paths for the given language.""" + audio_paths = [] + + path = Path(path) + with open(path / "scoring" / lst_name) as f: + for line in f: + audio_path, lang = line.strip().split() + if language is not None and lang != language: + continue + audio_path = re.sub(r"^.*?\/", "", audio_path) + audio_paths.append(path / audio_path) + + return audio_paths diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/speechcommands.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/speechcommands.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9872662f9396651ff85a1438aed14afaed3c90 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/speechcommands.py @@ -0,0 +1,149 @@ +import os +from pathlib import Path +from typing import Optional, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +FOLDER_IN_ARCHIVE = "SpeechCommands" +URL = "speech_commands_v0.02" +HASH_DIVIDER = "_nohash_" +EXCEPT_FOLDER = "_background_noise_" +_CHECKSUMS = { + "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz": "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d", # noqa: E501 + "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz": "af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58", # noqa: E501 +} + + +def _load_list(root, *filenames): + output = [] + for filename in filenames: + filepath = os.path.join(root, filename) + with open(filepath) as fileobj: + output += [os.path.normpath(os.path.join(root, line.strip())) for line in fileobj] + return output + + +def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str, str, int]: + relpath = os.path.relpath(filepath, path) + label, filename = os.path.split(relpath) + # Besides the officially supported split method for datasets defined by "validation_list.txt" + # and "testing_list.txt" over "speech_commands_v0.0x.tar.gz" archives, an alternative split + # method referred to in paragraph 2-3 of Section 7.1, references 13 and 14 of the original + # paper, and the checksums file from the tensorflow_datasets package [1] is also supported. + # Some filenames in those "speech_commands_test_set_v0.0x.tar.gz" archives have the form + # "xxx.wav.wav", so file extensions twice needs to be stripped twice. + # [1] https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/url_checksums/speech_commands.txt + speaker, _ = os.path.splitext(filename) + speaker, _ = os.path.splitext(speaker) + + speaker_id, utterance_number = speaker.split(HASH_DIVIDER) + utterance_number = int(utterance_number) + + # Load audio + waveform, sample_rate = torchaudio.load(filepath) + return waveform, sample_rate, label, speaker_id, utterance_number + + +class SPEECHCOMMANDS(Dataset): + """Create a Dataset for *Speech Commands* [:footcite:`speechcommandsv2`]. 
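+
+    A minimal usage sketch, added for illustration (the root path and the choice of
+    ``subset="testing"`` are placeholders, not part of the original docstring):
+
+        >>> dataset = SPEECHCOMMANDS("./data", download=True, subset="testing")
+        >>> waveform, sample_rate, label, speaker_id, utterance_number = dataset[0]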
+ + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to dowload. + Allowed type values are ``"speech_commands_v0.01"`` and ``"speech_commands_v0.02"`` + (default: ``"speech_commands_v0.02"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"SpeechCommands"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + subset (str or None, optional): + Select a subset of the dataset [None, "training", "validation", "testing"]. None means + the whole dataset. "validation" and "testing" are defined in "validation_list.txt" and + "testing_list.txt", respectively, and "training" is the rest. Details for the files + "validation_list.txt" and "testing_list.txt" are explained in the README of the dataset + and in the introduction of Section 7 of the original paper and its reference 12. The + original paper can be found `here `_. (Default: ``None``) + """ + + def __init__( + self, + root: Union[str, Path], + url: str = URL, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + subset: Optional[str] = None, + ) -> None: + + assert subset is None or subset in ["training", "validation", "testing"], ( + "When `subset` not None, it must take a value from " + "{'training', 'validation', 'testing'}." + ) + + if url in [ + "speech_commands_v0.01", + "speech_commands_v0.02", + ]: + base_url = "https://storage.googleapis.com/download.tensorflow.org/data/" + ext_archive = ".tar.gz" + + url = os.path.join(base_url, url + ext_archive) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.rsplit(".", 2)[0] + folder_in_archive = os.path.join(folder_in_archive, basename) + + self._path = os.path.join(root, folder_in_archive) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive, self._path) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + if subset == "validation": + self._walker = _load_list(self._path, "validation_list.txt") + elif subset == "testing": + self._walker = _load_list(self._path, "testing_list.txt") + elif subset == "training": + excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt")) + walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav")) + self._walker = [ + w + for w in walker + if HASH_DIVIDER in w and EXCEPT_FOLDER not in w and os.path.normpath(w) not in excludes + ] + else: + walker = sorted(str(p) for p in Path(self._path).glob("*/*.wav")) + self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w] + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str, int): + ``(waveform, sample_rate, label, speaker_id, utterance_number)`` + """ + fileid = self._walker[n] + return load_speechcommands_item(fileid, self._path) + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/tedlium.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/tedlium.py new file mode 100644 index 0000000000000000000000000000000000000000..d7478ca7beb892c1016a5d286811111de12f436d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/tedlium.py @@ -0,0 +1,206 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "TEDLIUM_release1", + "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz", + "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27", + "data_path": "", + "subset": "train", + "supported_subsets": ["train", "test", "dev"], + "dict": "TEDLIUM.150K.dic", + }, + "release2": { + "folder_in_archive": "TEDLIUM_release2", + "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz", + "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58", + "data_path": "", + "subset": "train", + "supported_subsets": ["train", "test", "dev"], + "dict": "TEDLIUM.152k.dic", + }, + "release3": { + "folder_in_archive": "TEDLIUM_release-3", + "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz", + "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", + "data_path": "data/", + "subset": "train", + "supported_subsets": ["train", "test", "dev"], + "dict": "TEDLIUM.152k.dic", + }, +} + + +class TEDLIUM(Dataset): + """ + Create a Dataset for *Tedlium* [:footcite:`rousseau2012tedlium`]. It supports releases 1,2 and 3. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + release (str, optional): Release version. + Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``. + (default: ``"release1"``). + subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``, + and ``"test"``. Defaults to ``"train"``. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ audio_ext (str, optional): extension for audio file (default: ``".sph"``) + """ + + def __init__( + self, + root: Union[str, Path], + release: str = "release1", + subset: str = "train", + download: bool = False, + audio_ext: str = ".sph", + ) -> None: + self._ext_audio = audio_ext + if release in _RELEASE_CONFIGS.keys(): + folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] + url = _RELEASE_CONFIGS[release]["url"] + subset = subset if subset else _RELEASE_CONFIGS[release]["subset"] + else: + # Raise warning + raise RuntimeError( + "The release {} does not match any of the supported tedlium releases{} ".format( + release, + _RELEASE_CONFIGS.keys(), + ) + ) + if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]: + # Raise warning + raise RuntimeError( + "The subset {} does not match any of the supported tedlium subsets{} ".format( + subset, + _RELEASE_CONFIGS[release]["supported_subsets"], + ) + ) + + # Get string representation of 'root' in case Path object is passed + root = os.fspath(root) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + + if release == "release3": + if subset == "train": + self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) + else: + self._path = os.path.join(root, folder_in_archive, "legacy", subset) + else: + self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"], subset) + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _RELEASE_CONFIGS[release]["checksum"] + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + else: + if not os.path.exists(self._path): + raise RuntimeError( + f"The path {self._path} doesn't exist. " + "Please check the ``root`` path or set `download=True` to download it" + ) + + # Create list for all samples + self._filelist = [] + stm_path = os.path.join(self._path, "stm") + for file in sorted(os.listdir(stm_path)): + if file.endswith(".stm"): + stm_path = os.path.join(self._path, "stm", file) + with open(stm_path) as f: + l = len(f.readlines()) + file = file.replace(".stm", "") + self._filelist.extend((file, line) for line in range(l)) + # Create dict path for later read + self._dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) + self._phoneme_dict = None + + def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: + """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name. 
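+
+        Clarifying note (added): ``fileid`` names a talk whose transcript is read from
+        ``stm/<fileid>.stm`` and whose audio is read from ``sph/<fileid><audio_ext>``;
+        ``line`` selects a single utterance row inside that ``.stm`` file.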
+ + Args: + fileid (str): File id to identify both text and audio files corresponding to the sample + line (int): Line identifier for the sample inside the text file + path (str): Dataset root path + + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)`` + """ + transcript_path = os.path.join(path, "stm", fileid) + with open(transcript_path + ".stm") as f: + transcript = f.readlines()[line] + talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) + + wave_path = os.path.join(path, "sph", fileid) + waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time) + + return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier) + + def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: + """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality + and load individual sentences from a full ted audio talk file. + + Args: + path (str): Path to audio file + start_time (int): Time in seconds where the sample sentence stars + end_time (int): Time in seconds where the sample sentence finishes + sample_rate (float, optional): Sampling rate + + Returns: + [Tensor, int]: Audio tensor representation and sample rate + """ + start_time = int(float(start_time) * sample_rate) + end_time = int(float(end_time) * sample_rate) + + kwargs = {"frame_offset": start_time, "num_frames": end_time - start_time} + + return torchaudio.load(path, **kwargs) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)`` + """ + fileid, line = self._filelist[n] + return self._load_tedlium_item(fileid, line, self._path) + + def __len__(self) -> int: + """TEDLIUM dataset custom function overwritting len default behaviour. + + Returns: + int: TEDLIUM dataset length + """ + return len(self._filelist) + + @property + def phoneme_dict(self): + """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes. + Note that some words have empty phonemes. + """ + # Read phoneme dictionary + if not self._phoneme_dict: + self._phoneme_dict = {} + with open(self._dict_path, "r", encoding="utf-8") as f: + for line in f.readlines(): + content = line.strip().split() + self._phoneme_dict[content[0]] = tuple(content[1:]) # content[1:] can be empty list + return self._phoneme_dict.copy() diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/utils.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..020555c480cb8f71ddbc26d525716e34bfc35fd8 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/utils.py @@ -0,0 +1,191 @@ +import hashlib +import logging +import os +import tarfile +import urllib +import urllib.request +import warnings +import zipfile +from typing import Any, Iterable, List, Optional + +from torch.utils.model_zoo import tqdm + + +def stream_url( + url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True +) -> Iterable: + """Stream url by chunk + + Args: + url (str): Url. 
+ start_byte (int or None, optional): Start streaming at that point (Default: ``None``). + block_size (int, optional): Size of chunks to stream (Default: ``32 * 1024``). + progress_bar (bool, optional): Display a progress bar (Default: ``True``). + """ + + # If we already have the whole file, there is no need to download it again + req = urllib.request.Request(url, method="HEAD") + with urllib.request.urlopen(req) as response: + url_size = int(response.info().get("Content-Length", -1)) + if url_size == start_byte: + return + + req = urllib.request.Request(url) + if start_byte: + req.headers["Range"] = "bytes={}-".format(start_byte) + + with urllib.request.urlopen(req) as upointer, tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar: + + num_bytes = 0 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + yield chunk + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def download_url( + url: str, + download_folder: str, + filename: Optional[str] = None, + hash_value: Optional[str] = None, + hash_type: str = "sha256", + progress_bar: bool = True, + resume: bool = False, +) -> None: + """Download file to disk. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + filename (str or None, optional): Name of downloaded file. If None, it is inferred from the url + (Default: ``None``). + hash_value (str or None, optional): Hash for url (Default: ``None``). + hash_type (str, optional): Hash type, among "sha256" and "md5" (Default: ``"sha256"``). + progress_bar (bool, optional): Display a progress bar (Default: ``True``). + resume (bool, optional): Enable resuming download (Default: ``False``). + """ + warnings.warn("download_url is deprecated and will be removed in the v0.12 release.") + req = urllib.request.Request(url, method="HEAD") + req_info = urllib.request.urlopen(req).info() + + # Detect filename + filename = filename or req_info.get_filename() or os.path.basename(url) + filepath = os.path.join(download_folder, filename) + if resume and os.path.exists(filepath): + mode = "ab" + local_size: Optional[int] = os.path.getsize(filepath) + + elif not resume and os.path.exists(filepath): + raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + else: + mode = "wb" + local_size = None + + if hash_value and local_size == int(req_info.get("Content-Length", -1)): + with open(filepath, "rb") as file_obj: + if validate_file(file_obj, hash_value, hash_type): + return + raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): + fpointer.write(chunk) + + with open(filepath, "rb") as file_obj: + if hash_value and not validate_file(file_obj, hash_value, hash_type): + raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + + +def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: + """Validate a given file object with its hash. + + Args: + file_obj: File object to read from. + hash_value (str): Hash for url. + hash_type (str, optional): Hash type, among "sha256" and "md5" (Default: ``"sha256"``). + + Returns: + bool: return True if its a valid file, else False. 
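+
+    Example (illustrative only; the file name and hash value are placeholders):
+        >>> with open("archive.tar.gz", "rb") as file_obj:
+        >>>     validate_file(file_obj, hash_value="3c01...", hash_type="sha256")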
+ """ + + if hash_type == "sha256": + hash_func = hashlib.sha256() + elif hash_type == "md5": + hash_func = hashlib.md5() + else: + raise ValueError + + while True: + # Read by chunk to avoid filling memory + chunk = file_obj.read(1024**2) + if not chunk: + break + hash_func.update(chunk) + + return hash_func.hexdigest() == hash_value + + +def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: + """Extract archive. + Args: + from_path (str): the path of the archive. + to_path (str or None, optional): the root path of the extraced files (directory of from_path) + (Default: ``None``) + overwrite (bool, optional): overwrite existing files (Default: ``False``) + + Returns: + List[str]: List of paths to extracted files even if not overwritten. + + Examples: + >>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz' + >>> from_path = './validation.tar.gz' + >>> to_path = './' + >>> torchaudio.datasets.utils.download_from_url(url, from_path) + >>> torchaudio.datasets.utils.extract_archive(from_path, to_path) + """ + + if to_path is None: + to_path = os.path.dirname(from_path) + + try: + with tarfile.open(from_path, "r") as tar: + logging.info("Opened tar file {}.".format(from_path)) + files = [] + for file_ in tar: # type: Any + file_path = os.path.join(to_path, file_.name) + if file_.isfile(): + files.append(file_path) + if os.path.exists(file_path): + logging.info("{} already extracted.".format(file_path)) + if not overwrite: + continue + tar.extract(file_, to_path) + return files + except tarfile.ReadError: + pass + + try: + with zipfile.ZipFile(from_path, "r") as zfile: + logging.info("Opened zip file {}.".format(from_path)) + files = zfile.namelist() + for file_ in files: + file_path = os.path.join(to_path, file_) + if os.path.exists(file_path): + logging.info("{} already extracted.".format(file_path)) + if not overwrite: + continue + zfile.extract(file_, to_path) + return files + except zipfile.BadZipFile: + pass + + raise NotImplementedError("We currently only support tar.gz, tgz, and zip achives.") diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/vctk.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/vctk.py new file mode 100644 index 0000000000000000000000000000000000000000..a2dd6abddf9a9dea8d583e9d42f2578b065f1e86 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/vctk.py @@ -0,0 +1,133 @@ +import os +from typing import Tuple + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + +URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip" +_CHECKSUMS = { + "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip": "f96258be9fdc2cbff6559541aae7ea4f59df3fcaf5cf963aae5ca647357e359c" # noqa: E501 +} + + +SampleType = Tuple[Tensor, int, str, str, str] + + +class VCTK_092(Dataset): + """Create *VCTK 0.92* [:footcite:`yamagishi2019vctk`] Dataset + + Args: + root (str): Root directory where the dataset's top level directory is found. + mic_id (str, optional): Microphone ID. Either ``"mic1"`` or ``"mic2"``. (default: ``"mic2"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ url (str, optional): The URL to download the dataset from. + (default: ``"https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"``) + audio_ext (str, optional): Custom audio extension if dataset is converted to non-default audio format. + + Note: + * All the speeches from speaker ``p315`` will be skipped due to the lack of the corresponding text files. + * All the speeches from ``p280`` will be skipped for ``mic_id="mic2"`` due to the lack of the audio files. + * Some of the speeches from speaker ``p362`` will be skipped due to the lack of the audio files. + * See Also: https://datashare.is.ed.ac.uk/handle/10283/3443 + """ + + def __init__( + self, + root: str, + mic_id: str = "mic2", + download: bool = False, + url: str = URL, + audio_ext=".flac", + ): + if mic_id not in ["mic1", "mic2"]: + raise RuntimeError(f'`mic_id` has to be either "mic1" or "mic2". Found: {mic_id}') + + archive = os.path.join(root, "VCTK-Corpus-0.92.zip") + + self._path = os.path.join(root, "VCTK-Corpus-0.92") + self._txt_dir = os.path.join(self._path, "txt") + self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed") + self._mic_id = mic_id + self._audio_ext = audio_ext + + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive, self._path) + + if not os.path.isdir(self._path): + raise RuntimeError("Dataset not found. Please use `download=True` to download it.") + + # Extracting speaker IDs from the folder structure + self._speaker_ids = sorted(os.listdir(self._txt_dir)) + self._sample_ids = [] + + """ + Due to some insufficient data complexity in the 0.92 version of this dataset, + we start traversing the audio folder structure in accordance with the text folder. + As some of the audio files are missing of either ``mic_1`` or ``mic_2`` but the + text is present for the same, we first check for the existence of the audio file + before adding it to the ``sample_ids`` list. + + Once the ``audio_ids`` are loaded into memory we can quickly access the list for + different parameters required by the user. 
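+
+        As an illustration (file names are hypothetical): a transcript ``p225/p225_001.txt``
+        whose matching audio file ``p225_001_mic2.flac`` exists is recorded as the sample id
+        ``["p225", "001"]``, from which ``_load_sample`` later rebuilds both paths.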
+ """ + for speaker_id in self._speaker_ids: + if speaker_id == "p280" and mic_id == "mic2": + continue + utterance_dir = os.path.join(self._txt_dir, speaker_id) + for utterance_file in sorted(f for f in os.listdir(utterance_dir) if f.endswith(".txt")): + utterance_id = os.path.splitext(utterance_file)[0] + audio_path_mic = os.path.join( + self._audio_dir, + speaker_id, + f"{utterance_id}_{mic_id}{self._audio_ext}", + ) + if speaker_id == "p362" and not os.path.isfile(audio_path_mic): + continue + self._sample_ids.append(utterance_id.split("_")) + + def _load_text(self, file_path) -> str: + with open(file_path) as file_path: + return file_path.readlines()[0] + + def _load_audio(self, file_path) -> Tuple[Tensor, int]: + return torchaudio.load(file_path) + + def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType: + transcript_path = os.path.join(self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt") + audio_path = os.path.join( + self._audio_dir, + speaker_id, + f"{speaker_id}_{utterance_id}_{mic_id}{self._audio_ext}", + ) + + # Reading text + transcript = self._load_text(transcript_path) + + # Reading FLAC + waveform, sample_rate = self._load_audio(audio_path) + + return (waveform, sample_rate, transcript, speaker_id, utterance_id) + + def __getitem__(self, n: int) -> SampleType: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, str, str, str): + ``(waveform, sample_rate, transcript, speaker_id, utterance_id)`` + """ + speaker_id, utterance_id = self._sample_ids[n] + return self._load_sample(speaker_id, utterance_id, self._mic_id) + + def __len__(self) -> int: + return len(self._sample_ids) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/yesno.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/yesno.py new file mode 100644 index 0000000000000000000000000000000000000000..8818f578052c7cfbee876aba9db7402abdc835b0 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/yesno.py @@ -0,0 +1,82 @@ +import os +from pathlib import Path +from typing import List, Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import extract_archive + + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "waves_yesno", + "url": "http://www.openslr.org/resources/1/waves_yesno.tar.gz", + "checksum": "c3f49e0cca421f96b75b41640749167b52118f232498667ca7a5f9416aef8e73", + } +} + + +class YESNO(Dataset): + """Create a Dataset for *YesNo* [:footcite:`YesNo`]. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"http://www.openslr.org/resources/1/waves_yesno.tar.gz"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"waves_yesno"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). 
+ """ + + def __init__( + self, + root: Union[str, Path], + url: str = _RELEASE_CONFIGS["release1"]["url"], + folder_in_archive: str = _RELEASE_CONFIGS["release1"]["folder_in_archive"], + download: bool = False, + ) -> None: + + self._parse_filesystem(root, url, folder_in_archive, download) + + def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, download: bool) -> None: + root = Path(root) + archive = os.path.basename(url) + archive = root / archive + + self._path = root / folder_in_archive + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _RELEASE_CONFIGS["release1"]["checksum"] + download_url_to_file(url, archive, hash_prefix=checksum) + extract_archive(archive) + + if not os.path.isdir(self._path): + raise RuntimeError("Dataset not found. Please use `download=True` to download it.") + + self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*.wav")) + + def _load_item(self, fileid: str, path: str): + labels = [int(c) for c in fileid.split("_")] + file_audio = os.path.join(path, fileid + ".wav") + waveform, sample_rate = torchaudio.load(file_audio) + return waveform, sample_rate, labels + + def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + (Tensor, int, List[int]): ``(waveform, sample_rate, labels)`` + """ + fileid = self._walker[n] + item = self._load_item(fileid, self._path) + return item + + def __len__(self) -> int: + return len(self._walker) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..06325da3fee54a2696c6829ee6d5491a9384b37d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/__init__.py @@ -0,0 +1,109 @@ +from .filtering import ( + allpass_biquad, + band_biquad, + bandpass_biquad, + bandreject_biquad, + bass_biquad, + biquad, + contrast, + dcshift, + deemph_biquad, + dither, + equalizer_biquad, + filtfilt, + flanger, + gain, + highpass_biquad, + lfilter, + lowpass_biquad, + overdrive, + phaser, + riaa_biquad, + treble_biquad, + vad, +) +from .functional import ( + amplitude_to_DB, + apply_beamforming, + apply_codec, + compute_deltas, + compute_kaldi_pitch, + create_dct, + DB_to_amplitude, + detect_pitch_frequency, + edit_distance, + griffinlim, + inverse_spectrogram, + linear_fbanks, + mask_along_axis, + mask_along_axis_iid, + melscale_fbanks, + mu_law_decoding, + mu_law_encoding, + mvdr_weights_rtf, + mvdr_weights_souden, + phase_vocoder, + pitch_shift, + psd, + resample, + rnnt_loss, + rtf_evd, + rtf_power, + sliding_window_cmn, + spectral_centroid, + spectrogram, +) + +__all__ = [ + "amplitude_to_DB", + "compute_deltas", + "compute_kaldi_pitch", + "create_dct", + "melscale_fbanks", + "linear_fbanks", + "DB_to_amplitude", + "detect_pitch_frequency", + "griffinlim", + "mask_along_axis", + "mask_along_axis_iid", + "mu_law_encoding", + "mu_law_decoding", + "phase_vocoder", + "sliding_window_cmn", + "spectrogram", + "inverse_spectrogram", + "spectral_centroid", + "allpass_biquad", + "band_biquad", + "bandpass_biquad", + "bandreject_biquad", + "bass_biquad", + "biquad", + "contrast", + "dither", + "dcshift", + "deemph_biquad", + "equalizer_biquad", + "filtfilt", + "flanger", + 
"gain", + "highpass_biquad", + "lfilter", + "lowpass_biquad", + "overdrive", + "phaser", + "riaa_biquad", + "treble_biquad", + "vad", + "apply_codec", + "resample", + "edit_distance", + "pitch_shift", + "rnnt_loss", + "psd", + "mvdr_weights_souden", + "mvdr_weights_rtf", + "rtf_evd", + "rtf_power", + "apply_beamforming", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/filtering.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..4a08418ed105494b1bc9a13dd10a76bbb595e8d6 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/filtering.py @@ -0,0 +1,1661 @@ +import math +import warnings +from typing import Optional + +import torch +from torch import Tensor + + +def _dB2Linear(x: float) -> float: + return math.exp(x * math.log(10) / 20.0) + + +def _generate_wave_table( + wave_type: str, + data_type: str, + table_size: int, + min: float, + max: float, + phase: float, + device: torch.device, +) -> Tensor: + r"""A helper function for phaser. Generates a table with given parameters. + + Args: + wave_type (str): SINE or TRIANGULAR + data_type (str): desired data_type ( `INT` or `FLOAT` ) + table_size (int): desired table size + min (float): desired min value + max (float): desired max value + phase (float): desired phase + device (torch.device): Torch device on which table must be generated + Returns: + Tensor: A 1D tensor with wave table values + """ + + phase_offset = int(phase / math.pi / 2 * table_size + 0.5) + + t = torch.arange(table_size, device=device, dtype=torch.int32) + + point = (t + phase_offset) % table_size + + d = torch.zeros_like(point, device=device, dtype=torch.float64) + + if wave_type == "SINE": + d = (torch.sin(point.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2 + elif wave_type == "TRIANGLE": + d = point.to(torch.float64) * 2 / table_size + value = torch.div(4 * point, table_size, rounding_mode="floor") + d[value == 0] = d[value == 0] + 0.5 + d[value == 1] = 1.5 - d[value == 1] + d[value == 2] = 1.5 - d[value == 2] + d[value == 3] = d[value == 3] - 1.5 + + d = d * (max - min) + min + + if data_type == "INT": + mask = d < 0 + d[mask] = d[mask] - 0.5 + d[~mask] = d[~mask] + 0.5 + d = d.to(torch.int32) + elif data_type == "FLOAT": + d = d.to(torch.float32) + + return d + + +def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: + r"""Design two-pole all-pass filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform(torch.Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + + alpha = torch.sin(w0) / 2 / Q + + b0 = 1 - alpha + b1 = -2 * torch.cos(w0) + b2 = 1 + alpha + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def band_biquad( + waveform: Tensor, + sample_rate: int, + central_freq: float, + Q: float = 0.707, + noise: bool = False, +) -> Tensor: + r"""Design two-pole band filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). + noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion). + If ``False``, uses mode oriented to pitched audio, i.e. voice, singing, + or instrumental music (Default: ``False``). + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + bw_Hz = central_freq / Q + + a0 = 1.0 + a2 = torch.exp(-2 * math.pi * bw_Hz / sample_rate) + a1 = -4 * a2 / (1 + a2) * torch.cos(w0) + + b0 = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2) + + if noise: + mult = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / b0 + b0 = mult * b0 + + b1 = 0.0 + b2 = 0.0 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bandpass_biquad( + waveform: Tensor, + sample_rate: int, + central_freq: float, + Q: float = 0.707, + const_skirt_gain: bool = False, +) -> Tensor: + r"""Design two-pole band-pass filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q). + If ``False``, uses a constant 0dB peak gain. 
(Default: ``False``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + temp = torch.sin(w0) / 2 if const_skirt_gain else alpha + b0 = temp + b1 = 0.0 + b2 = -temp + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: + r"""Design two-pole band-reject filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + b0 = 1.0 + b1 = -2 * torch.cos(w0) + b2 = 1.0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bass_biquad( + waveform: Tensor, + sample_rate: int, + gain: float, + central_freq: float = 100, + Q: float = 0.707, +) -> Tensor: + r"""Design a bass tone-control effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB. + central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). 
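+
+    Note (added for clarity): ``gain`` in dB is converted to the linear shelf amplitude
+    ``A = 10 ** (gain / 40)``, the usual audio-EQ-cookbook convention, before the biquad
+    coefficients are computed.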
+ + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + A = torch.exp(gain / 40 * math.log(10)) + + temp1 = 2 * torch.sqrt(A) * alpha + temp2 = (A - 1) * torch.cos(w0) + temp3 = (A + 1) * torch.cos(w0) + + b0 = A * ((A + 1) - temp2 + temp1) + b1 = 2 * A * ((A - 1) - temp3) + b2 = A * ((A + 1) - temp2 - temp1) + a0 = (A + 1) + temp2 + temp1 + a1 = -2 * ((A - 1) + temp3) + a2 = (A + 1) + temp2 - temp1 + + return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0) + + +def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor: + r"""Perform a biquad filter of input tensor. Initial conditions set to 0. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + b0 (float or torch.Tensor): numerator coefficient of current input, x[n] + b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1] + b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2] + a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1 + a1 (float or torch.Tensor): denominator coefficient of current output y[n-1] + a2 (float or torch.Tensor): denominator coefficient of current output y[n-2] + + Returns: + Tensor: Waveform with dimension of `(..., time)` + + Reference: + - https://en.wikipedia.org/wiki/Digital_biquad_filter + """ + + device = waveform.device + dtype = waveform.dtype + + b0 = torch.as_tensor(b0, dtype=dtype, device=device).view(1) + b1 = torch.as_tensor(b1, dtype=dtype, device=device).view(1) + b2 = torch.as_tensor(b2, dtype=dtype, device=device).view(1) + a0 = torch.as_tensor(a0, dtype=dtype, device=device).view(1) + a1 = torch.as_tensor(a1, dtype=dtype, device=device).view(1) + a2 = torch.as_tensor(a2, dtype=dtype, device=device).view(1) + + output_waveform = lfilter( + waveform, + torch.cat([a0, a1, a2]), + torch.cat([b0, b1, b2]), + ) + return output_waveform + + +def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor: + r"""Apply contrast effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Comparable with compression, this effect modifies an audio signal to make it sound louder + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + enhancement_amount (float, optional): controls the amount of the enhancement + Allowed range of values for enhancement_amount : 0-100 + Note that enhancement_amount = 0 still gives a significant contrast enhancement + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + if not 0 <= enhancement_amount <= 100: + raise ValueError("Allowed range of values for enhancement_amount : 0-100") + + contrast = enhancement_amount / 750.0 + + temp1 = waveform * (math.pi / 2) + temp2 = contrast * torch.sin(temp1 * 4) + output_waveform = torch.sin(temp1 + temp2) + + return output_waveform + + +def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor: + r"""Apply a DC shift to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + This can be useful to remove a DC offset + (caused perhaps by a hardware problem in the recording chain) from the audio + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + shift (float): indicates the amount to shift the audio + Allowed range of values for shift : -2.0 to +2.0 + limiter_gain (float of None, optional): It is used only on peaks to prevent clipping + It should have a value much less than 1 (e.g. 0.05 or 0.02) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + output_waveform = waveform + limiter_threshold = 0.0 + + if limiter_gain is not None: + limiter_threshold = 1.0 - (abs(shift) - limiter_gain) + + # Note: + # the following index-based update breaks auto-grad support + if limiter_gain is not None and shift > 0: + mask = waveform > limiter_threshold + temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold) + output_waveform[mask] = (temp + limiter_threshold + shift).clamp(max=limiter_threshold) + output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1) + elif limiter_gain is not None and shift < 0: + mask = waveform < -limiter_threshold + temp = (waveform[mask] + limiter_threshold) * limiter_gain / (1 - limiter_threshold) + output_waveform[mask] = (temp - limiter_threshold + shift).clamp(min=-limiter_threshold) + output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1) + else: + output_waveform = (waveform + shift).clamp(min=-1, max=1) + + return output_waveform + + +def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor: + r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000`` + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + + if sample_rate == 44100: + central_freq = 5283 + width_slope = 0.4845 + gain = -9.477 + elif sample_rate == 48000: + central_freq = 5356 + width_slope = 0.479 + gain = -9.62 + else: + raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)") + + w0 = 2 * math.pi * central_freq / sample_rate + A = math.exp(gain / 40.0 * math.log(10)) + alpha = math.sin(w0) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2) + + temp1 = 2 * math.sqrt(A) * alpha + temp2 = (A - 1) * math.cos(w0) + temp3 = (A + 1) * math.cos(w0) + + b0 = A * ((A + 1) + temp2 + temp1) + b1 = -2 * A * ((A - 1) + temp3) + b2 = A * ((A + 1) + temp2 - temp1) + a0 = (A + 1) - temp2 + temp1 + a1 = 2 * ((A - 1) - temp3) + a2 = (A + 1) - temp2 - temp1 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor: + r"""Noise shaping is calculated by error: + error[n] = dithered[n] - original[n] + noise_shaped_waveform[n] = dithered[n] + error[n-1] + """ + wf_shape = waveform.size() + waveform = waveform.reshape(-1, wf_shape[-1]) + + dithered_shape = dithered_waveform.size() + dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1]) + + error = dithered_waveform - waveform + + # add error[n-1] to dithered_waveform[n], so offset the error by 1 index + zeros = torch.zeros(1, dtype=error.dtype, device=error.device) + for index in range(error.size()[0]): + err = error[index] + error_offset = torch.cat((zeros, err)) + error[index] = error_offset[: waveform.size()[1]] + + noise_shaped = dithered_waveform + error + return noise_shaped.reshape(dithered_shape[:-1] + noise_shaped.shape[-1:]) + + +def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor: + r"""Apply a probability distribution function on a waveform. + + Triangular probability density function (TPDF) dither noise has a + triangular distribution; values in the center of the range have a higher + probability of occurring. + + Rectangular probability density function (RPDF) dither noise has a + uniform distribution; any value in the specified range has the same + probability of occurring. + + Gaussian probability density function (GPDF) has a normal distribution. + The relationship of probabilities of results follows a bell-shaped, + or Gaussian curve, typical of dither generated by analog sources. 
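+
+    A usage sketch via the public ``dither`` wrapper defined below (tensor contents are
+    illustrative):
+
+        >>> waveform = torch.rand(2, 16000) * 2 - 1   # values roughly in [-1, 1]
+        >>> out = dither(waveform, density_function="TPDF", noise_shaping=True)
+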
+ Args: + waveform (Tensor): Tensor of audio of dimension (..., time) + density_function (str, optional): The density function of a + continuous random variable (Default: ``"TPDF"``) + Options: Triangular Probability Density Function - `TPDF` + Rectangular Probability Density Function - `RPDF` + Gaussian Probability Density Function - `GPDF` + Returns: + Tensor: waveform dithered with TPDF + """ + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + channel_size = waveform.size()[0] - 1 + time_size = waveform.size()[-1] - 1 + + random_channel = ( + int( + torch.randint( + channel_size, + [ + 1, + ], + ).item() + ) + if channel_size > 0 + else 0 + ) + random_time = ( + int( + torch.randint( + time_size, + [ + 1, + ], + ).item() + ) + if time_size > 0 + else 0 + ) + + number_of_bits = 16 + up_scaling = 2 ** (number_of_bits - 1) - 2 + signal_scaled = waveform * up_scaling + down_scaling = 2 ** (number_of_bits - 1) + + signal_scaled_dis = waveform + if density_function == "RPDF": + RPDF = waveform[random_channel][random_time] - 0.5 + + signal_scaled_dis = signal_scaled + RPDF + elif density_function == "GPDF": + # TODO Replace by distribution code once + # https://github.com/pytorch/pytorch/issues/29843 is resolved + # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample() + + num_rand_variables = 6 + + gaussian = waveform[random_channel][random_time] + for ws in num_rand_variables * [time_size]: + rand_chan = int( + torch.randint( + channel_size, + [ + 1, + ], + ).item() + ) + gaussian += waveform[rand_chan][ + int( + torch.randint( + ws, + [ + 1, + ], + ).item() + ) + ] + + signal_scaled_dis = signal_scaled + gaussian + else: + # dtype needed for https://github.com/pytorch/pytorch/issues/32358 + TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device) + TPDF = TPDF.repeat((channel_size + 1), 1) + signal_scaled_dis = signal_scaled + TPDF + + quantised_signal_scaled = torch.round(signal_scaled_dis) + quantised_signal = quantised_signal_scaled / down_scaling + + # unpack batch + return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:]) + + +def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor: + r"""Apply dither + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Dither increases the perceived dynamic range of audio stored at a + particular bit-depth by eliminating nonlinear truncation distortion + (i.e. adding minimally perceived noise to mask distortion caused by quantization). + + Args: + waveform (Tensor): Tensor of audio of dimension (..., time) + density_function (str, optional): + The density function of a continuous random variable. One of + ``"TPDF"`` (Triangular Probability Density Function), + ``"RPDF"`` (Rectangular Probability Density Function) or + ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``). + noise_shaping (bool, optional): a filtering process that shapes the spectral + energy of quantisation error (Default: ``False``) + + Returns: + Tensor: waveform dithered + """ + dithered = _apply_probability_distribution(waveform, density_function=density_function) + + if noise_shaping: + return _add_noise_shaping(dithered, waveform) + else: + return dithered + + +def equalizer_biquad( + waveform: Tensor, + sample_rate: int, + center_freq: float, + gain: float, + Q: float = 0.707, +) -> Tensor: + r"""Design biquad peaking equalizer filter and perform filtering. 
Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + center_freq (float): filter's central frequency + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + center_freq = torch.as_tensor(center_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * center_freq / sample_rate + A = torch.exp(gain / 40.0 * math.log(10)) + alpha = torch.sin(w0) / 2 / Q + + b0 = 1 + alpha * A + b1 = -2 * torch.cos(w0) + b2 = 1 - alpha * A + a0 = 1 + alpha / A + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha / A + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def filtfilt( + waveform: Tensor, + a_coeffs: Tensor, + b_coeffs: Tensor, + clamp: bool = True, +) -> Tensor: + r"""Apply an IIR filter forward and backward to a waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``. + Must be same size as b_coeffs (pad with 0's as necessary). + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``. + Must be same size as a_coeffs (pad with 0's as necessary). + clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) + + Returns: + Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs`` + are 2D Tensors, or `(..., time)` otherwise. + """ + forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True) + backward_filtered = lfilter( + forward_filtered.flip(-1), + a_coeffs, + b_coeffs, + clamp=clamp, + batching=True, + ).flip(-1) + return backward_filtered + + +def flanger( + waveform: Tensor, + sample_rate: int, + delay: float = 0.0, + depth: float = 2.0, + regen: float = 0.0, + width: float = 71.0, + speed: float = 0.5, + phase: float = 25.0, + modulation: str = "sinusoidal", + interpolation: str = "linear", +) -> Tensor: + r"""Apply a flanger effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., channel, time)` . + Max 4 channels allowed + sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) + delay (float, optional): desired delay in milliseconds(ms) + Allowed range of values are 0 to 30 + depth (float, optional): desired delay depth in milliseconds(ms) + Allowed range of values are 0 to 10 + regen (float, optional): desired regen(feedback gain) in dB + Allowed range of values are -95 to 95 + width (float, optional): desired width(delay gain) in dB + Allowed range of values are 0 to 100 + speed (float, optional): modulation speed in Hz + Allowed range of values are 0.1 to 10 + phase (float, optional): percentage phase-shift for multi-channel + Allowed range of values are 0 to 100 + modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``) + interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation. + (Default: ``linear``) + + Returns: + Tensor: Waveform of dimension of `(..., channel, time)` + + Reference: + - http://sox.sourceforge.net/sox.html + + - Scott Lehman, `Effects Explained`_, + + .. _Effects Explained: + https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html + """ + + if modulation not in ("sinusoidal", "triangular"): + raise ValueError("Only 'sinusoidal' or 'triangular' modulation allowed") + + if interpolation not in ("linear", "quadratic"): + raise ValueError("Only 'linear' or 'quadratic' interpolation allowed") + + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + if actual_shape[-2] > 4: + raise ValueError("Max 4 channels allowed") + + # convert to 3D (batch, channels, time) + waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1]) + + # Scaling + feedback_gain = regen / 100 + delay_gain = width / 100 + channel_phase = phase / 100 + delay_min = delay / 1000 + delay_depth = depth / 1000 + + n_channels = waveform.shape[-2] + + if modulation == "sinusoidal": + wave_type = "SINE" + else: + wave_type = "TRIANGLE" + + # Balance output: + in_gain = 1.0 / (1 + delay_gain) + delay_gain = delay_gain / (1 + delay_gain) + + # Balance feedback loop: + delay_gain = delay_gain * (1 - abs(feedback_gain)) + + delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5) + delay_buf_length = delay_buf_length + 2 + + delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device) + delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device) + + lfo_length = int(sample_rate / speed) + + table_min = math.floor(delay_min * sample_rate + 0.5) + table_max = delay_buf_length - 2.0 + + lfo = _generate_wave_table( + wave_type=wave_type, + data_type="FLOAT", + table_size=lfo_length, + min=float(table_min), + max=float(table_max), + phase=3 * math.pi / 2, + device=device, + ) + + output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device) + + delay_buf_pos = 0 + lfo_pos = 0 + channel_idxs = torch.arange(0, n_channels, device=device) + + for i in range(waveform.shape[-1]): + + delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length + + cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64) + delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length] + frac_delay = torch.frac(delay_tensor) + delay_tensor = torch.floor(delay_tensor) + + int_delay = delay_tensor.to(torch.int64) + + temp = waveform[:, :, i] + + delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain + + delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + 
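+ # delayed_0 (above) and delayed_1 / delayed_2 (below) are consecutive integer-delay + # taps read from the circular delay buffer; they are blended with frac_delay to + # realize the fractional, LFO-modulated delay (linear or quadratic interpolation). +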
int_delay = int_delay + 1 + + delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + int_delay = int_delay + 1 + + if interpolation == "linear": + delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay + else: + delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + int_delay = int_delay + 1 + + delayed_2 = delayed_2 - delayed_0 + delayed_1 = delayed_1 - delayed_0 + a = delayed_2 * 0.5 - delayed_1 + b = delayed_1 * 2 - delayed_2 * 0.5 + + delayed = delayed_0 + (a * frac_delay + b) * frac_delay + + delay_last = delayed + output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain + + lfo_pos = (lfo_pos + 1) % lfo_length + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor: + r"""Apply amplification or attenuation to the whole waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``). + + Returns: + Tensor: the whole waveform amplified by gain_db. + """ + if gain_db == 0: + return waveform + + ratio = 10 ** (gain_db / 20) + + return waveform * ratio + + +def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: + r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + cutoff_freq (float or torch.Tensor): filter cutoff frequency + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * cutoff_freq / sample_rate + alpha = torch.sin(w0) / 2.0 / Q + + b0 = (1 + torch.cos(w0)) / 2 + b1 = -1 - torch.cos(w0) + b2 = b0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor): + n_order = a_coeffs_flipped.size(1) + a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2) + for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)): + windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order] + o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t() + padded_output_waveform[:, :, i_sample + n_order - 1] = o0 + + +try: + _lfilter_core_cpu_loop = torch.ops.torchaudio._lfilter_core_loop +except RuntimeError as err: + assert str(err) == "No such operator torchaudio::_lfilter_core_loop" + _lfilter_core_cpu_loop = _lfilter_core_generic_loop + + +def _lfilter_core( + waveform: Tensor, + a_coeffs: Tensor, + b_coeffs: Tensor, +) -> Tensor: + + assert a_coeffs.size() == b_coeffs.size() + assert len(waveform.size()) == 3 + assert waveform.device == a_coeffs.device + assert b_coeffs.device == a_coeffs.device + + n_batch, n_channel, n_sample = waveform.size() + n_order = a_coeffs.size(1) + assert n_order > 0 + + # Pad the input and create output + + padded_waveform 
= torch.nn.functional.pad(waveform, [n_order - 1, 0]) + padded_output_waveform = torch.zeros_like(padded_waveform) + + # Set up the coefficients matrix + # Flip coefficients' order + a_coeffs_flipped = a_coeffs.flip(1) + b_coeffs_flipped = b_coeffs.flip(1) + + # calculate windowed_input_signal in parallel using convolution + input_signal_windows = torch.nn.functional.conv1d(padded_waveform, b_coeffs_flipped.unsqueeze(1), groups=n_channel) + + input_signal_windows.div_(a_coeffs[:, :1]) + a_coeffs_flipped.div_(a_coeffs[:, :1]) + + if ( + input_signal_windows.device == torch.device("cpu") + and a_coeffs_flipped.device == torch.device("cpu") + and padded_output_waveform.device == torch.device("cpu") + ): + _lfilter_core_cpu_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) + else: + _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) + + output = padded_output_waveform[:, :, n_order - 1 :] + return output + + +try: + _lfilter = torch.ops.torchaudio._lfilter +except RuntimeError as err: + assert str(err) == "No such operator torchaudio::_lfilter" + _lfilter = _lfilter_core + + +def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor: + r"""Perform an IIR filter by evaluating difference equation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + To avoid numerical problems, small filter order is preferred. + Using double precision could also minimize numerical precision errors. + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. + Must be same size as b_coeffs (pad with 0's as necessary). + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. + Must be same size as a_coeffs (pad with 0's as necessary). + clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) + batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at + least 2D, and the size of second axis from last should equals to ``num_filters``. + The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :], + a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``) + + Returns: + Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs`` + are 2D Tensors, or `(..., time)` otherwise. 
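+ + Example (minimal sketch; the coefficients below form an arbitrary stable first-order filter): + >>> waveform = torch.rand(1, 1000) * 2 - 1 # random signal normalized to [-1, 1] + >>> b_coeffs = torch.tensor([0.5, 0.5]) # numerator [b0, b1] + >>> a_coeffs = torch.tensor([1.0, -0.2]) # denominator [a0, a1] + >>> filtered = lfilter(waveform, a_coeffs, b_coeffs) + >>> filtered.shape + torch.Size([1, 1000])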
+ """ + assert a_coeffs.size() == b_coeffs.size() + assert a_coeffs.ndim <= 2 + + if a_coeffs.ndim > 1: + if batching: + assert waveform.ndim > 1 + assert waveform.shape[-2] == a_coeffs.shape[0] + else: + waveform = torch.stack([waveform] * a_coeffs.shape[0], -2) + else: + a_coeffs = a_coeffs.unsqueeze(0) + b_coeffs = b_coeffs.unsqueeze(0) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1]) + output = _lfilter(waveform, a_coeffs, b_coeffs) + + if clamp: + output = torch.clamp(output, min=-1.0, max=1.0) + + # unpack batch + output = output.reshape(shape[:-1] + output.shape[-1:]) + + return output + + +def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: + r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + cutoff_freq (float or torch.Tensor): filter cutoff frequency + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * cutoff_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + b0 = (1 - torch.cos(w0)) / 2 + b1 = 1 - torch.cos(w0) + b2 = b0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _overdrive_core_loop_generic( + waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor +): + for i in range(waveform.shape[-1]): + last_out = temp[:, i] - last_in + 0.995 * last_out + last_in = temp[:, i] + output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75 + + +try: + _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop +except RuntimeError as err: + assert str(err) == "No such operator torchaudio::_overdrive_core_loop" + _overdrive_core_loop_cpu = _overdrive_core_loop_generic + + +def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor: + r"""Apply a overdrive effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This effect applies a non linear distortion to the audio signal. 
+ + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + gain (float, optional): desired gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 100 + colour (float, optional): controls the amount of even harmonic content in the over-driven output + Allowed range of values are 0 to 100 + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + # convert to 2D (..,time) + waveform = waveform.view(-1, actual_shape[-1]) + + gain = _dB2Linear(gain) + colour = colour / 200 + last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device) + last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device) + + temp = waveform * gain + colour + + mask1 = temp < -1 + temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device) + # Wrapping the constant with Tensor is required for Torchscript + + mask2 = temp > 1 + temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device) + + mask3 = ~mask1 & ~mask2 + temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3) + + output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device) + + # Uses CPU optimized loop function if available for CPU device + if device == torch.device("cpu"): + _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform) + else: + _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform) + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def phaser( + waveform: Tensor, + sample_rate: int, + gain_in: float = 0.4, + gain_out: float = 0.74, + delay_ms: float = 3.0, + decay: float = 0.4, + mod_speed: float = 0.5, + sinusoidal: bool = True, +) -> Tensor: + r"""Apply a phasing effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain_in (float, optional): desired input gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 1 + gain_out (float, optional): desired output gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 1e9 + delay_ms (float, optional): desired delay in milliseconds + Allowed range of values are 0 to 5.0 + decay (float, optional): desired decay relative to gain-in + Allowed range of values are 0 to 0.99 + mod_speed (float, optional): modulation speed in Hz + Allowed range of values are 0.1 to 2 + sinusoidal (bool, optional): If ``True``, uses sinusoidal modulation (preferable for multiple instruments) + If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect) + (Default: ``True``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - Scott Lehman, `Effects Explained`_. + + .. 
_Effects Explained: + https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html + """ + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + # convert to 2D (channels,time) + waveform = waveform.view(-1, actual_shape[-1]) + + delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5) + delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device) + + mod_buf_len = int(sample_rate / mod_speed + 0.5) + + if sinusoidal: + wave_type = "SINE" + else: + wave_type = "TRIANGLE" + + mod_buf = _generate_wave_table( + wave_type=wave_type, + data_type="INT", + table_size=mod_buf_len, + min=1.0, + max=float(delay_buf_len), + phase=math.pi / 2, + device=device, + ) + + delay_pos = 0 + mod_pos = 0 + + output_waveform_pre_gain_list = [] + waveform = waveform * gain_in + delay_buf = delay_buf * decay + waveform_list = [waveform[:, i] for i in range(waveform.size(1))] + delay_buf_list = [delay_buf[:, i] for i in range(delay_buf.size(1))] + mod_buf_list = [mod_buf[i] for i in range(mod_buf.size(0))] + + for i in range(waveform.shape[-1]): + idx = int((delay_pos + mod_buf_list[mod_pos]) % delay_buf_len) + mod_pos = (mod_pos + 1) % mod_buf_len + delay_pos = (delay_pos + 1) % delay_buf_len + temp = (waveform_list[i]) + (delay_buf_list[idx]) + delay_buf_list[delay_pos] = temp * decay + output_waveform_pre_gain_list.append(temp) + + output_waveform = torch.stack(output_waveform_pre_gain_list, dim=1).to(dtype=dtype, device=device) + output_waveform.mul_(gain_out) + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor: + r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz). 
+ Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000`` + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + + if sample_rate == 44100: + zeros = [-0.2014898, 0.9233820] + poles = [0.7083149, 0.9924091] + + elif sample_rate == 48000: + zeros = [-0.1766069, 0.9321590] + poles = [0.7396325, 0.9931330] + + elif sample_rate == 88200: + zeros = [-0.1168735, 0.9648312] + poles = [0.8590646, 0.9964002] + + elif sample_rate == 96000: + zeros = [-0.1141486, 0.9676817] + poles = [0.8699137, 0.9966946] + + else: + raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k") + + # polynomial coefficients with roots zeros[0] and zeros[1] + b0 = 1.0 + b1 = -(zeros[0] + zeros[1]) + b2 = zeros[0] * zeros[1] + + # polynomial coefficients with roots poles[0] and poles[1] + a0 = 1.0 + a1 = -(poles[0] + poles[1]) + a2 = poles[0] * poles[1] + + # Normalize to 0dB at 1kHz + y = 2 * math.pi * 1000 / sample_rate + b_re = b0 + b1 * math.cos(-y) + b2 * math.cos(-2 * y) + a_re = a0 + a1 * math.cos(-y) + a2 * math.cos(-2 * y) + b_im = b1 * math.sin(-y) + b2 * math.sin(-2 * y) + a_im = a1 * math.sin(-y) + a2 * math.sin(-2 * y) + g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2)) + + b0 *= g + b1 *= g + b2 *= g + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def treble_biquad( + waveform: Tensor, + sample_rate: int, + gain: float, + central_freq: float = 3000, + Q: float = 0.707, +) -> Tensor: + r"""Design a treble tone-control effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB. + central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). 
+ + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + A = torch.exp(gain / 40 * math.log(10)) + + temp1 = 2 * torch.sqrt(A) * alpha + temp2 = (A - 1) * torch.cos(w0) + temp3 = (A + 1) * torch.cos(w0) + + b0 = A * ((A + 1) + temp2 + temp1) + b1 = -2 * A * ((A - 1) + temp3) + b2 = A * ((A + 1) + temp2 - temp1) + a0 = (A + 1) - temp2 + temp1 + a1 = 2 * ((A - 1) - temp3) + a2 = (A + 1) - temp2 - temp1 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _measure( + measure_len_ws: int, + samples: Tensor, + spectrum: Tensor, + noise_spectrum: Tensor, + spectrum_window: Tensor, + spectrum_start: int, + spectrum_end: int, + cepstrum_window: Tensor, + cepstrum_start: int, + cepstrum_end: int, + noise_reduction_amount: float, + measure_smooth_time_mult: float, + noise_up_time_mult: float, + noise_down_time_mult: float, + index_ns: int, + boot_count: int, +) -> float: + + assert spectrum.size()[-1] == noise_spectrum.size()[-1] + + samplesLen_ns = samples.size()[-1] + dft_len_ws = spectrum.size()[-1] + + dftBuf = torch.zeros(dft_len_ws) + + _index_ns = torch.tensor([index_ns] + [(index_ns + i) % samplesLen_ns for i in range(1, measure_len_ws)]) + dftBuf[:measure_len_ws] = samples[_index_ns] * spectrum_window[:measure_len_ws] + + # memset(c->dftBuf + i, 0, (p->dft_len_ws - i) * sizeof(*c->dftBuf)); + dftBuf[measure_len_ws:dft_len_ws].zero_() + + # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf); + _dftBuf = torch.fft.rfft(dftBuf) + + # memset(c->dftBuf, 0, p->spectrum_start * sizeof(*c->dftBuf)); + _dftBuf[:spectrum_start].zero_() + + mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult + + _d = _dftBuf[spectrum_start:spectrum_end].abs() + spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult)) + _d = spectrum[spectrum_start:spectrum_end] ** 2 + + _zeros = torch.zeros(spectrum_end - spectrum_start) + _mult = ( + _zeros + if boot_count >= 0 + else torch.where( + _d > noise_spectrum[spectrum_start:spectrum_end], + torch.tensor(noise_up_time_mult), # if + torch.tensor(noise_down_time_mult), # else + ) + ) + + noise_spectrum[spectrum_start:spectrum_end].mul_(_mult).add_(_d * (1 - _mult)) + _d = torch.sqrt( + torch.max( + _zeros, + _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end], + ) + ) + + _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1) + _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window + _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_() + + # lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf); + _cepstrum_Buf = torch.fft.rfft(_cepstrum_Buf) + + result: float = float(torch.sum(_cepstrum_Buf[cepstrum_start:cepstrum_end].abs().pow(2))) + result = math.log(result / (cepstrum_end - cepstrum_start)) if result > 0 else -math.inf + return max(0, 21 + result) + + +def vad( + waveform: Tensor, + sample_rate: int, + trigger_level: float = 7.0, + trigger_time: float = 0.25, + search_time: float = 1.0, + allowed_gap: float = 0.25, + pre_trigger_time: float = 0.0, + # Fine-tuning parameters + boot_time: float = 0.35, + noise_up_time: float = 0.1, + 
noise_down_time: float = 0.01, + noise_reduction_amount: float = 1.35, + measure_freq: float = 20.0, + measure_duration: Optional[float] = None, + measure_smooth_time: float = 0.4, + hp_filter_freq: float = 50.0, + lp_filter_freq: float = 6000.0, + hp_lifter_freq: float = 150.0, + lp_lifter_freq: float = 2000.0, +) -> Tensor: + r"""Voice Activity Detector. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Attempts to trim silence and quiet background sounds from the ends of recordings of speech. + The algorithm currently uses a simple cepstral power measurement to detect voice, + so may be fooled by other things, especially music. + + The effect can trim only from the front of the audio, + so in order to trim from the back, the reverse effect must also be used. + + Args: + waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)` + Tensor of shape `(channels, time)` is treated as a multi-channel recording + of the same event and the resulting output will be trimmed to the earliest + voice activity in any channel. + sample_rate (int): Sample rate of audio signal. + trigger_level (float, optional): The measurement level used to trigger activity detection. + This may need to be changed depending on the noise level, signal level, + and other characteristics of the input audio. (Default: 7.0) + trigger_time (float, optional): The time constant (in seconds) + used to help ignore short bursts of sound. (Default: 0.25) + search_time (float, optional): The amount of audio (in seconds) + to search for quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 1.0) + allowed_gap (float, optional): The allowed gap (in seconds) between + quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 0.25) + pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve + before the trigger point and any found quieter/shorter bursts. (Default: 0.0) + boot_time (float, optional): The algorithm (internally) uses adaptive noise + estimation/reduction in order to detect the start of the wanted audio. + This option sets the time for the initial noise estimate. (Default: 0.35) + noise_up_time (float, optional): Time constant used by the adaptive noise estimator + for when the noise level is increasing. (Default: 0.1) + noise_down_time (float, optional): Time constant used by the adaptive noise estimator + for when the noise level is decreasing. (Default: 0.01) + noise_reduction_amount (float, optional): Amount of noise reduction to use in + the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35) + measure_freq (float, optional): Frequency of the algorithm’s + processing/measurements. (Default: 20.0) + measure_duration (float, optional): Measurement duration. + (Default: Twice the measurement period; i.e. with overlap.) + measure_smooth_time (float, optional): Time constant used to smooth + spectral measurements. (Default: 0.4) + hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied + at the input to the detector algorithm. (Default: 50.0) + lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied + at the input to the detector algorithm. (Default: 6000.0) + hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used + in the detector algorithm. (Default: 150.0) + lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used + in the detector algorithm. 
(Default: 2000.0) + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`. + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + if waveform.ndim > 2: + warnings.warn( + "Expected input tensor dimension of 1 for single channel" + f" or 2 for multi-channel. Got {waveform.ndim} instead. " + "Batch semantics is not supported. " + "Please refer to https://github.com/pytorch/audio/issues/1348" + " and https://github.com/pytorch/audio/issues/1468." + ) + + measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration + + measure_len_ws = int(sample_rate * measure_duration + 0.5) + measure_len_ns = measure_len_ws + # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1); + dft_len_ws = 16 + while dft_len_ws < measure_len_ws: + dft_len_ws *= 2 + + measure_period_ns = int(sample_rate / measure_freq + 0.5) + measures_len = math.ceil(search_time * measure_freq) + search_pre_trigger_len_ns = measures_len * measure_period_ns + gap_len = int(allowed_gap * measure_freq + 0.5) + + fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5) + samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns + + spectrum_window = torch.zeros(measure_len_ws) + for i in range(measure_len_ws): + # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32) + spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws)) + # lsx_apply_hann(spectrum_window, (int)measure_len_ws); + spectrum_window *= torch.hann_window(measure_len_ws, dtype=torch.float) + + spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5) + spectrum_start: int = max(spectrum_start, 1) + spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5) + spectrum_end: int = min(spectrum_end, dft_len_ws // 2) + + cepstrum_window = torch.zeros(spectrum_end - spectrum_start) + for i in range(spectrum_end - spectrum_start): + cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start) + # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start)); + cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, dtype=torch.float) + + cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq) + cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq) + cepstrum_end = min(cepstrum_end, dft_len_ws // 4) + + assert cepstrum_end > cepstrum_start + + noise_up_time_mult = math.exp(-1.0 / (noise_up_time * measure_freq)) + noise_down_time_mult = math.exp(-1.0 / (noise_down_time * measure_freq)) + measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq)) + trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq)) + + boot_count_max = int(boot_time * measure_freq - 0.5) + measure_timer_ns = measure_len_ns + boot_count = measures_index = flushedLen_ns = samplesIndex_ns = 0 + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + n_channels, ilen = waveform.size() + + mean_meas = torch.zeros(n_channels) + samples = torch.zeros(n_channels, samplesLen_ns) + spectrum = torch.zeros(n_channels, dft_len_ws) + noise_spectrum = torch.zeros(n_channels, dft_len_ws) + measures = torch.zeros(n_channels, measures_len) + + has_triggered: bool = False + num_measures_to_flush: int = 0 + pos: int = 0 + + while pos < ilen and not has_triggered: + measure_timer_ns -= 1 + for i in range(n_channels): + samples[i, samplesIndex_ns] = waveform[i, pos] + # if (!p->measure_timer_ns) { + if measure_timer_ns == 0: + index_ns: int = (samplesIndex_ns + 
samplesLen_ns - measure_len_ns) % samplesLen_ns + meas: float = _measure( + measure_len_ws=measure_len_ws, + samples=samples[i], + spectrum=spectrum[i], + noise_spectrum=noise_spectrum[i], + spectrum_window=spectrum_window, + spectrum_start=spectrum_start, + spectrum_end=spectrum_end, + cepstrum_window=cepstrum_window, + cepstrum_start=cepstrum_start, + cepstrum_end=cepstrum_end, + noise_reduction_amount=noise_reduction_amount, + measure_smooth_time_mult=measure_smooth_time_mult, + noise_up_time_mult=noise_up_time_mult, + noise_down_time_mult=noise_down_time_mult, + index_ns=index_ns, + boot_count=boot_count, + ) + measures[i, measures_index] = meas + mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult) + + has_triggered = has_triggered or (mean_meas[i] >= trigger_level) + if has_triggered: + n: int = measures_len + k: int = measures_index + jTrigger: int = n + jZero: int = n + j: int = 0 + + for j in range(n): + if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len): + jZero = jTrigger = j + elif (measures[i, k] == 0) and (jTrigger >= jZero): + jZero = j + k = (k + n - 1) % n + j = min(j, jZero) + # num_measures_to_flush = range_limit(j, num_measures_to_flush, n); + num_measures_to_flush = min(max(num_measures_to_flush, j), n) + # end if has_triggered + # end if (measure_timer_ns == 0): + # end for + samplesIndex_ns += 1 + pos += 1 + # end while + if samplesIndex_ns == samplesLen_ns: + samplesIndex_ns = 0 + if measure_timer_ns == 0: + measure_timer_ns = measure_period_ns + measures_index += 1 + measures_index = measures_index % measures_len + if boot_count >= 0: + boot_count = -1 if boot_count == boot_count_max else boot_count + 1 + + if has_triggered: + flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns + samplesIndex_ns = (samplesIndex_ns + flushedLen_ns) % samplesLen_ns + + res = waveform[:, pos - samplesLen_ns + flushedLen_ns :] + # unpack batch + return res.view(shape[:-1] + res.shape[-1:]) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/functional.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..665bf8c1f4f87b154cb96552dec404206352e75a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/functional/functional.py @@ -0,0 +1,2162 @@ +# -*- coding: utf-8 -*- + +import io +import math +import warnings +from collections.abc import Sequence +from typing import Optional, Tuple, Union, List + +import torch +import torchaudio +from torch import Tensor +from torchaudio._internal import module_utils as _mod_utils + +__all__ = [ + "spectrogram", + "inverse_spectrogram", + "griffinlim", + "amplitude_to_DB", + "DB_to_amplitude", + "compute_deltas", + "compute_kaldi_pitch", + "melscale_fbanks", + "linear_fbanks", + "create_dct", + "compute_deltas", + "detect_pitch_frequency", + "DB_to_amplitude", + "mu_law_encoding", + "mu_law_decoding", + "phase_vocoder", + "mask_along_axis", + "mask_along_axis_iid", + "sliding_window_cmn", + "spectral_centroid", + "apply_codec", + "resample", + "edit_distance", + "pitch_shift", + "rnnt_loss", + "psd", + "mvdr_weights_souden", + "mvdr_weights_rtf", + "rtf_evd", + "rtf_power", + "apply_beamforming", +] + + +def spectrogram( + waveform: Tensor, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: Optional[float], + 
normalized: bool, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, +) -> Tensor: + r"""Create a spectrogram or a batch of spectrograms from a raw audio signal. + The spectrogram can be either magnitude-only or complex. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + power (float or None): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. + If None, then the complex spectrum is returned instead. + normalized (bool): Whether to normalize by magnitude after stft + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. Default: ``True`` + return_complex (bool, optional): + Deprecated and not used. + + Returns: + Tensor: Dimension `(..., freq, time)`, freq is + ``n_fft // 2 + 1`` and ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.functional.spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + if pad > 0: + # TODO add "with torch.no_grad():" back when JIT supports it + waveform = torch.nn.functional.pad(waveform, (pad, pad), "constant") + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=False, + onesided=onesided, + return_complex=True, + ) + + # unpack batch + spec_f = spec_f.reshape(shape[:-1] + spec_f.shape[-2:]) + + if normalized: + spec_f /= window.pow(2.0).sum().sqrt() + if power is not None: + if power == 1.0: + return spec_f.abs() + return spec_f.abs().pow(power) + return spec_f + + +def inverse_spectrogram( + spectrogram: Tensor, + length: Optional[int], + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + normalized: bool, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, +) -> Tensor: + r"""Create an inverse spectrogram or a batch of inverse spectrograms from the provided + complex-valued spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time). + length (int or None): The output length of the waveform. + pad (int): Two sided padding of signal. It is only effective when ``length`` is provided. 
+ window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + normalized (bool): Whether the stft output was normalized by magnitude + center (bool, optional): whether the waveform was padded on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. This parameter is provided for compatibility with the + spectrogram function and is not used. Default: ``"reflect"`` + onesided (bool, optional): controls whether spectrogram was done in onesided mode. + Default: ``True`` + + Returns: + Tensor: Dimension `(..., time)`. Least squares estimation of the original signal. + """ + + if not spectrogram.is_complex(): + raise ValueError("Expected `spectrogram` to be complex dtype.") + + if normalized: + spectrogram = spectrogram * window.pow(2.0).sum().sqrt() + + # pack batch + shape = spectrogram.size() + spectrogram = spectrogram.reshape(-1, shape[-2], shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + waveform = torch.istft( + input=spectrogram, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + normalized=False, + onesided=onesided, + length=length + 2 * pad if length is not None else None, + return_complex=False, + ) + + if length is not None and pad > 0: + # remove padding from front and back + waveform = waveform[:, pad:-pad] + + # unpack batch + waveform = waveform.reshape(shape[:-2] + waveform.shape[-1:]) + + return waveform + + +def _get_complex_dtype(real_dtype: torch.dtype): + if real_dtype == torch.double: + return torch.cdouble + if real_dtype == torch.float: + return torch.cfloat + if real_dtype == torch.half: + return torch.complex32 + raise ValueError(f"Unexpected dtype {real_dtype}") + + +def griffinlim( + specgram: Tensor, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: float, + n_iter: int, + momentum: float, + length: Optional[int], + rand_init: bool, +) -> Tensor: + r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Implementation ported from + *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`] + and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`]. + + Args: + specgram (Tensor): A magnitude-only STFT spectrogram of dimension `(..., freq, frames)` + where freq is ``n_fft // 2 + 1``. + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins + hop_length (int): Length of hop between STFT windows. ( + Default: ``win_length // 2``) + win_length (int): Window size. (Default: ``n_fft``) + power (float): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. + n_iter (int): Number of iteration for phase recovery process. + momentum (float): The momentum parameter for fast Griffin-Lim. + Setting this to 0 recovers the original Griffin-Lim method. + Values near 1 can lead to faster convergence, but above 1 may not converge. + length (int or None): Array length of the expected output. 
+ rand_init (bool): Initializes phase randomly if True, to zero otherwise. + + Returns: + Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given. + """ + assert momentum < 1, "momentum={} > 1 can be unstable".format(momentum) + assert momentum >= 0, "momentum={} < 0".format(momentum) + + # pack batch + shape = specgram.size() + specgram = specgram.reshape([-1] + list(shape[-2:])) + + specgram = specgram.pow(1 / power) + + # initialize the phase + if rand_init: + angles = torch.rand(specgram.size(), dtype=_get_complex_dtype(specgram.dtype), device=specgram.device) + else: + angles = torch.full(specgram.size(), 1, dtype=_get_complex_dtype(specgram.dtype), device=specgram.device) + + # And initialize the previous iterate to 0 + tprev = torch.tensor(0.0, dtype=specgram.dtype, device=specgram.device) + for _ in range(n_iter): + # Invert with our current estimate of the phases + inverse = torch.istft( + specgram * angles, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=length + ) + + # Rebuild the spectrogram + rebuilt = torch.stft( + input=inverse, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + + # Update our phase estimates + angles = rebuilt + if momentum: + angles = angles - tprev.mul_(momentum / (1 + momentum)) + angles = angles.div(angles.abs().add(1e-16)) + + # Store the previous iterate + tprev = rebuilt + + # Return the final phase estimates + waveform = torch.istft( + specgram * angles, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=length + ) + + # unpack batch + waveform = waveform.reshape(shape[:-2] + waveform.shape[-1:]) + + return waveform + + +def amplitude_to_DB( + x: Tensor, multiplier: float, amin: float, db_multiplier: float, top_db: Optional[float] = None +) -> Tensor: + r"""Turn a spectrogram from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The output of each tensor in a batch depends on the maximum value of that tensor, + and so may return different values for an audio clip split into snippets vs. a full clip. + + Args: + + x (Tensor): Input spectrogram(s) before being converted to decibel scale. Input should take + the form `(..., freq, time)`. Batched inputs should include a channel dimension and + have the form `(batch, channel, freq, time)`. + multiplier (float): Use 10. for power and 20. for amplitude + amin (float): Number to clamp ``x`` + db_multiplier (float): Log10(max(reference value and amin)) + top_db (float or None, optional): Minimum negative cut-off in decibels. A reasonable number + is 80. (Default: ``None``) + + Returns: + Tensor: Output tensor in decibel scale + """ + x_db = multiplier * torch.log10(torch.clamp(x, min=amin)) + x_db -= multiplier * db_multiplier + + if top_db is not None: + # Expand batch + shape = x_db.size() + packed_channels = shape[-3] if x_db.dim() > 2 else 1 + x_db = x_db.reshape(-1, packed_channels, shape[-2], shape[-1]) + + x_db = torch.max(x_db, (x_db.amax(dim=(-3, -2, -1)) - top_db).view(-1, 1, 1, 1)) + + # Repack batch + x_db = x_db.reshape(shape) + + return x_db + + +def DB_to_amplitude(x: Tensor, ref: float, power: float) -> Tensor: + r"""Turn a tensor from the decibel scale to the power/amplitude scale. + + .. devices:: CPU CUDA + + .. 
properties:: TorchScript + + Args: + x (Tensor): Input tensor before being converted to power/amplitude scale. + ref (float): Reference which the output will be scaled by. + power (float): If power equals 1, will compute DB to power. If 0.5, will compute DB to amplitude. + + Returns: + Tensor: Output tensor in power/amplitude scale. + """ + return ref * torch.pow(torch.pow(10.0, 0.1 * x), power) + + +def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float: + r"""Convert Hz to Mels. + + Args: + freqs (float): Frequencies in Hz + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + mels (float): Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz) / logstep + + return mels + + +def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor: + """Convert mel bin numbers to frequencies. + + Args: + mels (Tensor): Mel frequencies + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + freqs (Tensor): Mels converted in Hz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + +def _create_triangular_filterbank( + all_freqs: Tensor, + f_pts: Tensor, +) -> Tensor: + """Create a triangular filter bank. + + Args: + all_freqs (Tensor): STFT freq points of size (`n_freqs`). + f_pts (Tensor): Filter mid points of size (`n_filter`). + + Returns: + fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`). + """ + # Adopted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = torch.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = torch.max(zero, torch.min(down_slopes, up_slopes)) + + return fb + + +def melscale_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> Tensor: + r"""Create a frequency bin conversion matrix. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. 
image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A * melscale_fbanks(A.size(-1), ...)``. + + """ + + if norm is not None and norm != "slaney": + raise ValueError("norm must be one of None or 'slaney'") + + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = _hz_to_mel(f_min, mel_scale=mel_scale) + m_max = _hz_to_mel(f_max, mel_scale=mel_scale) + + m_pts = torch.linspace(m_min, m_max, n_mels + 2) + f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + fb *= enorm.unsqueeze(0) + + if (fb.max(dim=0).values == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return fb + + +def linear_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_filter: int, + sample_rate: int, +) -> Tensor: + r"""Creates a linear triangular filterbank. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/lin_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_filter (int): Number of (linear) triangular filter + sample_rate (int): Sample rate of the audio waveform + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_filter``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A * linear_fbanks(A.size(-1), ...)``. + """ + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # filter mid-points + f_pts = torch.linspace(f_min, f_max, n_filter + 2) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + return fb + + +def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: + r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``), + normalized depending on norm. + + .. devices:: CPU + + .. 
properties:: TorchScript + + Args: + n_mfcc (int): Number of mfc coefficients to retain + n_mels (int): Number of mel filterbanks + norm (str or None): Norm to use (either 'ortho' or None) + + Returns: + Tensor: The transformation matrix, to be right-multiplied to + row-wise data of size (``n_mels``, ``n_mfcc``). + """ + # http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II + n = torch.arange(float(n_mels)) + k = torch.arange(float(n_mfcc)).unsqueeze(1) + dct = torch.cos(math.pi / float(n_mels) * (n + 0.5) * k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 + else: + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.t() + + +def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor: + r"""Encode signal based on mu-law companding. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This algorithm expects the signal has been scaled to between -1 and 1 and + returns a signal encoded with values from 0 to quantization_channels - 1. + + Args: + x (Tensor): Input tensor + quantization_channels (int): Number of channels + + Returns: + Tensor: Input after mu-law encoding + """ + mu = quantization_channels - 1.0 + if not x.is_floating_point(): + warnings.warn( + "The input Tensor must be of floating type. \ + This will be an error in the v0.12 release." + ) + x = x.to(torch.float) + mu = torch.tensor(mu, dtype=x.dtype) + x_mu = torch.sign(x) * torch.log1p(mu * torch.abs(x)) / torch.log1p(mu) + x_mu = ((x_mu + 1) / 2 * mu + 0.5).to(torch.int64) + return x_mu + + +def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor: + r"""Decode mu-law encoded signal. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This expects an input with values between 0 and quantization_channels - 1 + and returns a signal scaled between -1 and 1. + + Args: + x_mu (Tensor): Input tensor + quantization_channels (int): Number of channels + + Returns: + Tensor: Input after mu-law decoding + """ + mu = quantization_channels - 1.0 + if not x_mu.is_floating_point(): + x_mu = x_mu.to(torch.float) + mu = torch.tensor(mu, dtype=x_mu.dtype) + x = ((x_mu) / mu) * 2 - 1.0 + x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.0) / mu + return x + + +def phase_vocoder(complex_specgrams: Tensor, rate: float, phase_advance: Tensor) -> Tensor: + r"""Given a STFT tensor, speed up in time without modifying pitch by a factor of ``rate``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + complex_specgrams (Tensor): + A tensor of dimension `(..., freq, num_frame)` with complex dtype. + rate (float): Speed-up factor + phase_advance (Tensor): Expected phase advance in each bin. Dimension of `(freq, 1)` + + Returns: + Tensor: + Stretched spectrogram. The resulting tensor is of the same dtype as the input + spectrogram, but the number of frames is changed to ``ceil(num_frame / rate)``. 
+ + Example + >>> freq, hop_length = 1025, 512 + >>> # (channel, freq, time) + >>> complex_specgrams = torch.randn(2, freq, 300, dtype=torch.cfloat) + >>> rate = 1.3 # Speed up by 30% + >>> phase_advance = torch.linspace( + >>> 0, math.pi * hop_length, freq)[..., None] + >>> x = phase_vocoder(complex_specgrams, rate, phase_advance) + >>> x.shape # with 231 == ceil(300 / 1.3) + torch.Size([2, 1025, 231]) + """ + if rate == 1.0: + return complex_specgrams + + # pack batch + shape = complex_specgrams.size() + complex_specgrams = complex_specgrams.reshape([-1] + list(shape[-2:])) + + # Figures out the corresponding real dtype, i.e. complex128 -> float64, complex64 -> float32 + # Note torch.real is a view so it does not incur any memory copy. + real_dtype = torch.real(complex_specgrams).dtype + time_steps = torch.arange(0, complex_specgrams.size(-1), rate, device=complex_specgrams.device, dtype=real_dtype) + + alphas = time_steps % 1.0 + phase_0 = complex_specgrams[..., :1].angle() + + # Time Padding + complex_specgrams = torch.nn.functional.pad(complex_specgrams, [0, 2]) + + # (new_bins, freq, 2) + complex_specgrams_0 = complex_specgrams.index_select(-1, time_steps.long()) + complex_specgrams_1 = complex_specgrams.index_select(-1, (time_steps + 1).long()) + + angle_0 = complex_specgrams_0.angle() + angle_1 = complex_specgrams_1.angle() + + norm_0 = complex_specgrams_0.abs() + norm_1 = complex_specgrams_1.abs() + + phase = angle_1 - angle_0 - phase_advance + phase = phase - 2 * math.pi * torch.round(phase / (2 * math.pi)) + + # Compute Phase Accum + phase = phase + phase_advance + phase = torch.cat([phase_0, phase[..., :-1]], dim=-1) + phase_acc = torch.cumsum(phase, -1) + + mag = alphas * norm_1 + (1 - alphas) * norm_0 + + complex_specgrams_stretch = torch.polar(mag, phase_acc) + + # unpack batch + complex_specgrams_stretch = complex_specgrams_stretch.reshape(shape[:-2] + complex_specgrams_stretch.shape[1:]) + return complex_specgrams_stretch + + +def _get_mask_param(mask_param: int, p: float, axis_length: int) -> int: + if p == 1.0: + return mask_param + else: + return min(mask_param, int(axis_length * p)) + + +def mask_along_axis_iid( + specgrams: Tensor, + mask_param: int, + mask_value: float, + axis: int, + p: float = 1.0, +) -> Tensor: + r"""Apply a mask along ``axis``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Mask will be applied from indices ``[v_0, v_0 + v)``, + where ``v`` is sampled from ``uniform(0, max_v)`` and + ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, + with ``max_v = mask_param`` when ``p = 1.0`` and + ``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` otherwise. + + Args: + specgrams (Tensor): Real spectrograms `(batch, channel, freq, time)` + mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param] + mask_value (float): Value to assign to the masked columns + axis (int): Axis to apply masking on (2 -> frequency, 3 -> time) + p (float, optional): maximum proportion of columns that can be masked. 
(Default: 1.0) + + Returns: + Tensor: Masked spectrograms of dimensions `(batch, channel, freq, time)` + """ + + if axis not in [2, 3]: + raise ValueError("Only Frequency and Time masking are supported") + + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + + mask_param = _get_mask_param(mask_param, p, specgrams.shape[axis]) + if mask_param < 1: + return specgrams + + device = specgrams.device + dtype = specgrams.dtype + + value = torch.rand(specgrams.shape[:2], device=device, dtype=dtype) * mask_param + min_value = torch.rand(specgrams.shape[:2], device=device, dtype=dtype) * (specgrams.size(axis) - value) + + # Create broadcastable mask + mask_start = min_value.long()[..., None, None] + mask_end = (min_value.long() + value.long())[..., None, None] + mask = torch.arange(0, specgrams.size(axis), device=device, dtype=dtype) + + # Per batch example masking + specgrams = specgrams.transpose(axis, -1) + specgrams = specgrams.masked_fill((mask >= mask_start) & (mask < mask_end), mask_value) + specgrams = specgrams.transpose(axis, -1) + + return specgrams + + +def mask_along_axis( + specgram: Tensor, + mask_param: int, + mask_value: float, + axis: int, + p: float = 1.0, +) -> Tensor: + r"""Apply a mask along ``axis``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Mask will be applied from indices ``[v_0, v_0 + v)``, + where ``v`` is sampled from ``uniform(0, max_v)`` and + ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, with + ``max_v = mask_param`` when ``p = 1.0`` and + ``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` + otherwise. + All examples will have the same mask interval. + + Args: + specgram (Tensor): Real spectrogram `(channel, freq, time)` + mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param] + mask_value (float): Value to assign to the masked columns + axis (int): Axis to apply masking on (1 -> frequency, 2 -> time) + p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0) + + Returns: + Tensor: Masked spectrogram of dimensions `(channel, freq, time)` + """ + if axis not in [1, 2]: + raise ValueError("Only Frequency and Time masking are supported") + + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + + mask_param = _get_mask_param(mask_param, p, specgram.shape[axis]) + if mask_param < 1: + return specgram + + # pack batch + shape = specgram.size() + specgram = specgram.reshape([-1] + list(shape[-2:])) + value = torch.rand(1) * mask_param + min_value = torch.rand(1) * (specgram.size(axis) - value) + + mask_start = (min_value.long()).squeeze() + mask_end = (min_value.long() + value.long()).squeeze() + mask = torch.arange(0, specgram.shape[axis], device=specgram.device, dtype=specgram.dtype) + mask = (mask >= mask_start) & (mask < mask_end) + if axis == 1: + mask = mask.unsqueeze(-1) + + assert mask_end - mask_start < mask_param + + specgram = specgram.masked_fill(mask, mask_value) + + # unpack batch + specgram = specgram.reshape(shape[:-2] + specgram.shape[-2:]) + + return specgram + + +def compute_deltas(specgram: Tensor, win_length: int = 5, mode: str = "replicate") -> Tensor: + r"""Compute delta coefficients of a tensor, usually a spectrogram: + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + .. 
math:: + d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2} + + where :math:`d_t` is the deltas at time :math:`t`, + :math:`c_t` is the spectrogram coeffcients at time :math:`t`, + :math:`N` is ``(win_length-1)//2``. + + Args: + specgram (Tensor): Tensor of audio of dimension `(..., freq, time)` + win_length (int, optional): The window length used for computing delta (Default: ``5``) + mode (str, optional): Mode parameter passed to padding (Default: ``"replicate"``) + + Returns: + Tensor: Tensor of deltas of dimension `(..., freq, time)` + + Example + >>> specgram = torch.randn(1, 40, 1000) + >>> delta = compute_deltas(specgram) + >>> delta2 = compute_deltas(delta) + """ + device = specgram.device + dtype = specgram.dtype + + # pack batch + shape = specgram.size() + specgram = specgram.reshape(1, -1, shape[-1]) + + assert win_length >= 3 + + n = (win_length - 1) // 2 + + # twice sum of integer squared + denom = n * (n + 1) * (2 * n + 1) / 3 + + specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode) + + kernel = torch.arange(-n, n + 1, 1, device=device, dtype=dtype).repeat(specgram.shape[1], 1, 1) + + output = torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom + + # unpack batch + output = output.reshape(shape) + + return output + + +def _compute_nccf(waveform: Tensor, sample_rate: int, frame_time: float, freq_low: int) -> Tensor: + r""" + Compute Normalized Cross-Correlation Function (NCCF). + + .. math:: + \phi_i(m) = \frac{\sum_{n=b_i}^{b_i + N-1} w(n) w(m+n)}{\sqrt{E(b_i) E(m+b_i)}}, + + where + :math:`\phi_i(m)` is the NCCF at frame :math:`i` with lag :math:`m`, + :math:`w` is the waveform, + :math:`N` is the length of a frame, + :math:`b_i` is the beginning of frame :math:`i`, + :math:`E(j)` is the energy :math:`\sum_{n=j}^{j+N-1} w^2(n)`. + """ + + EPSILON = 10 ** (-9) + + # Number of lags to check + lags = int(math.ceil(sample_rate / freq_low)) + + frame_size = int(math.ceil(sample_rate * frame_time)) + + waveform_length = waveform.size()[-1] + num_of_frames = int(math.ceil(waveform_length / frame_size)) + + p = lags + num_of_frames * frame_size - waveform_length + waveform = torch.nn.functional.pad(waveform, (0, p)) + + # Compute lags + output_lag = [] + for lag in range(1, lags + 1): + s1 = waveform[..., :-lag].unfold(-1, frame_size, frame_size)[..., :num_of_frames, :] + s2 = waveform[..., lag:].unfold(-1, frame_size, frame_size)[..., :num_of_frames, :] + + output_frames = ( + (s1 * s2).sum(-1) + / (EPSILON + torch.norm(s1, p=2, dim=-1)).pow(2) + / (EPSILON + torch.norm(s2, p=2, dim=-1)).pow(2) + ) + + output_lag.append(output_frames.unsqueeze(-1)) + + nccf = torch.cat(output_lag, -1) + + return nccf + + +def _combine_max(a: Tuple[Tensor, Tensor], b: Tuple[Tensor, Tensor], thresh: float = 0.99) -> Tuple[Tensor, Tensor]: + """ + Take value from first if bigger than a multiplicative factor of the second, elementwise. + """ + mask = a[0] > thresh * b[0] + values = mask * a[0] + ~mask * b[0] + indices = mask * a[1] + ~mask * b[1] + return values, indices + + +def _find_max_per_frame(nccf: Tensor, sample_rate: int, freq_high: int) -> Tensor: + r""" + For each frame, take the highest value of NCCF, + apply centered median smoothing, and convert to frequency. + + Note: If the max among all the lags is very close + to the first half of lags, then the latter is taken. 
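+     (In effect, a candidate at a shorter lag wins whenever it comes within about one percent of the
+     global maximum, via ``_combine_max`` with its default ``thresh=0.99``; this biases the tracker
+     toward the shorter period and, presumably, away from picking a multiple of the true period.)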
+ """ + + lag_min = int(math.ceil(sample_rate / freq_high)) + + # Find near enough max that is smallest + + best = torch.max(nccf[..., lag_min:], -1) + + half_size = nccf.shape[-1] // 2 + half = torch.max(nccf[..., lag_min:half_size], -1) + + best = _combine_max(half, best) + indices = best[1] + + # Add back minimal lag + indices += lag_min + # Add 1 empirical calibration offset + indices += 1 + + return indices + + +def _median_smoothing(indices: Tensor, win_length: int) -> Tensor: + r""" + Apply median smoothing to the 1D tensor over the given window. + """ + + # Centered windowed + pad_length = (win_length - 1) // 2 + + # "replicate" padding in any dimension + indices = torch.nn.functional.pad(indices, (pad_length, 0), mode="constant", value=0.0) + + indices[..., :pad_length] = torch.cat(pad_length * [indices[..., pad_length].unsqueeze(-1)], dim=-1) + roll = indices.unfold(-1, win_length, 1) + + values, _ = torch.median(roll, -1) + return values + + +def detect_pitch_frequency( + waveform: Tensor, + sample_rate: int, + frame_time: float = 10 ** (-2), + win_length: int = 30, + freq_low: int = 85, + freq_high: int = 3400, +) -> Tensor: + r"""Detect pitch frequency. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + It is implemented using normalized cross-correlation function and median smoothing. + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., freq, time)` + sample_rate (int): The sample rate of the waveform (Hz) + frame_time (float, optional): Duration of a frame (Default: ``10 ** (-2)``). + win_length (int, optional): The window length for median smoothing (in number of frames) (Default: ``30``). + freq_low (int, optional): Lowest frequency that can be detected (Hz) (Default: ``85``). + freq_high (int, optional): Highest frequency that can be detected (Hz) (Default: ``3400``). + + Returns: + Tensor: Tensor of freq of dimension `(..., frame)` + """ + # pack batch + shape = list(waveform.size()) + waveform = waveform.reshape([-1] + shape[-1:]) + + nccf = _compute_nccf(waveform, sample_rate, frame_time, freq_low) + indices = _find_max_per_frame(nccf, sample_rate, freq_high) + indices = _median_smoothing(indices, win_length) + + # Convert indices to frequency + EPSILON = 10 ** (-9) + freq = sample_rate / (EPSILON + indices.to(torch.float)) + + # unpack batch + freq = freq.reshape(shape[:-1] + list(freq.shape[-1:])) + + return freq + + +def sliding_window_cmn( + specgram: Tensor, + cmn_window: int = 600, + min_cmn_window: int = 100, + center: bool = False, + norm_vars: bool = False, +) -> Tensor: + r""" + Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)` + cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) + min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start). + Only applicable if center == false, ignored if center==true (int, default = 100) + center (bool, optional): If true, use a window centered on the current frame + (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) + norm_vars (bool, optional): If true, normalize variance to one. 
(bool, default = false) + + Returns: + Tensor: Tensor matching input shape `(..., freq, time)` + """ + input_shape = specgram.shape + num_frames, num_feats = input_shape[-2:] + specgram = specgram.view(-1, num_frames, num_feats) + num_channels = specgram.shape[0] + + dtype = specgram.dtype + device = specgram.device + last_window_start = last_window_end = -1 + cur_sum = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + cur_sumsq = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + cmn_specgram = torch.zeros(num_channels, num_frames, num_feats, dtype=dtype, device=device) + for t in range(num_frames): + window_start = 0 + window_end = 0 + if center: + window_start = t - cmn_window // 2 + window_end = window_start + cmn_window + else: + window_start = t - cmn_window + window_end = t + 1 + if window_start < 0: + window_end -= window_start + window_start = 0 + if not center: + if window_end > t: + window_end = max(t + 1, min_cmn_window) + if window_end > num_frames: + window_start -= window_end - num_frames + window_end = num_frames + if window_start < 0: + window_start = 0 + if last_window_start == -1: + input_part = specgram[:, window_start : window_end - window_start, :] + cur_sum += torch.sum(input_part, 1) + if norm_vars: + cur_sumsq += torch.cumsum(input_part**2, 1)[:, -1, :] + else: + if window_start > last_window_start: + frame_to_remove = specgram[:, last_window_start, :] + cur_sum -= frame_to_remove + if norm_vars: + cur_sumsq -= frame_to_remove**2 + if window_end > last_window_end: + frame_to_add = specgram[:, last_window_end, :] + cur_sum += frame_to_add + if norm_vars: + cur_sumsq += frame_to_add**2 + window_frames = window_end - window_start + last_window_start = window_start + last_window_end = window_end + cmn_specgram[:, t, :] = specgram[:, t, :] - cur_sum / window_frames + if norm_vars: + if window_frames == 1: + cmn_specgram[:, t, :] = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + else: + variance = cur_sumsq + variance = variance / window_frames + variance -= (cur_sum**2) / (window_frames**2) + variance = torch.pow(variance, -0.5) + cmn_specgram[:, t, :] *= variance + + cmn_specgram = cmn_specgram.view(input_shape[:-2] + (num_frames, num_feats)) + if len(input_shape) == 2: + cmn_specgram = cmn_specgram.squeeze(0) + return cmn_specgram + + +def spectral_centroid( + waveform: Tensor, + sample_rate: int, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, +) -> Tensor: + r"""Compute the spectral centroid for each channel along the time axis. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The spectral centroid is defined as the weighted average of the + frequency values, weighted by their magnitude. 
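+
+     In other words, for a magnitude spectrogram ``S`` with bin frequencies ``f``, the centroid of
+     frame ``t`` is ``sum_k f[k] * S[k, t] / sum_k S[k, t]``. A hand-rolled equivalent (a minimal
+     sketch; the shapes and the 16 kHz sample rate are assumed):
+
+     >>> specgram = torch.rand(1, 201, 100)  # (channel, freq, time) magnitudes, e.g. n_fft=400
+     >>> freqs = torch.linspace(0, 16000 // 2, steps=201).reshape(-1, 1)
+     >>> centroid = (freqs * specgram).sum(dim=-2) / specgram.sum(dim=-2)  # (channel, time)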
+ + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + sample_rate (int): Sample rate of the audio waveform + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + + Returns: + Tensor: Dimension `(..., time)` + """ + specgram = spectrogram( + waveform, + pad=pad, + window=window, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + power=1.0, + normalized=False, + ) + freqs = torch.linspace(0, sample_rate // 2, steps=1 + n_fft // 2, device=specgram.device).reshape((-1, 1)) + freq_dim = -2 + return (freqs * specgram).sum(dim=freq_dim) / specgram.sum(dim=freq_dim) + + +@_mod_utils.requires_sox() +def apply_codec( + waveform: Tensor, + sample_rate: int, + format: str, + channels_first: bool = True, + compression: Optional[float] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, +) -> Tensor: + r""" + Apply codecs as a form of augmentation. + + .. devices:: CPU + + Args: + waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```. + sample_rate (int): Sample rate of the audio waveform. + format (str): File format. + channels_first (bool, optional): + When True, both the input and output Tensor have dimension `(channel, time)`. + Otherwise, they have dimension `(time, channel)`. + compression (float or None, optional): Used for formats other than WAV. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + encoding (str or None, optional): Changes the encoding for the supported formats. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + + Returns: + Tensor: Resulting Tensor. + If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`. + """ + bytes = io.BytesIO() + torchaudio.backend.sox_io_backend.save( + bytes, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample + ) + bytes.seek(0) + augmented, sr = torchaudio.backend.sox_io_backend.load(bytes, channels_first=channels_first, format=format) + if sr != sample_rate: + augmented = resample(augmented, sr, sample_rate) + return augmented + + +@_mod_utils.requires_kaldi() +def compute_kaldi_pitch( + waveform: torch.Tensor, + sample_rate: float, + frame_length: float = 25.0, + frame_shift: float = 10.0, + min_f0: float = 50, + max_f0: float = 400, + soft_min_f0: float = 10.0, + penalty_factor: float = 0.1, + lowpass_cutoff: float = 1000, + resample_frequency: float = 4000, + delta_pitch: float = 0.005, + nccf_ballast: float = 7000, + lowpass_filter_width: int = 1, + upsample_filter_width: int = 5, + max_frames_latency: int = 0, + frames_per_chunk: int = 0, + simulate_first_pass_online: bool = False, + recompute_frame: int = 500, + snip_edges: bool = True, +) -> torch.Tensor: + """Extract pitch based on method described in *A pitch extraction algorithm tuned + for automatic speech recognition* [:footcite:`6854049`]. + + .. devices:: CPU + + .. properties:: TorchScript + + This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi. + + Args: + waveform (Tensor): + The input waveform of shape `(..., time)`. + sample_rate (float): + Sample rate of `waveform`. 
+ frame_length (float, optional): + Frame length in milliseconds. (default: 25.0) + frame_shift (float, optional): + Frame shift in milliseconds. (default: 10.0) + min_f0 (float, optional): + Minimum F0 to search for (Hz) (default: 50.0) + max_f0 (float, optional): + Maximum F0 to search for (Hz) (default: 400.0) + soft_min_f0 (float, optional): + Minimum f0, applied in soft way, must not exceed min-f0 (default: 10.0) + penalty_factor (float, optional): + Cost factor for FO change. (default: 0.1) + lowpass_cutoff (float, optional): + Cutoff frequency for LowPass filter (Hz) (default: 1000) + resample_frequency (float, optional): + Frequency that we down-sample the signal to. Must be more than twice lowpass-cutoff. + (default: 4000) + delta_pitch( float, optional): + Smallest relative change in pitch that our algorithm measures. (default: 0.005) + nccf_ballast (float, optional): + Increasing this factor reduces NCCF for quiet frames (default: 7000) + lowpass_filter_width (int, optional): + Integer that determines filter width of lowpass filter, more gives sharper filter. + (default: 1) + upsample_filter_width (int, optional): + Integer that determines filter width when upsampling NCCF. (default: 5) + max_frames_latency (int, optional): + Maximum number of frames of latency that we allow pitch tracking to introduce into + the feature processing (affects output only if ``frames_per_chunk > 0`` and + ``simulate_first_pass_online=True``) (default: 0) + frames_per_chunk (int, optional): + The number of frames used for energy normalization. (default: 0) + simulate_first_pass_online (bool, optional): + If true, the function will output features that correspond to what an online decoder + would see in the first pass of decoding -- not the final version of the features, + which is the default. (default: False) + Relevant if ``frames_per_chunk > 0``. + recompute_frame (int, optional): + Only relevant for compatibility with online pitch extraction. + A non-critical parameter; the frame at which we recompute some of the forward pointers, + after revising our estimate of the signal energy. + Relevant if ``frames_per_chunk > 0``. (default: 500) + snip_edges (bool, optional): + If this is set to false, the incomplete frames near the ending edge won't be snipped, + so that the number of frames is the file size divided by the frame-shift. + This makes different types of features give the same number of frames. (default: True) + + Returns: + Tensor: Pitch feature. Shape: `(batch, frames 2)` where the last dimension + corresponds to pitch and NCCF. 
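+
+     Example (a minimal sketch, assuming torchaudio was built with the Kaldi extension and a
+     16 kHz mono waveform):
+         >>> waveform = torch.randn(1, 16000)  # (..., time), one second of audio
+         >>> feats = compute_kaldi_pitch(waveform, sample_rate=16000)
+         >>> # feats has shape (1, num_frames, 2); the last dimension holds the pitch and NCCF
+         >>> # values described above.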
+ """ + shape = waveform.shape + waveform = waveform.reshape(-1, shape[-1]) + result = torch.ops.torchaudio.kaldi_ComputeKaldiPitch( + waveform, + sample_rate, + frame_length, + frame_shift, + min_f0, + max_f0, + soft_min_f0, + penalty_factor, + lowpass_cutoff, + resample_frequency, + delta_pitch, + nccf_ballast, + lowpass_filter_width, + upsample_filter_width, + max_frames_latency, + frames_per_chunk, + simulate_first_pass_online, + recompute_frame, + snip_edges, + ) + result = result.reshape(shape[:-1] + result.shape[-2:]) + return result + + +def _get_sinc_resample_kernel( + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, + device: torch.device = torch.device("cpu"), + dtype: Optional[torch.dtype] = None, +): + + if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): + raise Exception( + "Frequencies must be of integer type to ensure quality resampling computation. " + "To work around this, manually convert both frequencies to integer values " + "that maintain their resampling rate ratio before passing them into the function. " + "Example: To downsample a 44100 hz waveform by a factor of 8, use " + "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " + "For more information, please refer to https://github.com/pytorch/audio/issues/1487." + ) + + if resampling_method not in ["sinc_interpolation", "kaiser_window"]: + raise ValueError("Invalid resampling method: {}".format(resampling_method)) + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + assert lowpass_filter_width > 0 + kernels = [] + base_freq = min(orig_freq, new_freq) + # This will perform antialiasing filtering by removing the highest frequencies. + # At first I thought I only needed this when downsampling, but when upsampling + # you will get edge artifacts without this, as the edge is equivalent to zero padding, + # which will add high freq artifacts. + base_freq *= rolloff + + # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) + # using the sinc interpolation formula: + # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) + # We can then sample the function x(t) with a different sample rate: + # y[j] = x(j / new_freq) + # or, + # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + + # We see here that y[j] is the convolution of x[i] with a specific filter, for which + # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. + # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. + # Indeed: + # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) + # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) + # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. + # This will explain the F.conv1d after, with a stride of orig_freq. + width = math.ceil(lowpass_filter_width * orig_freq / base_freq) + # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., + # they will have a lot of almost zero values to the left or to the right... + # There is probably a way to evaluate those filters more efficiently, but this is kept for + # future work. 
+ idx_dtype = dtype if dtype is not None else torch.float64 + idx = torch.arange(-width, width + orig_freq, device=device, dtype=idx_dtype) + + for i in range(new_freq): + t = (-i / new_freq + idx / orig_freq) * base_freq + t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) + + # we do not use built in torch windows here as we need to evaluate the window + # at specific positions, not over a regular grid. + if resampling_method == "sinc_interpolation": + window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 + else: + # kaiser_window + if beta is None: + beta = 14.769656459379492 + beta_tensor = torch.tensor(float(beta)) + window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) + t *= math.pi + kernel = torch.where(t == 0, torch.tensor(1.0).to(t), torch.sin(t) / t) + kernel.mul_(window) + kernels.append(kernel) + + scale = base_freq / orig_freq + kernels = torch.stack(kernels).view(new_freq, 1, -1).mul_(scale) + if dtype is None: + kernels = kernels.to(dtype=torch.float32) + return kernels, width + + +def _apply_sinc_resample_kernel( + waveform: Tensor, + orig_freq: int, + new_freq: int, + gcd: int, + kernel: Tensor, + width: int, +): + if not waveform.is_floating_point(): + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + num_wavs, length = waveform.shape + waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) + resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) + resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) + target_length = int(math.ceil(new_freq * length / orig_freq)) + resampled = resampled[..., :target_length] + + # unpack batch + resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) + return resampled + + +def resample( + waveform: Tensor, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, +) -> Tensor: + r"""Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in + more efficient computation if resampling multiple waveforms with the same resampling parameters. + + Args: + waveform (Tensor): The input signal of dimension `(..., time)` + orig_freq (int): The original frequency of the signal + new_freq (int): The desired frequency + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + resampling_method (str, optional): The resampling method to use. + Options: [``sinc_interpolation``, ``kaiser_window``] (Default: ``'sinc_interpolation'``) + beta (float or None, optional): The shape parameter used for kaiser window. 
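+             When ``None`` and ``resampling_method="kaiser_window"``, a value of ``14.769656459379492``
+             is used, as in the kernel construction above. (Default: ``None``)
+
+     Example (a minimal sketch; the input waveform is assumed):
+         >>> waveform = torch.randn(2, 48000)  # e.g. one second of stereo audio at 48 kHz
+         >>> resampled = resample(waveform, orig_freq=48000, new_freq=32000)
+         >>> resampled.shape
+         torch.Size([2, 32000])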
+ + Returns: + Tensor: The waveform at the new frequency of dimension `(..., time).` + """ + + assert orig_freq > 0.0 and new_freq > 0.0 + + if orig_freq == new_freq: + return waveform + + gcd = math.gcd(int(orig_freq), int(new_freq)) + + kernel, width = _get_sinc_resample_kernel( + orig_freq, + new_freq, + gcd, + lowpass_filter_width, + rolloff, + resampling_method, + beta, + waveform.device, + waveform.dtype, + ) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) + return resampled + + +@torch.jit.unused +def edit_distance(seq1: Sequence, seq2: Sequence) -> int: + """ + Calculate the word level edit (Levenshtein) distance between two sequences. + + .. devices:: CPU + + The function computes an edit distance allowing deletion, insertion and + substitution. The result is an integer. + + For most applications, the two input sequences should be the same type. If + two strings are given, the output is the edit distance between the two + strings (character edit distance). If two lists of strings are given, the + output is the edit distance between sentences (word edit distance). Users + may want to normalize the output by the length of the reference sequence. + + Args: + seq1 (Sequence): the first sequence to compare. + seq2 (Sequence): the second sequence to compare. + Returns: + int: The distance between the first and second sequences. + """ + len_sent2 = len(seq2) + dold = list(range(len_sent2 + 1)) + dnew = [0 for _ in range(len_sent2 + 1)] + + for i in range(1, len(seq1) + 1): + dnew[0] = i + for j in range(1, len_sent2 + 1): + if seq1[i - 1] == seq2[j - 1]: + dnew[j] = dold[j - 1] + else: + substitution = dold[j - 1] + 1 + insertion = dnew[j - 1] + 1 + deletion = dold[j] + 1 + dnew[j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + return int(dold[-1]) + + +def pitch_shift( + waveform: Tensor, + sample_rate: int, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window: Optional[Tensor] = None, +) -> Tensor: + """ + Shift the pitch of a waveform by ``n_steps`` steps. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + waveform (Tensor): The input waveform of shape `(..., time)`. + sample_rate (int): Sample rate of `waveform`. + n_steps (int): The (fractional) steps to shift `waveform`. + bins_per_octave (int, optional): The number of steps per octave (Default: ``12``). + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``). + win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``). + hop_length (int or None, optional): Length of hop between STFT windows. If None, then + ``win_length // 4`` is used (Default: ``None``). + window (Tensor or None, optional): Window tensor that is applied/multiplied to each frame/window. + If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``). + + + Returns: + Tensor: The pitch-shifted audio waveform of shape `(..., time)`. 
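+
+     Example (a minimal sketch; the waveform and sample rate are assumed):
+         >>> waveform = torch.randn(1, 16000)  # (..., time)
+         >>> shifted = pitch_shift(waveform, sample_rate=16000, n_steps=4)  # up four semitones
+         >>> shifted.shape
+         torch.Size([1, 16000])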
+ """ + waveform_stretch = _stretch_waveform( + waveform, + n_steps, + bins_per_octave, + n_fft, + win_length, + hop_length, + window, + ) + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + waveform_shift = resample(waveform_stretch, int(sample_rate / rate), sample_rate) + + return _fix_waveform_shape(waveform_shift, waveform.size()) + + +def _stretch_waveform( + waveform: Tensor, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window: Optional[Tensor] = None, +) -> Tensor: + """ + Pitch shift helper function to preprocess and stretch waveform before resampling step. + + Args: + See pitch_shift arg descriptions. + + Returns: + Tensor: The preprocessed waveform stretched prior to resampling. + """ + if hop_length is None: + hop_length = n_fft // 4 + if win_length is None: + win_length = n_fft + if window is None: + window = torch.hann_window(window_length=win_length, device=waveform.device) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + ori_len = shape[-1] + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + phase_advance = torch.linspace(0, math.pi * hop_length, spec_f.shape[-2], device=spec_f.device)[..., None] + spec_stretch = phase_vocoder(spec_f, rate, phase_advance) + len_stretch = int(round(ori_len / rate)) + waveform_stretch = torch.istft( + spec_stretch, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=len_stretch + ) + return waveform_stretch + + +def _fix_waveform_shape( + waveform_shift: Tensor, + shape: List[int], +) -> Tensor: + """ + PitchShift helper function to process after resampling step to fix the shape back. + + Args: + waveform_shift(Tensor): The waveform after stretch and resample + shape (List[int]): The shape of initial waveform + + Returns: + Tensor: The pitch-shifted audio waveform of shape `(..., time)`. + """ + ori_len = shape[-1] + shift_len = waveform_shift.size()[-1] + if shift_len > ori_len: + waveform_shift = waveform_shift[..., :ori_len] + else: + waveform_shift = torch.nn.functional.pad(waveform_shift, [0, ori_len - shift_len]) + + # unpack batch + waveform_shift = waveform_shift.view(shape[:-1] + waveform_shift.shape[-1:]) + return waveform_shift + + +def rnnt_loss( + logits: Tensor, + targets: Tensor, + logit_lengths: Tensor, + target_lengths: Tensor, + blank: int = -1, + clamp: float = -1, + reduction: str = "mean", +): + """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* + [:footcite:`graves2012sequence`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output + dependencies. 
+ + Args: + logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)` + containing output from joiner + targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padded + logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder + target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence + blank (int, optional): blank label (Default: ``-1``) + clamp (float, optional): clamp for gradients (Default: ``-1``) + reduction (string, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. (Default: ``'mean'``) + Returns: + Tensor: Loss with the reduction option applied. If ``reduction`` is ``'none'``, then size `(batch)`, + otherwise scalar. + """ + if reduction not in ["none", "mean", "sum"]: + raise ValueError("reduction should be one of 'none', 'mean', or 'sum'") + + if blank < 0: # reinterpret blank index if blank < 0. + blank = logits.shape[-1] + blank + + costs, _ = torch.ops.torchaudio.rnnt_loss( + logits=logits, + targets=targets, + logit_lengths=logit_lengths, + target_lengths=target_lengths, + blank=blank, + clamp=clamp, + ) + + if reduction == "mean": + return costs.mean() + elif reduction == "sum": + return costs.sum() + + return costs + + +def psd( + specgram: Tensor, + mask: Optional[Tensor] = None, + normalize: bool = True, + eps: float = 1e-10, +) -> Tensor: + """Compute cross-channel power spectral density (PSD) matrix. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + mask (torch.Tensor or None, optional): Time-Frequency mask for normalization. + Tensor with dimensions `(..., freq, time)`. (Default: ``None``) + normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``) + eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``) + + Returns: + torch.Tensor: The complex-valued PSD matrix of the input spectrum. + Tensor with dimensions `(..., freq, channel, channel)` + """ + specgram = specgram.transpose(-3, -2) # shape (freq, channel, time) + # outer product: + # (..., ch_1, time) x (..., ch_2, time) -> (..., time, ch_1, ch_2) + psd = torch.einsum("...ct,...et->...tce", [specgram, specgram.conj()]) + + if mask is not None: + assert ( + mask.shape[:-1] == specgram.shape[:-2] and mask.shape[-1] == specgram.shape[-1] + ), "The dimensions of mask except the channel dimension should be the same as specgram." + f"Found {mask.shape} for mask and {specgram.shape} for specgram." + # Normalized mask along time dimension: + if normalize: + mask = mask / (mask.sum(dim=-1, keepdim=True) + eps) + + psd = psd * mask[..., None, None] + + psd = psd.sum(dim=-3) + return psd + + +def _compute_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch.Tensor: + r"""Compute the trace of a Tensor along ``dim1`` and ``dim2`` dimensions. + + Args: + input (torch.Tensor): Tensor with dimensions `(..., channel, channel)`. + dim1 (int, optional): The first dimension of the diagonal matrix. + (Default: ``-1``) + dim2 (int, optional): The second dimension of the diagonal matrix. + (Default: ``-2``) + + Returns: + Tensor: The trace of the input Tensor. + """ + assert input.ndim >= 2, "The dimension of the tensor must be at least 2." 
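+     # The check above and the one below ensure a square `(..., channel, channel)` input; the trace
+     # is then computed by extracting the main diagonal along (dim1, dim2) and summing it, yielding
+     # one value per leading index (e.g. one complex trace per frequency bin of a PSD matrix).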
+ assert input.shape[dim1] == input.shape[dim2], "The size of ``dim1`` and ``dim2`` must be the same." + input = torch.diagonal(input, 0, dim1=dim1, dim2=dim2) + return input.sum(dim=-1) + + +def _tik_reg(mat: torch.Tensor, reg: float = 1e-7, eps: float = 1e-8) -> torch.Tensor: + """Perform Tikhonov regularization (only modifying real part). + + Args: + mat (torch.Tensor): Input matrix with dimensions `(..., channel, channel)`. + reg (float, optional): Regularization factor. (Default: 1e-8) + eps (float, optional): Value to avoid the correlation matrix is all-zero. (Default: ``1e-8``) + + Returns: + Tensor: Regularized matrix with dimensions `(..., channel, channel)`. + """ + # Add eps + C = mat.size(-1) + eye = torch.eye(C, dtype=mat.dtype, device=mat.device) + epsilon = _compute_mat_trace(mat).real[..., None, None] * reg + # in case that correlation_matrix is all-zero + epsilon = epsilon + eps + mat = mat + epsilon * eye[..., :, :] + return mat + + +def _assert_psd_matrices(psd_s: torch.Tensor, psd_n: torch.Tensor) -> None: + """Assertion checks of the PSD matrices of target speech and noise. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + """ + assert ( + psd_s.ndim >= 3 and psd_n.ndim >= 3 + ), "Expected at least 3D Tensor (..., freq, channel, channel) for psd_s and psd_n." + "Found {psd_s.shape} for psd_s and {psd_n.shape} for psd_n." + assert ( + psd_s.is_complex() and psd_n.is_complex() + ), "The type of psd_s and psd_n must be ``torch.cfloat`` or ``torch.cdouble``." + f"Found {psd_s.dtype} for psd_s and {psd_n.dtype} for psd_n." + assert ( + psd_s.shape == psd_n.shape + ), f"The dimensions of psd_s and psd_n should be the same. Found {psd_s.shape} and {psd_n.shape}." + assert ( + psd_s.shape[-1] == psd_s.shape[-2] + ), f"The last two dimensions of psd_s should be the same. Found {psd_s.shape}." + + +def mvdr_weights_souden( + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> Tensor: + r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights + by the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the power spectral density (PSD) matrix of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, + the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and a one-hot vector that represents the + reference channel :math:`\bf{u}`, the method computes the MVDR beamforming weight martrix + :math:`\textbf{w}_{\text{MVDR}}`. The formula is defined as: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)} + {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u} + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. 
+ If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The complex-valued MVDR beamforming weight matrix with dimensions `(..., freq, channel)`. + """ + _assert_psd_matrices(psd_s, psd_n) + + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + numerator = torch.linalg.solve(psd_n, psd_s) # psd_n.inv() @ psd_s + # ws: (..., C, C) / (...,) -> (..., C, C) + ws = numerator / (_compute_mat_trace(numerator)[..., None, None] + eps) + if torch.jit.isinstance(reference_channel, int): + beamform_weights = ws[..., :, reference_channel] + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + # h: (..., F, C_1, C_2) x (..., C_2) -> (..., F, C_1) + beamform_weights = torch.einsum("...c,...c->...", [ws, reference_channel[..., None, None, :]]) + else: + raise TypeError(f"Expected 'int' or 'Tensor' for reference_channel. Found: {type(reference_channel)}.") + + return beamform_weights + + +def mvdr_weights_rtf( + rtf: Tensor, + psd_n: Tensor, + reference_channel: Optional[Union[int, Tensor]] = None, + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> Tensor: + r"""Compute the Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) beamforming weights + based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the relative transfer function (RTF) matrix or the steering vector of target speech :math:`\bm{v}`, + the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and a one-hot vector that represents the + reference channel :math:`\bf{u}`, the method computes the MVDR beamforming weight martrix + :math:`\textbf{w}_{\text{MVDR}}`. The formula is defined as: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}} + {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)} + + where :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation. + + Args: + rtf (torch.Tensor): The complex-valued RTF vector of target speech. + Tensor with dimensions `(..., freq, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. 
(Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The complex-valued MVDR beamforming weight matrix with dimensions `(..., freq, channel)`. + """ + assert rtf.ndim >= 2, f"Expected at least 2D Tensor (..., freq, channel) for rtf. Found {rtf.shape}." + assert psd_n.ndim >= 3, f"Expected at least 3D Tensor (..., freq, channel, channel) for psd_n. Found {psd_n.shape}." + assert ( + rtf.is_complex() and psd_n.is_complex() + ), "The type of rtf and psd_n must be ``torch.cfloat`` or ``torch.cdouble``." + f"Found {rtf.dtype} for rtf and {psd_n.dtype} for psd_n." + assert ( + rtf.shape == psd_n.shape[:-1] + ), "The dimensions of rtf and the dimensions withou the last dimension of psd_n should be the same." + f"Found {rtf.shape} for rtf and {psd_n.shape} for psd_n." + assert ( + psd_n.shape[-1] == psd_n.shape[-2] + ), f"The last two dimensions of psd_n should be the same. Found {psd_n.shape}." + + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + # numerator = psd_n.inv() @ stv + numerator = torch.linalg.solve(psd_n, rtf.unsqueeze(-1)).squeeze(-1) # (..., freq, channel) + # denominator = stv^H @ psd_n.inv() @ stv + denominator = torch.einsum("...d,...d->...", [rtf.conj(), numerator]) + beamform_weights = numerator / (denominator.real.unsqueeze(-1) + eps) + # normalize the numerator + if reference_channel is not None: + if torch.jit.isinstance(reference_channel, int): + scale = rtf[..., reference_channel].conj() + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + scale = torch.einsum("...c,...c->...", [rtf.conj(), reference_channel[..., None, :]]) + else: + raise TypeError(f"Expected 'int' or 'Tensor' for reference_channel. Found: {type(reference_channel)}.") + + beamform_weights = beamform_weights * scale[..., None] + + return beamform_weights + + +def rtf_evd(psd_s: Tensor) -> Tensor: + r"""Estimate the relative transfer function (RTF) or the steering vector by eigenvalue decomposition. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + psd_s (Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor of dimension `(..., freq, channel, channel)` + + Returns: + Tensor: The estimated complex-valued RTF of target speech. + Tensor of dimension `(..., freq, channel)` + """ + assert psd_s.is_complex(), f"The type of psd_s must be ``torch.cfloat`` or ``torch.cdouble``. Found {psd_s.dtype}." + assert ( + psd_s.shape[-1] == psd_s.shape[-2] + ), f"The last two dimensions of psd_s should be the same. Found {psd_s.shape}." + _, v = torch.linalg.eigh(psd_s) # v is sorted along with eigenvalues in ascending order + rtf = v[..., -1] # choose the eigenvector with max eigenvalue + return rtf + + +def rtf_power( + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + n_iter: int = 3, + diagonal_loading: bool = True, + diag_eps: float = 1e-7, +) -> Tensor: + r"""Estimate the relative transfer function (RTF) or the steering vector by the power method. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. 
+ reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + + Returns: + torch.Tensor: The estimated complex-valued RTF of target speech. + Tensor of dimension `(..., freq, channel)`. + """ + _assert_psd_matrices(psd_s, psd_n) + assert n_iter > 0, "The number of iteration must be greater than 0." + + # Apply diagonal loading to psd_n to improve robustness. + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + # phi is regarded as the first iteration + phi = torch.linalg.solve(psd_n, psd_s) # psd_n.inv() @ psd_s + if torch.jit.isinstance(reference_channel, int): + rtf = phi[..., reference_channel] + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + rtf = torch.einsum("...c,...c->...", [phi, reference_channel[..., None, None, :]]) + else: + raise TypeError(f"Expected 'int' or 'Tensor' for reference_channel. Found: {type(reference_channel)}.") + rtf = rtf.unsqueeze(-1) # (..., freq, channel, 1) + if n_iter >= 2: + # The number of iterations in the for loop is `n_iter - 2` + # because the `phi` above and `torch.matmul(psd_s, rtf)` are regarded as + # two iterations. + for _ in range(n_iter - 2): + rtf = torch.matmul(phi, rtf) + rtf = torch.matmul(psd_s, rtf) + else: + # if there is only one iteration, the rtf is the psd_s[..., referenc_channel] + # which is psd_n @ phi @ ref_channel + rtf = torch.matmul(psd_n, rtf) + return rtf.squeeze(-1) + + +def apply_beamforming(beamform_weights: Tensor, specgram: Tensor) -> Tensor: + r"""Apply the beamforming weight to the multi-channel noisy spectrum to obtain the single-channel enhanced spectrum. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + where :math:`\textbf{w}_{\text{bf}}(f)` is the beamforming weight for the :math:`f`-th frequency bin, + :math:`\textbf{Y}` is the multi-channel spectrum for the :math:`f`-th frequency bin. + + Args: + beamform_weights (Tensor): The complex-valued beamforming weight matrix. + Tensor of dimension `(..., freq, channel)` + specgram (Tensor): The multi-channel complex-valued noisy spectrum. + Tensor of dimension `(..., channel, freq, time)` + + Returns: + Tensor: The single-channel complex-valued enhanced spectrum. + Tensor of dimension `(..., freq, time)` + """ + assert ( + beamform_weights.shape[:-2] == specgram.shape[:-3] + ), "The dimensions except the last two dimensions of beamform_weights should be the same " + "as the dimensions except the last three dimensions of specgram." + f"Found {beamform_weights.shape} for beamform_weights and {specgram.shape} for specgram." + assert ( + beamform_weights.is_complex() and specgram.is_complex() + ), "The type of beamform_weights and specgram must be ``torch.cfloat`` or ``torch.cdouble``." + f"Found {beamform_weights.dtype} for beamform_weights and {specgram.dtype} for specgram." 
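+     # The ``conj()`` below realizes the Hermitian transpose ``w^H`` from the formula above; the
+     # einsum contracts over the channel axis only, so every (freq, time) bin of the output is a
+     # conjugate-weighted sum of the input channels.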
+ + # (..., freq, channel) x (..., channel, freq, time) -> (..., freq, time) + specgram_enhanced = torch.einsum("...fc,...cft->...ft", [beamform_weights.conj(), specgram]) + return specgram_enhanced diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb0f84732cae237ed657624592f7c090700eeb98 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/__init__.py @@ -0,0 +1,53 @@ +from .conformer import Conformer +from .conv_tasnet import ConvTasNet +from .deepspeech import DeepSpeech +from .emformer import Emformer +from .rnnt import emformer_rnnt_base, emformer_rnnt_model, RNNT +from .rnnt_decoder import Hypothesis, RNNTBeamSearch +from .tacotron2 import Tacotron2 +from .wav2letter import Wav2Letter +from .wav2vec2 import ( + hubert_base, + hubert_large, + hubert_pretrain_base, + hubert_pretrain_large, + hubert_pretrain_model, + hubert_pretrain_xlarge, + hubert_xlarge, + HuBERTPretrainModel, + wav2vec2_base, + wav2vec2_large, + wav2vec2_large_lv60k, + wav2vec2_model, + Wav2Vec2Model, +) +from .wavernn import WaveRNN + + +__all__ = [ + "Wav2Letter", + "WaveRNN", + "ConvTasNet", + "DeepSpeech", + "Wav2Vec2Model", + "HuBERTPretrainModel", + "wav2vec2_model", + "wav2vec2_base", + "wav2vec2_large", + "wav2vec2_large_lv60k", + "hubert_base", + "hubert_large", + "hubert_xlarge", + "hubert_pretrain_model", + "hubert_pretrain_base", + "hubert_pretrain_large", + "hubert_pretrain_xlarge", + "Tacotron2", + "Conformer", + "Emformer", + "Hypothesis", + "RNNT", + "RNNTBeamSearch", + "emformer_rnnt_base", + "emformer_rnnt_model", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conformer.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..890c2945c753fc1defdb9f0f152d7e167a7d7182 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conformer.py @@ -0,0 +1,292 @@ +from typing import Optional, Tuple + +import torch + + +__all__ = ["Conformer"] + + +def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: + batch_size = lengths.shape[0] + max_length = int(torch.max(lengths).item()) + padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( + batch_size, max_length + ) >= lengths.unsqueeze(1) + return padding_mask + + +class _ConvolutionModule(torch.nn.Module): + r"""Conformer convolution module. + + Args: + input_dim (int): input dimension. + num_channels (int): number of depthwise convolution layer input channels. + depthwise_kernel_size (int): kernel size of depthwise convolution layer. + dropout (float, optional): dropout probability. (Default: 0.0) + bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``) + use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``) + """ + + def __init__( + self, + input_dim: int, + num_channels: int, + depthwise_kernel_size: int, + dropout: float = 0.0, + bias: bool = False, + use_group_norm: bool = False, + ) -> None: + super().__init__() + assert (depthwise_kernel_size - 1) % 2 == 0, "depthwise_kernel_size must be odd to achieve 'SAME' padding." 
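+         # The stack below follows the Conformer convolution module: a pointwise Conv1d doubles the
+         # channels so that GLU can gate them back down, a depthwise Conv1d (with the 'SAME' padding
+         # guaranteed by the assertion above) mixes information along time, then Group/Batch
+         # normalization, SiLU, a pointwise projection back to ``input_dim``, and dropout.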
+ self.layer_norm = torch.nn.LayerNorm(input_dim) + self.sequential = torch.nn.Sequential( + torch.nn.Conv1d( + input_dim, + 2 * num_channels, + 1, + stride=1, + padding=0, + bias=bias, + ), + torch.nn.GLU(dim=1), + torch.nn.Conv1d( + num_channels, + num_channels, + depthwise_kernel_size, + stride=1, + padding=(depthwise_kernel_size - 1) // 2, + groups=num_channels, + bias=bias, + ), + torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) + if use_group_norm + else torch.nn.BatchNorm1d(num_channels), + torch.nn.SiLU(), + torch.nn.Conv1d( + num_channels, + input_dim, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ), + torch.nn.Dropout(dropout), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): with shape `(B, T, D)`. + + Returns: + torch.Tensor: output, with shape `(B, T, D)`. + """ + x = self.layer_norm(input) + x = x.transpose(1, 2) + x = self.sequential(x) + return x.transpose(1, 2) + + +class _FeedForwardModule(torch.nn.Module): + r"""Positionwise feed forward layer. + + Args: + input_dim (int): input dimension. + hidden_dim (int): hidden dimension. + dropout (float, optional): dropout probability. (Default: 0.0) + """ + + def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: + super().__init__() + self.sequential = torch.nn.Sequential( + torch.nn.LayerNorm(input_dim), + torch.nn.Linear(input_dim, hidden_dim, bias=True), + torch.nn.SiLU(), + torch.nn.Dropout(dropout), + torch.nn.Linear(hidden_dim, input_dim, bias=True), + torch.nn.Dropout(dropout), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): with shape `(*, D)`. + + Returns: + torch.Tensor: output, with shape `(*, D)`. + """ + return self.sequential(input) + + +class ConformerLayer(torch.nn.Module): + r"""Conformer layer that constitutes Conformer. + + Args: + input_dim (int): input dimension. + ffn_dim (int): hidden layer dimension of feedforward network. + num_attention_heads (int): number of attention heads. + depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. + dropout (float, optional): dropout probability. (Default: 0.0) + use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` + in the convolution module. (Default: ``False``) + convolution_first (bool, optional): apply the convolution module ahead of + the attention module. 
(Default: ``False``) + """ + + def __init__( + self, + input_dim: int, + ffn_dim: int, + num_attention_heads: int, + depthwise_conv_kernel_size: int, + dropout: float = 0.0, + use_group_norm: bool = False, + convolution_first: bool = False, + ) -> None: + super().__init__() + + self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) + + self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) + self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) + self.self_attn_dropout = torch.nn.Dropout(dropout) + + self.conv_module = _ConvolutionModule( + input_dim=input_dim, + num_channels=input_dim, + depthwise_kernel_size=depthwise_conv_kernel_size, + dropout=dropout, + bias=True, + use_group_norm=use_group_norm, + ) + + self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) + self.final_layer_norm = torch.nn.LayerNorm(input_dim) + self.convolution_first = convolution_first + + def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: + residual = input + input = input.transpose(0, 1) + input = self.conv_module(input) + input = input.transpose(0, 1) + input = residual + input + return input + + def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): input, with shape `(T, B, D)`. + key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. + + Returns: + torch.Tensor: output, with shape `(T, B, D)`. + """ + residual = input + x = self.ffn1(input) + x = x * 0.5 + residual + + if self.convolution_first: + x = self._apply_convolution(x) + + residual = x + x = self.self_attn_layer_norm(x) + x, _ = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=key_padding_mask, + need_weights=False, + ) + x = self.self_attn_dropout(x) + x = x + residual + + if not self.convolution_first: + x = self._apply_convolution(x) + + residual = x + x = self.ffn2(x) + x = x * 0.5 + residual + + x = self.final_layer_norm(x) + return x + + +class Conformer(torch.nn.Module): + r"""Implements the Conformer architecture introduced in + *Conformer: Convolution-augmented Transformer for Speech Recognition* + [:footcite:`gulati2020conformer`]. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads in each Conformer layer. + ffn_dim (int): hidden layer dimension of feedforward networks. + num_layers (int): number of Conformer layers to instantiate. + depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. + dropout (float, optional): dropout probability. (Default: 0.0) + use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` + in the convolution module. (Default: ``False``) + convolution_first (bool, optional): apply the convolution module ahead of + the attention module. 
(Default: ``False``) + + Examples: + >>> conformer = Conformer( + >>> input_dim=80, + >>> num_heads=4, + >>> ffn_dim=128, + >>> num_layers=4, + >>> depthwise_conv_kernel_size=31, + >>> ) + >>> lengths = torch.randint(1, 400, (10,)) # (batch,) + >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) + >>> output = conformer(input, lengths) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + ffn_dim: int, + num_layers: int, + depthwise_conv_kernel_size: int, + dropout: float = 0.0, + use_group_norm: bool = False, + convolution_first: bool = False, + ): + super().__init__() + + self.conformer_layers = torch.nn.ModuleList( + [ + ConformerLayer( + input_dim, + ffn_dim, + num_heads, + depthwise_conv_kernel_size, + dropout=dropout, + use_group_norm=use_group_norm, + convolution_first=convolution_first, + ) + for _ in range(num_layers) + ] + ) + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + input (torch.Tensor): with shape `(B, T, input_dim)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + + Returns: + (torch.Tensor, torch.Tensor) + torch.Tensor + output frames, with shape `(B, T, input_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output frames. + """ + encoder_padding_mask = _lengths_to_padding_mask(lengths) + + x = input.transpose(0, 1) + for layer in self.conformer_layers: + x = layer(x, encoder_padding_mask) + return x.transpose(0, 1), lengths diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conv_tasnet.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conv_tasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b424661d26c71360af7cf829cde16a3ac2551ce7 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/conv_tasnet.py @@ -0,0 +1,301 @@ +"""Implements Conv-TasNet with building blocks of it. + +Based on https://github.com/naplab/Conv-TasNet/tree/e66d82a8f956a69749ec8a4ae382217faa097c5c +""" + +from typing import Optional, Tuple + +import torch + + +class ConvBlock(torch.nn.Module): + """1D Convolutional block. + + Args: + io_channels (int): The number of input/output channels, + hidden_channels (int): The number of channels in the internal layers, . + kernel_size (int): The convolution kernel size of the middle layer,
. + padding (int): Padding value of the convolution in the middle layer. + dilation (int, optional): Dilation value of the convolution in the middle layer. + no_redisual (bool, optional): Disable residual block/output. + + Note: + This implementation corresponds to the "non-causal" setting in the paper. + """ + + def __init__( + self, + io_channels: int, + hidden_channels: int, + kernel_size: int, + padding: int, + dilation: int = 1, + no_residual: bool = False, + ): + super().__init__() + + self.conv_layers = torch.nn.Sequential( + torch.nn.Conv1d(in_channels=io_channels, out_channels=hidden_channels, kernel_size=1), + torch.nn.PReLU(), + torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08), + torch.nn.Conv1d( + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=kernel_size, + padding=padding, + dilation=dilation, + groups=hidden_channels, + ), + torch.nn.PReLU(), + torch.nn.GroupNorm(num_groups=1, num_channels=hidden_channels, eps=1e-08), + ) + + self.res_out = ( + None + if no_residual + else torch.nn.Conv1d(in_channels=hidden_channels, out_channels=io_channels, kernel_size=1) + ) + self.skip_out = torch.nn.Conv1d(in_channels=hidden_channels, out_channels=io_channels, kernel_size=1) + + def forward(self, input: torch.Tensor) -> Tuple[Optional[torch.Tensor], torch.Tensor]: + feature = self.conv_layers(input) + if self.res_out is None: + residual = None + else: + residual = self.res_out(feature) + skip_out = self.skip_out(feature) + return residual, skip_out + + +class MaskGenerator(torch.nn.Module): + """TCN (Temporal Convolution Network) Separation Module + + Generates masks for separation. + + Args: + input_dim (int): Input feature dimension, . + num_sources (int): The number of sources to separate. + kernel_size (int): The convolution kernel size of conv blocks,
. + num_featrs (int): Input/output feature dimenstion of conv blocks, . + num_hidden (int): Intermediate feature dimention of conv blocks, + num_layers (int): The number of conv blocks in one stack, . + num_stacks (int): The number of conv block stacks, . + msk_activate (str): The activation function of the mask output. + + Note: + This implementation corresponds to the "non-causal" setting in the paper. + """ + + def __init__( + self, + input_dim: int, + num_sources: int, + kernel_size: int, + num_feats: int, + num_hidden: int, + num_layers: int, + num_stacks: int, + msk_activate: str, + ): + super().__init__() + + self.input_dim = input_dim + self.num_sources = num_sources + + self.input_norm = torch.nn.GroupNorm(num_groups=1, num_channels=input_dim, eps=1e-8) + self.input_conv = torch.nn.Conv1d(in_channels=input_dim, out_channels=num_feats, kernel_size=1) + + self.receptive_field = 0 + self.conv_layers = torch.nn.ModuleList([]) + for s in range(num_stacks): + for l in range(num_layers): + multi = 2**l + self.conv_layers.append( + ConvBlock( + io_channels=num_feats, + hidden_channels=num_hidden, + kernel_size=kernel_size, + dilation=multi, + padding=multi, + # The last ConvBlock does not need residual + no_residual=(l == (num_layers - 1) and s == (num_stacks - 1)), + ) + ) + self.receptive_field += kernel_size if s == 0 and l == 0 else (kernel_size - 1) * multi + self.output_prelu = torch.nn.PReLU() + self.output_conv = torch.nn.Conv1d( + in_channels=num_feats, + out_channels=input_dim * num_sources, + kernel_size=1, + ) + if msk_activate == "sigmoid": + self.mask_activate = torch.nn.Sigmoid() + elif msk_activate == "relu": + self.mask_activate = torch.nn.ReLU() + else: + raise ValueError(f"Unsupported activation {msk_activate}") + + def forward(self, input: torch.Tensor) -> torch.Tensor: + """Generate separation mask. + + Args: + input (torch.Tensor): 3D Tensor with shape [batch, features, frames] + + Returns: + Tensor: shape [batch, num_sources, features, frames] + """ + batch_size = input.shape[0] + feats = self.input_norm(input) + feats = self.input_conv(feats) + output = 0.0 + for layer in self.conv_layers: + residual, skip = layer(feats) + if residual is not None: # the last conv layer does not produce residual + feats = feats + residual + output = output + skip + output = self.output_prelu(output) + output = self.output_conv(output) + output = self.mask_activate(output) + return output.view(batch_size, self.num_sources, self.input_dim, -1) + + +class ConvTasNet(torch.nn.Module): + """Conv-TasNet: a fully-convolutional time-domain audio separation network + *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation* + [:footcite:`Luo_2019`]. + + Args: + num_sources (int, optional): The number of sources to split. + enc_kernel_size (int, optional): The convolution kernel size of the encoder/decoder, . + enc_num_feats (int, optional): The feature dimensions passed to mask generator, . + msk_kernel_size (int, optional): The convolution kernel size of the mask generator,
. + msk_num_feats (int, optional): The input/output feature dimension of conv block in the mask generator, . + msk_num_hidden_feats (int, optional): The internal feature dimension of conv block of the mask generator, . + msk_num_layers (int, optional): The number of layers in one conv block of the mask generator, . + msk_num_stacks (int, optional): The numbr of conv blocks of the mask generator, . + msk_activate (str, optional): The activation function of the mask output (Default: ``sigmoid``). + + Note: + This implementation corresponds to the "non-causal" setting in the paper. + """ + + def __init__( + self, + num_sources: int = 2, + # encoder/decoder parameters + enc_kernel_size: int = 16, + enc_num_feats: int = 512, + # mask generator parameters + msk_kernel_size: int = 3, + msk_num_feats: int = 128, + msk_num_hidden_feats: int = 512, + msk_num_layers: int = 8, + msk_num_stacks: int = 3, + msk_activate: str = "sigmoid", + ): + super().__init__() + + self.num_sources = num_sources + self.enc_num_feats = enc_num_feats + self.enc_kernel_size = enc_kernel_size + self.enc_stride = enc_kernel_size // 2 + + self.encoder = torch.nn.Conv1d( + in_channels=1, + out_channels=enc_num_feats, + kernel_size=enc_kernel_size, + stride=self.enc_stride, + padding=self.enc_stride, + bias=False, + ) + self.mask_generator = MaskGenerator( + input_dim=enc_num_feats, + num_sources=num_sources, + kernel_size=msk_kernel_size, + num_feats=msk_num_feats, + num_hidden=msk_num_hidden_feats, + num_layers=msk_num_layers, + num_stacks=msk_num_stacks, + msk_activate=msk_activate, + ) + self.decoder = torch.nn.ConvTranspose1d( + in_channels=enc_num_feats, + out_channels=1, + kernel_size=enc_kernel_size, + stride=self.enc_stride, + padding=self.enc_stride, + bias=False, + ) + + def _align_num_frames_with_strides(self, input: torch.Tensor) -> Tuple[torch.Tensor, int]: + """Pad input Tensor so that the end of the input tensor corresponds with + + 1. (if kernel size is odd) the center of the last convolution kernel + or 2. (if kernel size is even) the end of the first half of the last convolution kernel + + Assumption: + The resulting Tensor will be padded with the size of stride (== kernel_width // 2) + on the both ends in Conv1D + + |<--- k_1 --->| + | | |<-- k_n-1 -->| + | | | |<--- k_n --->| + | | | | | + | | | | | + | v v v | + |<---->|<--- input signal --->|<--->|<---->| + stride PAD stride + + Args: + input (torch.Tensor): 3D Tensor with shape (batch_size, channels==1, frames) + + Returns: + Tensor: Padded Tensor + int: Number of paddings performed + """ + batch_size, num_channels, num_frames = input.shape + is_odd = self.enc_kernel_size % 2 + num_strides = (num_frames - is_odd) // self.enc_stride + num_remainings = num_frames - (is_odd + num_strides * self.enc_stride) + if num_remainings == 0: + return input, 0 + + num_paddings = self.enc_stride - num_remainings + pad = torch.zeros( + batch_size, + num_channels, + num_paddings, + dtype=input.dtype, + device=input.device, + ) + return torch.cat([input, pad], 2), num_paddings + + def forward(self, input: torch.Tensor) -> torch.Tensor: + """Perform source separation. Generate audio source waveforms. + + Args: + input (torch.Tensor): 3D Tensor with shape [batch, channel==1, frames] + + Returns: + Tensor: 3D Tensor with shape [batch, channel==num_sources, frames] + """ + if input.ndim != 3 or input.shape[1] != 1: + raise ValueError(f"Expected 3D tensor (batch, channel==1, frames). 
Found: {input.shape}") + + # B: batch size + # L: input frame length + # L': padded input frame length + # F: feature dimension + # M: feature frame length + # S: number of sources + + padded, num_pads = self._align_num_frames_with_strides(input) # B, 1, L' + batch_size, num_padded_frames = padded.shape[0], padded.shape[2] + feats = self.encoder(padded) # B, F, M + masked = self.mask_generator(feats) * feats.unsqueeze(1) # B, S, F, M + masked = masked.view(batch_size * self.num_sources, self.enc_num_feats, -1) # B*S, F, M + decoded = self.decoder(masked) # B*S, 1, L' + output = decoded.view(batch_size, self.num_sources, num_padded_frames) # B, S, L' + if num_pads > 0: + output = output[..., :-num_pads] # B, S, L + return output diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/deepspeech.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/deepspeech.py new file mode 100644 index 0000000000000000000000000000000000000000..e279498e4916a9ff7a943af36853fee392d0b240 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/deepspeech.py @@ -0,0 +1,85 @@ +import torch + +__all__ = ["DeepSpeech"] + + +class FullyConnected(torch.nn.Module): + """ + Args: + n_feature: Number of input features + n_hidden: Internal hidden unit size. + """ + + def __init__(self, n_feature: int, n_hidden: int, dropout: float, relu_max_clip: int = 20) -> None: + super(FullyConnected, self).__init__() + self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True) + self.relu_max_clip = relu_max_clip + self.dropout = dropout + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc(x) + x = torch.nn.functional.relu(x) + x = torch.nn.functional.hardtanh(x, 0, self.relu_max_clip) + if self.dropout: + x = torch.nn.functional.dropout(x, self.dropout, self.training) + return x + + +class DeepSpeech(torch.nn.Module): + """ + DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition* + [:footcite:`hannun2014deep`]. + + Args: + n_feature: Number of input features + n_hidden: Internal hidden unit size. + n_class: Number of output classes + """ + + def __init__( + self, + n_feature: int, + n_hidden: int = 2048, + n_class: int = 40, + dropout: float = 0.0, + ) -> None: + super(DeepSpeech, self).__init__() + self.n_hidden = n_hidden + self.fc1 = FullyConnected(n_feature, n_hidden, dropout) + self.fc2 = FullyConnected(n_hidden, n_hidden, dropout) + self.fc3 = FullyConnected(n_hidden, n_hidden, dropout) + self.bi_rnn = torch.nn.RNN(n_hidden, n_hidden, num_layers=1, nonlinearity="relu", bidirectional=True) + self.fc4 = FullyConnected(n_hidden, n_hidden, dropout) + self.out = torch.nn.Linear(n_hidden, n_class) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x (torch.Tensor): Tensor of dimension (batch, channel, time, feature). + Returns: + Tensor: Predictor tensor of dimension (batch, time, class). 
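        Example (illustrative sketch; the sizes below are assumed toy values, not defaults):
            >>> model = DeepSpeech(n_feature=40, n_hidden=256, n_class=29)
            >>> x = torch.rand(8, 1, 100, 40)  # (batch, channel, time, feature)
            >>> model(x).shape
            torch.Size([8, 100, 29])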
+ """ + # N x C x T x F + x = self.fc1(x) + # N x C x T x H + x = self.fc2(x) + # N x C x T x H + x = self.fc3(x) + # N x C x T x H + x = x.squeeze(1) + # N x T x H + x = x.transpose(0, 1) + # T x N x H + x, _ = self.bi_rnn(x) + # The fifth (non-recurrent) layer takes both the forward and backward units as inputs + x = x[:, :, : self.n_hidden] + x[:, :, self.n_hidden :] + # T x N x H + x = self.fc4(x) + # T x N x H + x = self.out(x) + # T x N x n_class + x = x.permute(1, 0, 2) + # N x T x n_class + x = torch.nn.functional.log_softmax(x, dim=2) + # N x T x n_class + return x diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/emformer.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/emformer.py new file mode 100644 index 0000000000000000000000000000000000000000..72de3ddcb7ae027859d0a76dfcfd157ff5a91d08 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/emformer.py @@ -0,0 +1,876 @@ +import math +from typing import List, Optional, Tuple + +import torch + + +__all__ = ["Emformer"] + + +def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: + batch_size = lengths.shape[0] + max_length = int(torch.max(lengths).item()) + padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( + batch_size, max_length + ) >= lengths.unsqueeze(1) + return padding_mask + + +def _gen_padding_mask( + utterance: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + lengths: torch.Tensor, + mems: torch.Tensor, + left_context_key: Optional[torch.Tensor] = None, +) -> Optional[torch.Tensor]: + T = right_context.size(0) + utterance.size(0) + summary.size(0) + B = right_context.size(1) + if B == 1: + padding_mask = None + else: + right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0) + left_context_blocks_length = left_context_key.size(0) if left_context_key is not None else 0 + klengths = lengths + mems.size(0) + right_context_blocks_length + left_context_blocks_length + padding_mask = _lengths_to_padding_mask(lengths=klengths) + return padding_mask + + +def _get_activation_module(activation: str) -> torch.nn.Module: + if activation == "relu": + return torch.nn.ReLU() + elif activation == "gelu": + return torch.nn.GELU() + elif activation == "silu": + return torch.nn.SiLU() + else: + raise ValueError(f"Unsupported activation {activation}") + + +def _get_weight_init_gains(weight_init_scale_strategy: Optional[str], num_layers: int) -> List[Optional[float]]: + if weight_init_scale_strategy is None: + return [None for _ in range(num_layers)] + elif weight_init_scale_strategy == "depthwise": + return [1.0 / math.sqrt(layer_idx + 1) for layer_idx in range(num_layers)] + elif weight_init_scale_strategy == "constant": + return [1.0 / math.sqrt(2) for layer_idx in range(num_layers)] + else: + raise ValueError(f"Unsupported weight_init_scale_strategy value {weight_init_scale_strategy}") + + +def _gen_attention_mask_block( + col_widths: List[int], col_mask: List[bool], num_rows: int, device: torch.device +) -> torch.Tensor: + assert len(col_widths) == len(col_mask), "Length of col_widths must match that of col_mask" + + mask_block = [ + torch.ones(num_rows, col_width, device=device) + if is_ones_col + else torch.zeros(num_rows, col_width, device=device) + for col_width, is_ones_col in zip(col_widths, col_mask) + ] + return torch.cat(mask_block, dim=1) + + +class _EmformerAttention(torch.nn.Module): + 
r"""Emformer layer attention module. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads in each Emformer layer. + dropout (float, optional): dropout probability. (Default: 0.0) + weight_init_gain (float or None, optional): scale factor to apply when initializing + attention module parameters. (Default: ``None``) + tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``) + negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + dropout: float = 0.0, + weight_init_gain: Optional[float] = None, + tanh_on_mem: bool = False, + negative_inf: float = -1e8, + ): + super().__init__() + + if input_dim % num_heads != 0: + raise ValueError(f"input_dim ({input_dim}) is not a multiple of num_heads ({num_heads}).") + + self.input_dim = input_dim + self.num_heads = num_heads + self.dropout = dropout + self.tanh_on_mem = tanh_on_mem + self.negative_inf = negative_inf + + self.scaling = (self.input_dim // self.num_heads) ** -0.5 + + self.emb_to_key_value = torch.nn.Linear(input_dim, 2 * input_dim, bias=True) + self.emb_to_query = torch.nn.Linear(input_dim, input_dim, bias=True) + self.out_proj = torch.nn.Linear(input_dim, input_dim, bias=True) + + if weight_init_gain: + torch.nn.init.xavier_uniform_(self.emb_to_key_value.weight, gain=weight_init_gain) + torch.nn.init.xavier_uniform_(self.emb_to_query.weight, gain=weight_init_gain) + + def _gen_key_value(self, input: torch.Tensor, mems: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + T, _, _ = input.shape + summary_length = mems.size(0) + 1 + right_ctx_utterance_block = input[: T - summary_length] + mems_right_ctx_utterance_block = torch.cat([mems, right_ctx_utterance_block]) + key, value = self.emb_to_key_value(mems_right_ctx_utterance_block).chunk(chunks=2, dim=2) + return key, value + + def _gen_attention_probs( + self, + attention_weights: torch.Tensor, + attention_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor], + ) -> torch.Tensor: + attention_weights_float = attention_weights.float() + attention_weights_float = attention_weights_float.masked_fill(attention_mask.unsqueeze(0), self.negative_inf) + T = attention_weights.size(1) + B = attention_weights.size(0) // self.num_heads + if padding_mask is not None: + attention_weights_float = attention_weights_float.view(B, self.num_heads, T, -1) + attention_weights_float = attention_weights_float.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), self.negative_inf + ) + attention_weights_float = attention_weights_float.view(B * self.num_heads, T, -1) + attention_probs = torch.nn.functional.softmax(attention_weights_float, dim=-1).type_as(attention_weights) + return torch.nn.functional.dropout(attention_probs, p=float(self.dropout), training=self.training) + + def _forward_impl( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + mems: torch.Tensor, + attention_mask: torch.Tensor, + left_context_key: Optional[torch.Tensor] = None, + left_context_val: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + B = utterance.size(1) + T = right_context.size(0) + utterance.size(0) + summary.size(0) + + # Compute query with [right context, utterance, summary]. 
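# Along the time axis, the query spans R + T + S rows (right context, utterance,
# summary), while the key/value computed below span M + R + T rows (mems, right
# context, utterance); during streaming inference, cached left-context keys/values
# are spliced in immediately after the mems + right-context rows.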
+ query = self.emb_to_query(torch.cat([right_context, utterance, summary])) + + # Compute key and value with [mems, right context, utterance]. + key, value = self.emb_to_key_value(torch.cat([mems, right_context, utterance])).chunk(chunks=2, dim=2) + + if left_context_key is not None and left_context_val is not None: + right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0) + key = torch.cat( + [ + key[: mems.size(0) + right_context_blocks_length], + left_context_key, + key[mems.size(0) + right_context_blocks_length :], + ], + ) + value = torch.cat( + [ + value[: mems.size(0) + right_context_blocks_length], + left_context_val, + value[mems.size(0) + right_context_blocks_length :], + ], + ) + + # Compute attention weights from query, key, and value. + reshaped_query, reshaped_key, reshaped_value = [ + tensor.contiguous().view(-1, B * self.num_heads, self.input_dim // self.num_heads).transpose(0, 1) + for tensor in [query, key, value] + ] + attention_weights = torch.bmm(reshaped_query * self.scaling, reshaped_key.transpose(1, 2)) + + # Compute padding mask. + padding_mask = _gen_padding_mask(utterance, right_context, summary, lengths, mems, left_context_key) + + # Compute attention probabilities. + attention_probs = self._gen_attention_probs(attention_weights, attention_mask, padding_mask) + + # Compute attention. + attention = torch.bmm(attention_probs, reshaped_value) + assert attention.shape == ( + B * self.num_heads, + T, + self.input_dim // self.num_heads, + ) + attention = attention.transpose(0, 1).contiguous().view(T, B, self.input_dim) + + # Apply output projection. + output_right_context_mems = self.out_proj(attention) + + summary_length = summary.size(0) + output_right_context = output_right_context_mems[: T - summary_length] + output_mems = output_right_context_mems[T - summary_length :] + if self.tanh_on_mem: + output_mems = torch.tanh(output_mems) + else: + output_mems = torch.clamp(output_mems, min=-10, max=10) + + return output_right_context, output_mems, key, value + + def forward( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + mems: torch.Tensor, + attention_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + S: number of summary elements; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + summary (torch.Tensor): summary elements, with shape `(S, B, D)`. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + attention_mask (torch.Tensor): attention mask for underlying attention module. + + Returns: + (Tensor, Tensor): + Tensor + output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`. + Tensor + updated memory elements, with shape `(M, B, D)`. 
+ """ + output, output_mems, _, _ = self._forward_impl(utterance, lengths, right_context, summary, mems, attention_mask) + return output, output_mems[:-1] + + @torch.jit.export + def infer( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + summary: torch.Tensor, + mems: torch.Tensor, + left_context_key: torch.Tensor, + left_context_val: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for inference. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + S: number of summary elements; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + summary (torch.Tensor): summary elements, with shape `(S, B, D)`. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + left_context_key (torch.Tensor): left context attention key computed from preceding invocation. + left_context_val (torch.Tensor): left context attention value computed from preceding invocation. + + Returns: + (Tensor, Tensor, Tensor, and Tensor): + Tensor + output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`. + Tensor + updated memory elements, with shape `(M, B, D)`. + Tensor + attention key computed for left context and utterance. + Tensor + attention value computed for left context and utterance. + """ + query_dim = right_context.size(0) + utterance.size(0) + summary.size(0) + key_dim = right_context.size(0) + utterance.size(0) + mems.size(0) + left_context_key.size(0) + attention_mask = torch.zeros(query_dim, key_dim).to(dtype=torch.bool, device=utterance.device) + attention_mask[-1, : mems.size(0)] = True + output, output_mems, key, value = self._forward_impl( + utterance, + lengths, + right_context, + summary, + mems, + attention_mask, + left_context_key=left_context_key, + left_context_val=left_context_val, + ) + return ( + output, + output_mems, + key[mems.size(0) + right_context.size(0) :], + value[mems.size(0) + right_context.size(0) :], + ) + + +class _EmformerLayer(torch.nn.Module): + r"""Emformer layer that constitutes Emformer. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads. + ffn_dim: (int): hidden layer dimension of feedforward network. + segment_length (int): length of each input segment. + dropout (float, optional): dropout probability. (Default: 0.0) + activation (str, optional): activation function to use in feedforward network. + Must be one of ("relu", "gelu", "silu"). (Default: "relu") + left_context_length (int, optional): length of left context. (Default: 0) + max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0) + weight_init_gain (float or None, optional): scale factor to apply when initializing + attention module parameters. (Default: ``None``) + tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``) + negative_inf (float, optional): value to use for negative infinity in attention weights. 
(Default: -1e8) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + ffn_dim: int, + segment_length: int, + dropout: float = 0.0, + activation: str = "relu", + left_context_length: int = 0, + max_memory_size: int = 0, + weight_init_gain: Optional[float] = None, + tanh_on_mem: bool = False, + negative_inf: float = -1e8, + ): + super().__init__() + + self.attention = _EmformerAttention( + input_dim=input_dim, + num_heads=num_heads, + dropout=dropout, + weight_init_gain=weight_init_gain, + tanh_on_mem=tanh_on_mem, + negative_inf=negative_inf, + ) + self.dropout = torch.nn.Dropout(dropout) + self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True) + + activation_module = _get_activation_module(activation) + self.pos_ff = torch.nn.Sequential( + torch.nn.LayerNorm(input_dim), + torch.nn.Linear(input_dim, ffn_dim), + activation_module, + torch.nn.Dropout(dropout), + torch.nn.Linear(ffn_dim, input_dim), + torch.nn.Dropout(dropout), + ) + self.layer_norm_input = torch.nn.LayerNorm(input_dim) + self.layer_norm_output = torch.nn.LayerNorm(input_dim) + + self.left_context_length = left_context_length + self.segment_length = segment_length + self.max_memory_size = max_memory_size + self.input_dim = input_dim + + self.use_mem = max_memory_size > 0 + + def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]: + empty_memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device) + left_context_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device) + left_context_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device) + past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device) + return [empty_memory, left_context_key, left_context_val, past_length] + + def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + past_length = state[3][0][0].item() + past_left_context_length = min(self.left_context_length, past_length) + past_mem_length = min(self.max_memory_size, math.ceil(past_length / self.segment_length)) + pre_mems = state[0][self.max_memory_size - past_mem_length :] + lc_key = state[1][self.left_context_length - past_left_context_length :] + lc_val = state[2][self.left_context_length - past_left_context_length :] + return pre_mems, lc_key, lc_val + + def _pack_state( + self, + next_k: torch.Tensor, + next_v: torch.Tensor, + update_length: int, + mems: torch.Tensor, + state: List[torch.Tensor], + ) -> List[torch.Tensor]: + new_k = torch.cat([state[1], next_k]) + new_v = torch.cat([state[2], next_v]) + state[0] = torch.cat([state[0], mems])[-self.max_memory_size :] + state[1] = new_k[new_k.shape[0] - self.left_context_length :] + state[2] = new_v[new_v.shape[0] - self.left_context_length :] + state[3] = state[3] + update_length + return state + + def _process_attention_output( + self, + rc_output: torch.Tensor, + utterance: torch.Tensor, + right_context: torch.Tensor, + ) -> torch.Tensor: + result = self.dropout(rc_output) + torch.cat([right_context, utterance]) + result = self.pos_ff(result) + result + result = self.layer_norm_output(result) + return result + + def _apply_pre_attention_layer_norm( + self, utterance: torch.Tensor, right_context: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + layer_norm_input = self.layer_norm_input(torch.cat([right_context, utterance])) + return ( + layer_norm_input[right_context.size(0) :], + layer_norm_input[: 
right_context.size(0)], + ) + + def _apply_post_attention_ffn( + self, rc_output: torch.Tensor, utterance: torch.Tensor, right_context: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + rc_output = self._process_attention_output(rc_output, utterance, right_context) + return rc_output[right_context.size(0) :], rc_output[: right_context.size(0)] + + def _apply_attention_forward( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + mems: torch.Tensor, + attention_mask: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + if attention_mask is None: + raise ValueError("attention_mask must be not None when for_inference is False") + + if self.use_mem: + summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1) + else: + summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device) + rc_output, next_m = self.attention( + utterance=utterance, + lengths=lengths, + right_context=right_context, + summary=summary, + mems=mems, + attention_mask=attention_mask, + ) + return rc_output, next_m + + def _apply_attention_infer( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + mems: torch.Tensor, + state: Optional[List[torch.Tensor]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: + if state is None: + state = self._init_state(utterance.size(1), device=utterance.device) + pre_mems, lc_key, lc_val = self._unpack_state(state) + if self.use_mem: + summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1) + summary = summary[:1] + else: + summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device) + rc_output, next_m, next_k, next_v = self.attention.infer( + utterance=utterance, + lengths=lengths, + right_context=right_context, + summary=summary, + mems=pre_mems, + left_context_key=lc_key, + left_context_val=lc_val, + ) + state = self._pack_state(next_k, next_v, utterance.size(0), mems, state) + return rc_output, next_m, state + + def forward( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + mems: torch.Tensor, + attention_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + attention_mask (torch.Tensor): attention mask for underlying attention module. + + Returns: + (Tensor, Tensor, Tensor): + Tensor + encoded utterance frames, with shape `(T, B, D)`. + Tensor + updated right context frames, with shape `(R, B, D)`. + Tensor + updated memory elements, with shape `(M, B, D)`. 
+ """ + ( + layer_norm_utterance, + layer_norm_right_context, + ) = self._apply_pre_attention_layer_norm(utterance, right_context) + rc_output, output_mems = self._apply_attention_forward( + layer_norm_utterance, + lengths, + layer_norm_right_context, + mems, + attention_mask, + ) + output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context) + return output_utterance, output_right_context, output_mems + + @torch.jit.export + def infer( + self, + utterance: torch.Tensor, + lengths: torch.Tensor, + right_context: torch.Tensor, + state: Optional[List[torch.Tensor]], + mems: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]: + r"""Forward pass for inference. + + B: batch size; + D: feature dimension of each frame; + T: number of utterance frames; + R: number of right context frames; + M: number of memory elements. + + Args: + utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``utterance``. + right_context (torch.Tensor): right context frames, with shape `(R, B, D)`. + state (List[torch.Tensor] or None): list of tensors representing layer internal state + generated in preceding invocation of ``infer``. + mems (torch.Tensor): memory elements, with shape `(M, B, D)`. + + Returns: + (Tensor, Tensor, List[torch.Tensor], Tensor): + Tensor + encoded utterance frames, with shape `(T, B, D)`. + Tensor + updated right context frames, with shape `(R, B, D)`. + List[Tensor] + list of tensors representing layer internal state + generated in current invocation of ``infer``. + Tensor + updated memory elements, with shape `(M, B, D)`. + """ + ( + layer_norm_utterance, + layer_norm_right_context, + ) = self._apply_pre_attention_layer_norm(utterance, right_context) + rc_output, output_mems, output_state = self._apply_attention_infer( + layer_norm_utterance, lengths, layer_norm_right_context, mems, state + ) + output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context) + return output_utterance, output_right_context, output_state, output_mems + + +class _EmformerImpl(torch.nn.Module): + def __init__( + self, + emformer_layers: torch.nn.ModuleList, + segment_length: int, + left_context_length: int = 0, + right_context_length: int = 0, + max_memory_size: int = 0, + ): + super().__init__() + + self.use_mem = max_memory_size > 0 + self.memory_op = torch.nn.AvgPool1d( + kernel_size=segment_length, + stride=segment_length, + ceil_mode=True, + ) + self.emformer_layers = emformer_layers + self.left_context_length = left_context_length + self.right_context_length = right_context_length + self.segment_length = segment_length + self.max_memory_size = max_memory_size + + def _gen_right_context(self, input: torch.Tensor) -> torch.Tensor: + T = input.shape[0] + num_segs = math.ceil((T - self.right_context_length) / self.segment_length) + right_context_blocks = [] + for seg_idx in range(num_segs - 1): + start = (seg_idx + 1) * self.segment_length + end = start + self.right_context_length + right_context_blocks.append(input[start:end]) + right_context_blocks.append(input[T - self.right_context_length :]) + return torch.cat(right_context_blocks) + + def _gen_attention_mask_col_widths(self, seg_idx: int, utterance_length: int) -> List[int]: + num_segs = math.ceil(utterance_length / self.segment_length) + rc = self.right_context_length + lc = 
self.left_context_length + rc_start = seg_idx * rc + rc_end = rc_start + rc + seg_start = max(seg_idx * self.segment_length - lc, 0) + seg_end = min((seg_idx + 1) * self.segment_length, utterance_length) + rc_length = self.right_context_length * num_segs + + if self.use_mem: + m_start = max(seg_idx - self.max_memory_size, 0) + mem_length = num_segs - 1 + col_widths = [ + m_start, # before memory + seg_idx - m_start, # memory + mem_length - seg_idx, # after memory + rc_start, # before right context + rc, # right context + rc_length - rc_end, # after right context + seg_start, # before query segment + seg_end - seg_start, # query segment + utterance_length - seg_end, # after query segment + ] + else: + col_widths = [ + rc_start, # before right context + rc, # right context + rc_length - rc_end, # after right context + seg_start, # before query segment + seg_end - seg_start, # query segment + utterance_length - seg_end, # after query segment + ] + + return col_widths + + def _gen_attention_mask(self, input: torch.Tensor) -> torch.Tensor: + utterance_length = input.size(0) + num_segs = math.ceil(utterance_length / self.segment_length) + + rc_mask = [] + query_mask = [] + summary_mask = [] + + if self.use_mem: + num_cols = 9 + # memory, right context, query segment + rc_q_cols_mask = [idx in [1, 4, 7] for idx in range(num_cols)] + # right context, query segment + s_cols_mask = [idx in [4, 7] for idx in range(num_cols)] + masks_to_concat = [rc_mask, query_mask, summary_mask] + else: + num_cols = 6 + # right context, query segment + rc_q_cols_mask = [idx in [1, 4] for idx in range(num_cols)] + s_cols_mask = None + masks_to_concat = [rc_mask, query_mask] + + for seg_idx in range(num_segs): + col_widths = self._gen_attention_mask_col_widths(seg_idx, utterance_length) + + rc_mask_block = _gen_attention_mask_block( + col_widths, rc_q_cols_mask, self.right_context_length, input.device + ) + rc_mask.append(rc_mask_block) + + query_mask_block = _gen_attention_mask_block( + col_widths, + rc_q_cols_mask, + min( + self.segment_length, + utterance_length - seg_idx * self.segment_length, + ), + input.device, + ) + query_mask.append(query_mask_block) + + if s_cols_mask is not None: + summary_mask_block = _gen_attention_mask_block(col_widths, s_cols_mask, 1, input.device) + summary_mask.append(summary_mask_block) + + attention_mask = (1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])).to(torch.bool) + return attention_mask + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass for training and non-streaming inference. + + B: batch size; + T: max number of input frames in batch; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): utterance frames right-padded with right context frames, with + shape `(B, T + right_context_length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid utterance frames for i-th batch element in ``input``. + + Returns: + (Tensor, Tensor): + Tensor + output frames, with shape `(B, T, D)`. + Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output frames. 
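        Example (illustrative sketch; segment_length=4 and right_context_length=2 are assumed toy values):
            >>> # With 10 total frames (8 utterance frames right-padded with 2 right-context
            >>> # frames), math.ceil((10 - 2) / 4) == 2 segments are formed, and
            >>> # _gen_right_context gathers frames [4, 5] (context of segment 0) and
            >>> # frames [8, 9] (context of the last segment) into one (4, B, D) tensor.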
+ """ + input = input.permute(1, 0, 2) + right_context = self._gen_right_context(input) + utterance = input[: input.size(0) - self.right_context_length] + attention_mask = self._gen_attention_mask(utterance) + mems = ( + self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1] + if self.use_mem + else torch.empty(0).to(dtype=input.dtype, device=input.device) + ) + output = utterance + for layer in self.emformer_layers: + output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask) + return output.permute(1, 0, 2), lengths + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass for streaming inference. + + B: batch size; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): utterance frames right-padded with right context frames, with + shape `(B, segment_length + right_context_length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + states (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation of ``infer``. (Default: ``None``) + + Returns: + (Tensor, Tensor, List[List[Tensor]]): + Tensor + output frames, with shape `(B, segment_length, D)`. + Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output frames. + List[List[Tensor]] + output states; list of lists of tensors representing internal state + generated in current invocation of ``infer``. + """ + assert input.size(1) == self.segment_length + self.right_context_length, ( + "Per configured segment_length and right_context_length" + f", expected size of {self.segment_length + self.right_context_length} for dimension 1 of input" + f", but got {input.size(1)}." + ) + input = input.permute(1, 0, 2) + right_context_start_idx = input.size(0) - self.right_context_length + right_context = input[right_context_start_idx:] + utterance = input[:right_context_start_idx] + output_lengths = torch.clamp(lengths - self.right_context_length, min=0) + mems = ( + self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1) + if self.use_mem + else torch.empty(0).to(dtype=input.dtype, device=input.device) + ) + output = utterance + output_states: List[List[torch.Tensor]] = [] + for layer_idx, layer in enumerate(self.emformer_layers): + output, right_context, output_state, mems = layer.infer( + output, + output_lengths, + right_context, + None if states is None else states[layer_idx], + mems, + ) + output_states.append(output_state) + + return output.permute(1, 0, 2), output_lengths, output_states + + +class Emformer(_EmformerImpl): + r"""Implements the Emformer architecture introduced in + *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition* + [:footcite:`shi2021emformer`]. + + Args: + input_dim (int): input dimension. + num_heads (int): number of attention heads in each Emformer layer. + ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network. + num_layers (int): number of Emformer layers to instantiate. + segment_length (int): length of each input segment. + dropout (float, optional): dropout probability. 
(Default: 0.0) + activation (str, optional): activation function to use in each Emformer layer's + feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu") + left_context_length (int, optional): length of left context. (Default: 0) + right_context_length (int, optional): length of right context. (Default: 0) + max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0) + weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling + strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise") + tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``) + negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8) + + Examples: + >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1) + >>> input = torch.rand(128, 400, 512) # batch, num_frames, feature_dim + >>> lengths = torch.randint(1, 200, (128,)) # batch + >>> output, lengths = emformer(input, lengths) + >>> input = torch.rand(128, 5, 512) + >>> lengths = torch.ones(128) * 5 + >>> output, lengths, states = emformer.infer(input, lengths, None) + """ + + def __init__( + self, + input_dim: int, + num_heads: int, + ffn_dim: int, + num_layers: int, + segment_length: int, + dropout: float = 0.0, + activation: str = "relu", + left_context_length: int = 0, + right_context_length: int = 0, + max_memory_size: int = 0, + weight_init_scale_strategy: Optional[str] = "depthwise", + tanh_on_mem: bool = False, + negative_inf: float = -1e8, + ): + weight_init_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers) + emformer_layers = torch.nn.ModuleList( + [ + _EmformerLayer( + input_dim, + num_heads, + ffn_dim, + segment_length, + dropout=dropout, + activation=activation, + left_context_length=left_context_length, + max_memory_size=max_memory_size, + weight_init_gain=weight_init_gains[layer_idx], + tanh_on_mem=tanh_on_mem, + negative_inf=negative_inf, + ) + for layer_idx in range(num_layers) + ] + ) + super().__init__( + emformer_layers, + segment_length, + left_context_length=left_context_length, + right_context_length=right_context_length, + max_memory_size=max_memory_size, + ) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt.py new file mode 100644 index 0000000000000000000000000000000000000000..e7cfee6dd9a253b8829492962110c2f494f29e16 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt.py @@ -0,0 +1,813 @@ +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple + +import torch +from torchaudio.models import Emformer + + +__all__ = ["RNNT", "emformer_rnnt_base", "emformer_rnnt_model"] + + +class _TimeReduction(torch.nn.Module): + r"""Coalesces frames along time dimension into a + fewer number of frames with higher feature dimensionality. + + Args: + stride (int): number of frames to merge for each output frame. + """ + + def __init__(self, stride: int) -> None: + super().__init__() + self.stride = stride + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass. + + B: batch size; + T: maximum input sequence length in batch; + D: feature dimension of each input sequence frame. 
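        Example (illustrative sketch; stride and shapes are assumed toy values):
            >>> time_reduction = _TimeReduction(stride=4)
            >>> input = torch.rand(2, 10, 3)  # (B, T, D); the trailing 10 % 4 = 2 frames are dropped
            >>> lengths = torch.tensor([10, 7])
            >>> output, out_lengths = time_reduction(input, lengths)
            >>> output.shape, out_lengths
            (torch.Size([2, 2, 12]), tensor([2, 1]))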
+ + Args: + input (torch.Tensor): input sequences, with shape `(B, T, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor + output sequences, with shape + `(B, T // stride, D * stride)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in output sequences. + """ + B, T, D = input.shape + num_frames = T - (T % self.stride) + input = input[:, :num_frames, :] + lengths = lengths.div(self.stride, rounding_mode="trunc") + T_max = num_frames // self.stride + + output = input.reshape(B, T_max, D * self.stride) + output = output.contiguous() + return output, lengths + + +class _CustomLSTM(torch.nn.Module): + r"""Custom long-short-term memory (LSTM) block that applies layer normalization + to internal nodes. + + Args: + input_dim (int): input dimension. + hidden_dim (int): hidden dimension. + layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``) + layer_norm_epsilon (float, optional): value of epsilon to use in + layer normalization layers (Default: 1e-5) + """ + + def __init__( + self, + input_dim: int, + hidden_dim: int, + layer_norm: bool = False, + layer_norm_epsilon: float = 1e-5, + ) -> None: + super().__init__() + self.x2g = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=(not layer_norm)) + self.p2g = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=False) + if layer_norm: + self.c_norm = torch.nn.LayerNorm(hidden_dim, eps=layer_norm_epsilon) + self.g_norm = torch.nn.LayerNorm(4 * hidden_dim, eps=layer_norm_epsilon) + else: + self.c_norm = torch.nn.Identity() + self.g_norm = torch.nn.Identity() + + self.hidden_dim = hidden_dim + + def forward( + self, input: torch.Tensor, state: Optional[List[torch.Tensor]] + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + r"""Forward pass. + + B: batch size; + T: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): with shape `(T, B, D)`. + state (List[torch.Tensor] or None): list of tensors + representing internal state generated in preceding invocation + of ``forward``. + + Returns: + (torch.Tensor, List[torch.Tensor]): + torch.Tensor + output, with shape `(T, B, hidden_dim)`. + List[torch.Tensor] + list of tensors representing internal state generated + in current invocation of ``forward``. 
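        Example (illustrative sketch; dimensions are assumed toy values):
            >>> lstm = _CustomLSTM(input_dim=8, hidden_dim=16, layer_norm=True)
            >>> x = torch.rand(5, 3, 8)  # (T, B, D)
            >>> output, state = lstm(x, None)  # output: (5, 3, 16); state: [h, c], each (3, 16)
            >>> output, state = lstm(torch.rand(5, 3, 8), state)  # continue from the returned state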
+ """ + if state is None: + B = input.size(1) + h = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype) + c = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype) + else: + h, c = state + + gated_input = self.x2g(input) + outputs = [] + for gates in gated_input.unbind(0): + gates = gates + self.p2g(h) + gates = self.g_norm(gates) + input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1) + input_gate = input_gate.sigmoid() + forget_gate = forget_gate.sigmoid() + cell_gate = cell_gate.tanh() + output_gate = output_gate.sigmoid() + c = forget_gate * c + input_gate * cell_gate + c = self.c_norm(c) + h = output_gate * c.tanh() + outputs.append(h) + + output = torch.stack(outputs, dim=0) + state = [h, c] + + return output, state + + +class _Transcriber(ABC): + @abstractmethod + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + pass + + @abstractmethod + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + pass + + +class _EmformerEncoder(torch.nn.Module, _Transcriber): + r"""Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network). + + Args: + input_dim (int): feature dimension of each input sequence element. + output_dim (int): feature dimension of each output sequence element. + segment_length (int): length of input segment expressed as number of frames. + right_context_length (int): length of right context expressed as number of frames. + time_reduction_input_dim (int): dimension to scale each element in input sequences to + prior to applying time reduction block. + time_reduction_stride (int): factor by which to reduce length of input sequence. + transformer_num_heads (int): number of attention heads in each Emformer layer. + transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network. + transformer_num_layers (int): number of Emformer layers to instantiate. + transformer_left_context_length (int): length of left context. + transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0) + transformer_activation (str, optional): activation function to use in each Emformer layer's + feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu") + transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0) + transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling + strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise") + transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. 
(Default: ``False``) + """ + + def __init__( + self, + *, + input_dim: int, + output_dim: int, + segment_length: int, + right_context_length: int, + time_reduction_input_dim: int, + time_reduction_stride: int, + transformer_num_heads: int, + transformer_ffn_dim: int, + transformer_num_layers: int, + transformer_left_context_length: int, + transformer_dropout: float = 0.0, + transformer_activation: str = "relu", + transformer_max_memory_size: int = 0, + transformer_weight_init_scale_strategy: str = "depthwise", + transformer_tanh_on_mem: bool = False, + ) -> None: + super().__init__() + self.input_linear = torch.nn.Linear( + input_dim, + time_reduction_input_dim, + bias=False, + ) + self.time_reduction = _TimeReduction(time_reduction_stride) + transformer_input_dim = time_reduction_input_dim * time_reduction_stride + self.transformer = Emformer( + transformer_input_dim, + transformer_num_heads, + transformer_ffn_dim, + transformer_num_layers, + segment_length // time_reduction_stride, + dropout=transformer_dropout, + activation=transformer_activation, + left_context_length=transformer_left_context_length, + right_context_length=right_context_length // time_reduction_stride, + max_memory_size=transformer_max_memory_size, + weight_init_scale_strategy=transformer_weight_init_scale_strategy, + tanh_on_mem=transformer_tanh_on_mem, + ) + self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim) + self.layer_norm = torch.nn.LayerNorm(output_dim) + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum input sequence length in batch; + D: feature dimension of each input sequence frame (input_dim). + + Args: + input (torch.Tensor): input frame sequences right-padded with right context, with + shape `(B, T + right context length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output input lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output frame sequences. + """ + input_linear_out = self.input_linear(input) + time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths) + transformer_out, transformer_lengths = self.transformer(time_reduction_out, time_reduction_lengths) + output_linear_out = self.output_linear(transformer_out) + layer_norm_out = self.layer_norm(output_linear_out) + return layer_norm_out, transformer_lengths + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass for inference. + + B: batch size; + T: maximum input sequence segment length in batch; + D: feature dimension of each input sequence frame (input_dim). + + Args: + input (torch.Tensor): input frame sequence segments right-padded with right context, with + shape `(B, T + right context length, D)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None): list of lists of tensors + representing internal state generated in preceding invocation + of ``infer``. 
+ + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output input lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation + of ``infer``. + """ + input_linear_out = self.input_linear(input) + time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths) + ( + transformer_out, + transformer_lengths, + transformer_states, + ) = self.transformer.infer(time_reduction_out, time_reduction_lengths, states) + output_linear_out = self.output_linear(transformer_out) + layer_norm_out = self.layer_norm(output_linear_out) + return layer_norm_out, transformer_lengths, transformer_states + + +class _Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Args: + num_symbols (int): size of target token lexicon. + output_dim (int): feature dimension of each output sequence element. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_hidden_dim (int): output dimension of each LSTM layer. + lstm_layer_norm (bool, optional): if ``True``, enables layer normalization + for LSTM layers. (Default: ``False``) + lstm_layer_norm_epsilon (float, optional): value of epsilon to use in + LSTM layer normalization layers. (Default: 1e-5) + lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0) + + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_layer_norm: bool = False, + lstm_layer_norm_epsilon: float = 1e-5, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + _CustomLSTM( + symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + lstm_hidden_dim, + layer_norm=lstm_layer_norm, + layer_norm_epsilon=lstm_layer_norm_epsilon, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. 
(Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class _Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class RNNT(torch.nn.Module): + r"""torchaudio.models.RNNT() + + Recurrent neural network transducer (RNN-T) model. 
+ + Note: + To build the model, please use one of the factory functions. + + Args: + transcriber (torch.nn.Module): transcription network. + predictor (torch.nn.Module): prediction network. + joiner (torch.nn.Module): joint network. + """ + + def __init__(self, transcriber: _Transcriber, predictor: _Predictor, joiner: _Joiner) -> None: + super().__init__() + self.transcriber = transcriber + self.predictor = predictor + self.joiner = joiner + + def forward( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + targets: torch.Tensor, + target_lengths: torch.Tensor, + predictor_state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: feature dimension of each source sequence element. + + Args: + sources (torch.Tensor): source frame sequences right-padded with right context, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``sources``. + targets (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``targets``. + predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing prediction network internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + joint network output, with shape + `(B, max output source length, max output target length, output_dim (number of target symbols))`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing prediction network internal state generated in current invocation + of ``forward``. + """ + source_encodings, source_lengths = self.transcriber( + input=sources, + lengths=source_lengths, + ) + target_encodings, target_lengths, predictor_state = self.predictor( + input=targets, + lengths=target_lengths, + state=predictor_state, + ) + output, source_lengths, target_lengths = self.joiner( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) + + return ( + output, + source_lengths, + target_lengths, + predictor_state, + ) + + @torch.jit.export + def transcribe_streaming( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Applies transcription network to sources in streaming mode. + + B: batch size; + T: maximum source sequence segment length in batch; + D: feature dimension of each source sequence frame. + + Args: + sources (torch.Tensor): source frame sequence segments right-padded with right context, with + shape `(B, T + right context length, D)`. 
+ source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``sources``. + state (List[List[torch.Tensor]] or None): list of lists of tensors + representing transcription network internal state generated in preceding invocation + of ``transcribe_streaming``. + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing transcription network internal state generated in current invocation + of ``transcribe_streaming``. + """ + return self.transcriber.infer(sources, source_lengths, state) + + @torch.jit.export + def transcribe( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Applies transcription network to sources in non-streaming mode. + + B: batch size; + T: maximum source sequence length in batch; + D: feature dimension of each source sequence frame. + + Args: + sources (torch.Tensor): source frame sequences right-padded with right context, with + shape `(B, T + right context length, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``sources``. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor + output frame sequences, with + shape `(B, T // time_reduction_stride, output_dim)`. + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output frame sequences. + """ + return self.transcriber(sources, source_lengths) + + @torch.jit.export + def predict( + self, + targets: torch.Tensor, + target_lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Applies prediction network to targets. + + B: batch size; + U: maximum target sequence length in batch; + D: feature dimension of each target sequence frame. + + Args: + targets (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``targets``. + state (List[List[torch.Tensor]] or None): list of lists of tensors + representing internal state generated in preceding invocation + of ``predict``. + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output frame sequences, with shape `(B, U, output_dim)`. + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``predict``. + """ + return self.predictor(input=targets, lengths=target_lengths, state=state) + + @torch.jit.export + def join( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Applies joint network to source and target encodings. 
+ + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + output, source_lengths, target_lengths = self.joiner( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) + return output, source_lengths, target_lengths + + +def emformer_rnnt_model( + *, + input_dim: int, + encoding_dim: int, + num_symbols: int, + segment_length: int, + right_context_length: int, + time_reduction_input_dim: int, + time_reduction_stride: int, + transformer_num_heads: int, + transformer_ffn_dim: int, + transformer_num_layers: int, + transformer_dropout: float, + transformer_activation: str, + transformer_left_context_length: int, + transformer_max_memory_size: int, + transformer_weight_init_scale_strategy: str, + transformer_tanh_on_mem: bool, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_layer_norm: bool, + lstm_layer_norm_epsilon: float, + lstm_dropout: float, +) -> RNNT: + r"""Builds Emformer-based recurrent neural network transducer (RNN-T) model. + + Note: + For non-streaming inference, the expectation is for `transcribe` to be called on input + sequences right-concatenated with `right_context_length` frames. + + For streaming inference, the expectation is for `transcribe_streaming` to be called + on input chunks comprising `segment_length` frames right-concatenated with `right_context_length` + frames. + + Args: + input_dim (int): dimension of input sequence frames passed to transcription network. + encoding_dim (int): dimension of transcription- and prediction-network-generated encodings + passed to joint network. + num_symbols (int): cardinality of set of target tokens. + segment_length (int): length of input segment expressed as number of frames. + right_context_length (int): length of right context expressed as number of frames. + time_reduction_input_dim (int): dimension to scale each element in input sequences to + prior to applying time reduction block. + time_reduction_stride (int): factor by which to reduce length of input sequence. + transformer_num_heads (int): number of attention heads in each Emformer layer. + transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network. + transformer_num_layers (int): number of Emformer layers to instantiate. + transformer_left_context_length (int): length of left context considered by Emformer. 
+ transformer_dropout (float): Emformer dropout probability. + transformer_activation (str): activation function to use in each Emformer layer's + feedforward network. Must be one of ("relu", "gelu", "silu"). + transformer_max_memory_size (int): maximum number of memory elements to use. + transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling + strategy. Must be one of ("depthwise", "constant", ``None``). + transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers. + lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers. + lstm_dropout (float): LSTM dropout probability. + + Returns: + RNNT: + Emformer RNN-T model. + """ + encoder = _EmformerEncoder( + input_dim=input_dim, + output_dim=encoding_dim, + segment_length=segment_length, + right_context_length=right_context_length, + time_reduction_input_dim=time_reduction_input_dim, + time_reduction_stride=time_reduction_stride, + transformer_num_heads=transformer_num_heads, + transformer_ffn_dim=transformer_ffn_dim, + transformer_num_layers=transformer_num_layers, + transformer_dropout=transformer_dropout, + transformer_activation=transformer_activation, + transformer_left_context_length=transformer_left_context_length, + transformer_max_memory_size=transformer_max_memory_size, + transformer_weight_init_scale_strategy=transformer_weight_init_scale_strategy, + transformer_tanh_on_mem=transformer_tanh_on_mem, + ) + predictor = _Predictor( + num_symbols, + encoding_dim, + symbol_embedding_dim=symbol_embedding_dim, + num_lstm_layers=num_lstm_layers, + lstm_hidden_dim=symbol_embedding_dim, + lstm_layer_norm=lstm_layer_norm, + lstm_layer_norm_epsilon=lstm_layer_norm_epsilon, + lstm_dropout=lstm_dropout, + ) + joiner = _Joiner(encoding_dim, num_symbols) + return RNNT(encoder, predictor, joiner) + + +def emformer_rnnt_base(num_symbols: int) -> RNNT: + r"""Builds basic version of Emformer RNN-T model. + + Args: + num_symbols (int): The size of target token lexicon. + + Returns: + RNNT: + Emformer RNN-T model. 
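+
+    Example
+        An illustrative sketch only; the vocabulary size, frame counts, and random
+        feature values below are arbitrary placeholders rather than values required
+        by this factory function.
+
+        >>> rnnt = emformer_rnnt_base(num_symbols=500)
+        >>> # one utterance: 128 valid frames plus 4 right-context frames of
+        >>> # 80-dimensional features (matching input_dim=80 used by this factory).
+        >>> sources = torch.rand(1, 132, 80)
+        >>> source_lengths = torch.tensor([132])
+        >>> enc_out, enc_lengths = rnnt.transcribe(sources, source_lengths)
+        >>> # enc_out: (1, reduced frame count, 1024); enc_lengths: (1,)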
+ """ + return emformer_rnnt_model( + input_dim=80, + encoding_dim=1024, + num_symbols=num_symbols, + segment_length=16, + right_context_length=4, + time_reduction_input_dim=128, + time_reduction_stride=4, + transformer_num_heads=8, + transformer_ffn_dim=2048, + transformer_num_layers=20, + transformer_dropout=0.1, + transformer_activation="gelu", + transformer_left_context_length=30, + transformer_max_memory_size=0, + transformer_weight_init_scale_strategy="depthwise", + transformer_tanh_on_mem=True, + symbol_embedding_dim=512, + num_lstm_layers=3, + lstm_layer_norm=True, + lstm_layer_norm_epsilon=1e-3, + lstm_dropout=0.3, + ) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt_decoder.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f44afc8c190dc2ad432ee2cef299a8d7364efc65 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/rnnt_decoder.py @@ -0,0 +1,340 @@ +from typing import Callable, Dict, List, Optional, Tuple + +import torch +from torchaudio.models import RNNT + + +__all__ = ["Hypothesis", "RNNTBeamSearch"] + + +Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float] +Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder, + represented as tuple of (tokens, prediction network output, prediction network state, score). + """ + + +def _get_hypo_tokens(hypo: Hypothesis) -> List[int]: + return hypo[0] + + +def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor: + return hypo[1] + + +def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]: + return hypo[2] + + +def _get_hypo_score(hypo: Hypothesis) -> float: + return hypo[3] + + +def _get_hypo_key(hypo: Hypothesis) -> str: + return str(hypo[0]) + + +def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: + states: List[List[torch.Tensor]] = [] + for i in range(len(_get_hypo_state(hypos[0]))): + batched_state_components: List[torch.Tensor] = [] + for j in range(len(_get_hypo_state(hypos[0])[i])): + batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) + states.append(batched_state_components) + return states + + +def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: + idx_tensor = torch.tensor([idx], device=device) + return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] + + +def _default_hypo_sort_key(hypo: Hypothesis) -> float: + return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) + + +def _compute_updated_scores( + hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + beam_width: int, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) + nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1] + nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width) + nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc") + nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1] + return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token + + +def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None: + for i, elem in enumerate(hypo_list): + if _get_hypo_key(hypo) == 
_get_hypo_key(elem): + del hypo_list[i] + break + + +class RNNTBeamSearch(torch.nn.Module): + r"""Beam search decoder for RNN-T model. + + Args: + model (RNNT): RNN-T model to use. + blank (int): index of blank token in vocabulary. + temperature (float, optional): temperature to apply to joint network output. + Larger values yield more uniform samples. (Default: 1.0) + hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score + for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns + hypothesis score normalized by token sequence length. (Default: None) + step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100) + """ + + def __init__( + self, + model: RNNT, + blank: int, + temperature: float = 1.0, + hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None, + step_max_tokens: int = 100, + ) -> None: + super().__init__() + self.model = model + self.blank = blank + self.temperature = temperature + + if hypo_sort_key is None: + self.hypo_sort_key = _default_hypo_sort_key + else: + self.hypo_sort_key = hypo_sort_key + + self.step_max_tokens = step_max_tokens + + def _init_b_hypos(self, hypo: Optional[Hypothesis], device: torch.device) -> List[Hypothesis]: + if hypo is not None: + token = _get_hypo_tokens(hypo)[-1] + state = _get_hypo_state(hypo) + else: + token = self.blank + state = None + + one_tensor = torch.tensor([1], device=device) + pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state) + init_hypo = ( + [token], + pred_out[0].detach(), + pred_state, + 0.0, + ) + return [init_hypo] + + def _gen_next_token_probs( + self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + ) -> torch.Tensor: + one_tensor = torch.tensor([1], device=device) + predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) + joined_out, _, _ = self.model.join( + enc_out, + one_tensor, + predictor_out, + torch.tensor([1] * len(hypos), device=device), + ) # [beam_width, 1, 1, num_tokens] + joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3) + return joined_out[:, 0, 0] + + def _gen_b_hypos( + self, + b_hypos: List[Hypothesis], + a_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + key_to_b_hypo: Dict[str, Hypothesis], + ) -> List[Hypothesis]: + for i in range(len(a_hypos)): + h_a = a_hypos[i] + append_blank_score = _get_hypo_score(h_a) + next_token_probs[i, -1] + if _get_hypo_key(h_a) in key_to_b_hypo: + h_b = key_to_b_hypo[_get_hypo_key(h_a)] + _remove_hypo(h_b, b_hypos) + score = float(torch.tensor(_get_hypo_score(h_b)).logaddexp(append_blank_score)) + else: + score = float(append_blank_score) + h_b = ( + _get_hypo_tokens(h_a), + _get_hypo_predictor_out(h_a), + _get_hypo_state(h_a), + score, + ) + b_hypos.append(h_b) + key_to_b_hypo[_get_hypo_key(h_b)] = h_b + _, sorted_idx = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]).sort() + return [b_hypos[idx] for idx in sorted_idx] + + def _gen_a_hypos( + self, + a_hypos: List[Hypothesis], + b_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + t: int, + beam_width: int, + device: torch.device, + ) -> List[Hypothesis]: + ( + nonblank_nbest_scores, + nonblank_nbest_hypo_idx, + nonblank_nbest_token, + ) = _compute_updated_scores(a_hypos, next_token_probs, beam_width) + + if len(b_hypos) < beam_width: + b_nbest_score = -float("inf") + else: + b_nbest_score = _get_hypo_score(b_hypos[-beam_width]) + + 
base_hypos: List[Hypothesis] = [] + new_tokens: List[int] = [] + new_scores: List[float] = [] + for i in range(beam_width): + score = float(nonblank_nbest_scores[i]) + if score > b_nbest_score: + a_hypo_idx = int(nonblank_nbest_hypo_idx[i]) + base_hypos.append(a_hypos[a_hypo_idx]) + new_tokens.append(int(nonblank_nbest_token[i])) + new_scores.append(score) + + if base_hypos: + new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device) + else: + new_hypos: List[Hypothesis] = [] + + return new_hypos + + def _gen_new_hypos( + self, + base_hypos: List[Hypothesis], + tokens: List[int], + scores: List[float], + t: int, + device: torch.device, + ) -> List[Hypothesis]: + tgt_tokens = torch.tensor([[token] for token in tokens], device=device) + states = _batch_state(base_hypos) + pred_out, _, pred_states = self.model.predict( + tgt_tokens, + torch.tensor([1] * len(base_hypos), device=device), + states, + ) + new_hypos: List[Hypothesis] = [] + for i, h_a in enumerate(base_hypos): + new_tokens = _get_hypo_tokens(h_a) + [tokens[i]] + new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i])) + return new_hypos + + def _search( + self, + enc_out: torch.Tensor, + hypo: Optional[Hypothesis], + beam_width: int, + ) -> List[Hypothesis]: + n_time_steps = enc_out.shape[1] + device = enc_out.device + + a_hypos: List[Hypothesis] = [] + b_hypos = self._init_b_hypos(hypo, device) + for t in range(n_time_steps): + a_hypos = b_hypos + b_hypos = torch.jit.annotate(List[Hypothesis], []) + key_to_b_hypo: Dict[str, Hypothesis] = {} + symbols_current_t = 0 + + while a_hypos: + next_token_probs = self._gen_next_token_probs(enc_out[:, t : t + 1], a_hypos, device) + next_token_probs = next_token_probs.cpu() + b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) + + if symbols_current_t == self.step_max_tokens: + break + + a_hypos = self._gen_a_hypos( + a_hypos, + b_hypos, + next_token_probs, + t, + beam_width, + device, + ) + if a_hypos: + symbols_current_t += 1 + + _, sorted_idx = torch.tensor([self.hypo_sort_key(hypo) for hypo in b_hypos]).topk(beam_width) + b_hypos = [b_hypos[idx] for idx in sorted_idx] + + return b_hypos + + def forward(self, input: torch.Tensor, length: torch.Tensor, beam_width: int) -> List[Hypothesis]: + r"""Performs beam search for the given input sequence. + + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D). + length (torch.Tensor): number of valid frames in input + sequence, with shape () or (1,). + beam_width (int): beam size to use during search. + + Returns: + List[Hypothesis]: top-``beam_width`` hypotheses found by beam search. + """ + assert input.dim() == 2 or ( + input.dim() == 3 and input.shape[0] == 1 + ), "input must be of shape (T, D) or (1, T, D)" + if input.dim() == 2: + input = input.unsqueeze(0) + + assert length.shape == () or length.shape == (1,), "length must be of shape () or (1,)" + if length.dim() == 0: + length = length.unsqueeze(0) + + enc_out, _ = self.model.transcribe(input, length) + return self._search(enc_out, None, beam_width) + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + length: torch.Tensor, + beam_width: int, + state: Optional[List[List[torch.Tensor]]] = None, + hypothesis: Optional[Hypothesis] = None, + ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]: + r"""Performs beam search for the given input sequence in streaming mode.
+ + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D). + length (torch.Tensor): number of valid frames in input + sequence, with shape () or (1,). + beam_width (int): beam size to use during search. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing transcription network internal state generated in preceding + invocation. (Default: ``None``) + hypothesis (Hypothesis or None): hypothesis from preceding invocation to seed + search with. (Default: ``None``) + + Returns: + (List[Hypothesis], List[List[torch.Tensor]]): + List[Hypothesis] + top-``beam_width`` hypotheses found by beam search. + List[List[torch.Tensor]] + list of lists of tensors representing transcription network + internal state generated in current invocation. + """ + assert input.dim() == 2 or ( + input.dim() == 3 and input.shape[0] == 1 + ), "input must be of shape (T, D) or (1, T, D)" + if input.dim() == 2: + input = input.unsqueeze(0) + + assert length.shape == () or length.shape == (1,), "length must be of shape () or (1,)" + if length.dim() == 0: + length = length.unsqueeze(0) + + enc_out, _, state = self.model.transcribe_streaming(input, length, state) + return self._search(enc_out, hypothesis, beam_width), state diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/tacotron2.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/tacotron2.py new file mode 100644 index 0000000000000000000000000000000000000000..e2bcc01a1471c3da70ad4ae1d566d47bc0d6bf5a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/tacotron2.py @@ -0,0 +1,1046 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** + +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn, Tensor +from torch.nn import functional as F + + +__all__ = [ + "Tacotron2", +] + + +def _get_linear_layer(in_dim: int, out_dim: int, bias: bool = True, w_init_gain: str = "linear") -> torch.nn.Linear: + r"""Linear layer with xavier uniform initialization. + + Args: + in_dim (int): Size of each input sample. + out_dim (int): Size of each output sample. + bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``) + w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain`` + for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``) + + Returns: + (torch.nn.Linear): The corresponding linear layer. + """ + linear = torch.nn.Linear(in_dim, out_dim, bias=bias) + torch.nn.init.xavier_uniform_(linear.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + return linear + + +def _get_conv1d_layer( + in_channels: int, + out_channels: int, + kernel_size: int = 1, + stride: int = 1, + padding: Optional[Union[str, int, Tuple[int]]] = None, + dilation: int = 1, + bias: bool = True, + w_init_gain: str = "linear", +) -> torch.nn.Conv1d: + r"""1D convolution with xavier uniform initialization. + + Args: + in_channels (int): Number of channels in the input image. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int, optional): Number of channels in the input image. (Default: ``1``) + stride (int, optional): Number of channels in the input image. (Default: ``1``) + padding (str, int or tuple, optional): Padding added to both sides of the input. + (Default: dilation * (kernel_size - 1) / 2) + dilation (int, optional): Number of channels in the input image. (Default: ``1``) + w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain`` + for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``) + + Returns: + (torch.nn.Conv1d): The corresponding Conv1D layer. + """ + if padding is None: + assert kernel_size % 2 == 1 + padding = int(dilation * (kernel_size - 1) / 2) + + conv1d = torch.nn.Conv1d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + + torch.nn.init.xavier_uniform_(conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + return conv1d + + +def _get_mask_from_lengths(lengths: Tensor) -> Tensor: + r"""Returns a binary mask based on ``lengths``. The ``i``-th row and ``j``-th column of the mask + is ``1`` if ``j`` is smaller than ``i``-th element of ``lengths. + + Args: + lengths (Tensor): The length of each element in the batch, with shape (n_batch, ). + + Returns: + mask (Tensor): The binary mask, with shape (n_batch, max of ``lengths``). + """ + max_len = torch.max(lengths).item() + ids = torch.arange(0, max_len, device=lengths.device, dtype=lengths.dtype) + mask = (ids < lengths.unsqueeze(1)).byte() + mask = torch.le(mask, 0) + return mask + + +class _LocationLayer(nn.Module): + r"""Location layer used in the Attention model. + + Args: + attention_n_filter (int): Number of filters for attention model. + attention_kernel_size (int): Kernel size for attention model. + attention_hidden_dim (int): Dimension of attention hidden representation. 
+ """ + + def __init__( + self, + attention_n_filter: int, + attention_kernel_size: int, + attention_hidden_dim: int, + ): + super().__init__() + padding = int((attention_kernel_size - 1) / 2) + self.location_conv = _get_conv1d_layer( + 2, + attention_n_filter, + kernel_size=attention_kernel_size, + padding=padding, + bias=False, + stride=1, + dilation=1, + ) + self.location_dense = _get_linear_layer( + attention_n_filter, attention_hidden_dim, bias=False, w_init_gain="tanh" + ) + + def forward(self, attention_weights_cat: Tensor) -> Tensor: + r"""Location layer used in the Attention model. + + Args: + attention_weights_cat (Tensor): Cumulative and previous attention weights + with shape (n_batch, 2, max of ``text_lengths``). + + Returns: + processed_attention (Tensor): Cumulative and previous attention weights + with shape (n_batch, ``attention_hidden_dim``). + """ + # (n_batch, attention_n_filter, text_lengths.max()) + processed_attention = self.location_conv(attention_weights_cat) + processed_attention = processed_attention.transpose(1, 2) + # (n_batch, text_lengths.max(), attention_hidden_dim) + processed_attention = self.location_dense(processed_attention) + return processed_attention + + +class _Attention(nn.Module): + r"""Locally sensitive attention model. + + Args: + attention_rnn_dim (int): Number of hidden units for RNN. + encoder_embedding_dim (int): Number of embedding dimensions in the Encoder. + attention_hidden_dim (int): Dimension of attention hidden representation. + attention_location_n_filter (int): Number of filters for Attention model. + attention_location_kernel_size (int): Kernel size for Attention model. + """ + + def __init__( + self, + attention_rnn_dim: int, + encoder_embedding_dim: int, + attention_hidden_dim: int, + attention_location_n_filter: int, + attention_location_kernel_size: int, + ) -> None: + super().__init__() + self.query_layer = _get_linear_layer(attention_rnn_dim, attention_hidden_dim, bias=False, w_init_gain="tanh") + self.memory_layer = _get_linear_layer( + encoder_embedding_dim, attention_hidden_dim, bias=False, w_init_gain="tanh" + ) + self.v = _get_linear_layer(attention_hidden_dim, 1, bias=False) + self.location_layer = _LocationLayer( + attention_location_n_filter, + attention_location_kernel_size, + attention_hidden_dim, + ) + self.score_mask_value = -float("inf") + + def _get_alignment_energies(self, query: Tensor, processed_memory: Tensor, attention_weights_cat: Tensor) -> Tensor: + r"""Get the alignment vector. + + Args: + query (Tensor): Decoder output with shape (n_batch, n_mels * n_frames_per_step). + processed_memory (Tensor): Processed Encoder outputs + with shape (n_batch, max of ``text_lengths``, attention_hidden_dim). + attention_weights_cat (Tensor): Cumulative and previous attention weights + with shape (n_batch, 2, max of ``text_lengths``). + + Returns: + alignment (Tensor): attention weights, it is a tensor with shape (batch, max of ``text_lengths``). + """ + + processed_query = self.query_layer(query.unsqueeze(1)) + processed_attention_weights = self.location_layer(attention_weights_cat) + energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory)) + + alignment = energies.squeeze(2) + return alignment + + def forward( + self, + attention_hidden_state: Tensor, + memory: Tensor, + processed_memory: Tensor, + attention_weights_cat: Tensor, + mask: Tensor, + ) -> Tuple[Tensor, Tensor]: + r"""Pass the input through the Attention model. 
+ + Args: + attention_hidden_state (Tensor): Attention rnn last output with shape (n_batch, ``attention_rnn_dim``). + memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + processed_memory (Tensor): Processed Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``). + attention_weights_cat (Tensor): Previous and cumulative attention weights + with shape (n_batch, current_num_frames * 2, max of ``text_lengths``). + mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames). + + Returns: + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + """ + alignment = self._get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat) + + alignment = alignment.masked_fill(mask, self.score_mask_value) + + attention_weights = F.softmax(alignment, dim=1) + attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) + attention_context = attention_context.squeeze(1) + + return attention_context, attention_weights + + +class _Prenet(nn.Module): + r"""Prenet Module. It is consists of ``len(output_size)`` linear layers. + + Args: + in_dim (int): The size of each input sample. + output_sizes (list): The output dimension of each linear layers. + """ + + def __init__(self, in_dim: int, out_sizes: List[int]) -> None: + super().__init__() + in_sizes = [in_dim] + out_sizes[:-1] + self.layers = nn.ModuleList( + [_get_linear_layer(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, out_sizes)] + ) + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through Prenet. + + Args: + x (Tensor): The input sequence to Prenet with shape (n_batch, in_dim). + + Return: + x (Tensor): Tensor with shape (n_batch, sizes[-1]) + """ + + for linear in self.layers: + x = F.dropout(F.relu(linear(x)), p=0.5, training=True) + return x + + +class _Postnet(nn.Module): + r"""Postnet Module. + + Args: + n_mels (int): Number of mel bins. + postnet_embedding_dim (int): Postnet embedding dimension. + postnet_kernel_size (int): Postnet kernel size. + postnet_n_convolution (int): Number of postnet convolutions. + """ + + def __init__( + self, + n_mels: int, + postnet_embedding_dim: int, + postnet_kernel_size: int, + postnet_n_convolution: int, + ): + super().__init__() + self.convolutions = nn.ModuleList() + + for i in range(postnet_n_convolution): + in_channels = n_mels if i == 0 else postnet_embedding_dim + out_channels = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim + init_gain = "linear" if i == (postnet_n_convolution - 1) else "tanh" + num_features = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim + self.convolutions.append( + nn.Sequential( + _get_conv1d_layer( + in_channels, + out_channels, + kernel_size=postnet_kernel_size, + stride=1, + padding=int((postnet_kernel_size - 1) / 2), + dilation=1, + w_init_gain=init_gain, + ), + nn.BatchNorm1d(num_features), + ) + ) + + self.n_convs = len(self.convolutions) + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through Postnet. + + Args: + x (Tensor): The input sequence with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + + Return: + x (Tensor): Tensor with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). 
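+
+        Example
+            An illustrative sketch; the constructor values and tensor shape below are
+            arbitrary example numbers, not defaults defined by this class.
+
+            >>> postnet = _Postnet(n_mels=80, postnet_embedding_dim=512,
+            ...                    postnet_kernel_size=5, postnet_n_convolution=5)
+            >>> specgram = torch.rand(10, 80, 20)
+            >>> out = postnet(specgram)  # shape: (10, 80, 20); frame count is preserved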
+ """ + + for i, conv in enumerate(self.convolutions): + if i < self.n_convs - 1: + x = F.dropout(torch.tanh(conv(x)), 0.5, training=self.training) + else: + x = F.dropout(conv(x), 0.5, training=self.training) + + return x + + +class _Encoder(nn.Module): + r"""Encoder Module. + + Args: + encoder_embedding_dim (int): Number of embedding dimensions in the encoder. + encoder_n_convolution (int): Number of convolution layers in the encoder. + encoder_kernel_size (int): The kernel size in the encoder. + + Examples + >>> encoder = _Encoder(3, 512, 5) + >>> input = torch.rand(10, 20, 30) + >>> output = encoder(input) # shape: (10, 30, 512) + """ + + def __init__( + self, + encoder_embedding_dim: int, + encoder_n_convolution: int, + encoder_kernel_size: int, + ) -> None: + super().__init__() + + self.convolutions = nn.ModuleList() + for _ in range(encoder_n_convolution): + conv_layer = nn.Sequential( + _get_conv1d_layer( + encoder_embedding_dim, + encoder_embedding_dim, + kernel_size=encoder_kernel_size, + stride=1, + padding=int((encoder_kernel_size - 1) / 2), + dilation=1, + w_init_gain="relu", + ), + nn.BatchNorm1d(encoder_embedding_dim), + ) + self.convolutions.append(conv_layer) + + self.lstm = nn.LSTM( + encoder_embedding_dim, + int(encoder_embedding_dim / 2), + 1, + batch_first=True, + bidirectional=True, + ) + self.lstm.flatten_parameters() + + def forward(self, x: Tensor, input_lengths: Tensor) -> Tensor: + r"""Pass the input through the Encoder. + + Args: + x (Tensor): The input sequences with shape (n_batch, encoder_embedding_dim, n_seq). + input_lengths (Tensor): The length of each input sequence with shape (n_batch, ). + + Return: + x (Tensor): A tensor with shape (n_batch, n_seq, encoder_embedding_dim). + """ + + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) + + input_lengths = input_lengths.cpu() + x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True) + + outputs, _ = self.lstm(x) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) + + return outputs + + +class _Decoder(nn.Module): + r"""Decoder with Attention model. + + Args: + n_mels (int): number of mel bins + n_frames_per_step (int): number of frames processed per step, only 1 is supported + encoder_embedding_dim (int): the number of embedding dimensions in the encoder. 
+ decoder_rnn_dim (int): number of units in decoder LSTM + decoder_max_step (int): maximum number of output mel spectrograms + decoder_dropout (float): dropout probability for decoder LSTM + decoder_early_stopping (bool): stop decoding when all samples are finished + attention_rnn_dim (int): number of units in attention LSTM + attention_hidden_dim (int): dimension of attention hidden representation + attention_location_n_filter (int): number of filters for attention model + attention_location_kernel_size (int): kernel size for attention model + attention_dropout (float): dropout probability for attention LSTM + prenet_dim (int): number of ReLU units in prenet layers + gate_threshold (float): probability threshold for stop token + """ + + def __init__( + self, + n_mels: int, + n_frames_per_step: int, + encoder_embedding_dim: int, + decoder_rnn_dim: int, + decoder_max_step: int, + decoder_dropout: float, + decoder_early_stopping: bool, + attention_rnn_dim: int, + attention_hidden_dim: int, + attention_location_n_filter: int, + attention_location_kernel_size: int, + attention_dropout: float, + prenet_dim: int, + gate_threshold: float, + ) -> None: + + super().__init__() + self.n_mels = n_mels + self.n_frames_per_step = n_frames_per_step + self.encoder_embedding_dim = encoder_embedding_dim + self.attention_rnn_dim = attention_rnn_dim + self.decoder_rnn_dim = decoder_rnn_dim + self.prenet_dim = prenet_dim + self.decoder_max_step = decoder_max_step + self.gate_threshold = gate_threshold + self.attention_dropout = attention_dropout + self.decoder_dropout = decoder_dropout + self.decoder_early_stopping = decoder_early_stopping + + self.prenet = _Prenet(n_mels * n_frames_per_step, [prenet_dim, prenet_dim]) + + self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim) + + self.attention_layer = _Attention( + attention_rnn_dim, + encoder_embedding_dim, + attention_hidden_dim, + attention_location_n_filter, + attention_location_kernel_size, + ) + + self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, True) + + self.linear_projection = _get_linear_layer(decoder_rnn_dim + encoder_embedding_dim, n_mels * n_frames_per_step) + + self.gate_layer = _get_linear_layer( + decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain="sigmoid" + ) + + def _get_initial_frame(self, memory: Tensor) -> Tensor: + r"""Gets all zeros frames to use as the first decoder input. + + Args: + memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + + Returns: + decoder_input (Tensor): all zeros frames with shape + (n_batch, max of ``text_lengths``, ``n_mels * n_frames_per_step``). + """ + + n_batch = memory.size(0) + dtype = memory.dtype + device = memory.device + decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device) + return decoder_input + + def _initialize_decoder_states( + self, memory: Tensor + ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + r"""Initializes attention rnn states, decoder rnn states, attention + weights, attention cumulative weights, attention context, stores memory + and stores processed memory. + + Args: + memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + + Returns: + attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). 
+ attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``). + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + processed_memory (Tensor): Processed encoder outputs + with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``). + """ + n_batch = memory.size(0) + max_time = memory.size(1) + dtype = memory.dtype + device = memory.device + + attention_hidden = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device) + attention_cell = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device) + + decoder_hidden = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device) + decoder_cell = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device) + + attention_weights = torch.zeros(n_batch, max_time, dtype=dtype, device=device) + attention_weights_cum = torch.zeros(n_batch, max_time, dtype=dtype, device=device) + attention_context = torch.zeros(n_batch, self.encoder_embedding_dim, dtype=dtype, device=device) + + processed_memory = self.attention_layer.memory_layer(memory) + + return ( + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + processed_memory, + ) + + def _parse_decoder_inputs(self, decoder_inputs: Tensor) -> Tensor: + r"""Prepares decoder inputs. + + Args: + decoder_inputs (Tensor): Inputs used for teacher-forced training, i.e. mel-specs, + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``) + + Returns: + inputs (Tensor): Processed decoder inputs with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``). 
+ """ + # (n_batch, n_mels, mel_specgram_lengths.max()) -> (n_batch, mel_specgram_lengths.max(), n_mels) + decoder_inputs = decoder_inputs.transpose(1, 2) + decoder_inputs = decoder_inputs.view( + decoder_inputs.size(0), + int(decoder_inputs.size(1) / self.n_frames_per_step), + -1, + ) + # (n_batch, mel_specgram_lengths.max(), n_mels) -> (mel_specgram_lengths.max(), n_batch, n_mels) + decoder_inputs = decoder_inputs.transpose(0, 1) + return decoder_inputs + + def _parse_decoder_outputs( + self, mel_specgram: Tensor, gate_outputs: Tensor, alignments: Tensor + ) -> Tuple[Tensor, Tensor, Tensor]: + r"""Prepares decoder outputs for output + + Args: + mel_specgram (Tensor): mel spectrogram with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``) + gate_outputs (Tensor): predicted stop token with shape (max of ``mel_specgram_lengths``, n_batch) + alignments (Tensor): sequence of attention weights from the decoder + with shape (max of ``mel_specgram_lengths``, n_batch, max of ``text_lengths``) + + Returns: + mel_specgram (Tensor): mel spectrogram with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``) + gate_outputs (Tensor): predicted stop token with shape (n_batch, max of ``mel_specgram_lengths``) + alignments (Tensor): sequence of attention weights from the decoder + with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``) + """ + # (mel_specgram_lengths.max(), n_batch, text_lengths.max()) + # -> (n_batch, mel_specgram_lengths.max(), text_lengths.max()) + alignments = alignments.transpose(0, 1).contiguous() + # (mel_specgram_lengths.max(), n_batch) -> (n_batch, mel_specgram_lengths.max()) + gate_outputs = gate_outputs.transpose(0, 1).contiguous() + # (mel_specgram_lengths.max(), n_batch, n_mels) -> (n_batch, mel_specgram_lengths.max(), n_mels) + mel_specgram = mel_specgram.transpose(0, 1).contiguous() + # decouple frames per step + shape = (mel_specgram.shape[0], -1, self.n_mels) + mel_specgram = mel_specgram.view(*shape) + # (n_batch, mel_specgram_lengths.max(), n_mels) -> (n_batch, n_mels, T_out) + mel_specgram = mel_specgram.transpose(1, 2) + + return mel_specgram, gate_outputs, alignments + + def decode( + self, + decoder_input: Tensor, + attention_hidden: Tensor, + attention_cell: Tensor, + decoder_hidden: Tensor, + decoder_cell: Tensor, + attention_weights: Tensor, + attention_weights_cum: Tensor, + attention_context: Tensor, + memory: Tensor, + processed_memory: Tensor, + mask: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + r"""Decoder step using stored states, attention and memory + + Args: + decoder_input (Tensor): Output of the Prenet with shape (n_batch, ``prenet_dim``). + attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``). + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + memory (Tensor): Encoder output with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). 
+ processed_memory (Tensor): Processed Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``). + mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames). + + Returns: + decoder_output: Predicted mel spectrogram for the current frame with shape (n_batch, ``n_mels``). + gate_prediction (Tensor): Prediction of the stop token with shape (n_batch, ``1``). + attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``). + decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``). + attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``). + attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``). + attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``). + """ + cell_input = torch.cat((decoder_input, attention_context), -1) + + attention_hidden, attention_cell = self.attention_rnn(cell_input, (attention_hidden, attention_cell)) + attention_hidden = F.dropout(attention_hidden, self.attention_dropout, self.training) + + attention_weights_cat = torch.cat((attention_weights.unsqueeze(1), attention_weights_cum.unsqueeze(1)), dim=1) + attention_context, attention_weights = self.attention_layer( + attention_hidden, memory, processed_memory, attention_weights_cat, mask + ) + + attention_weights_cum += attention_weights + decoder_input = torch.cat((attention_hidden, attention_context), -1) + + decoder_hidden, decoder_cell = self.decoder_rnn(decoder_input, (decoder_hidden, decoder_cell)) + decoder_hidden = F.dropout(decoder_hidden, self.decoder_dropout, self.training) + + decoder_hidden_attention_context = torch.cat((decoder_hidden, attention_context), dim=1) + decoder_output = self.linear_projection(decoder_hidden_attention_context) + + gate_prediction = self.gate_layer(decoder_hidden_attention_context) + + return ( + decoder_output, + gate_prediction, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + ) + + def forward( + self, memory: Tensor, mel_specgram_truth: Tensor, memory_lengths: Tensor + ) -> Tuple[Tensor, Tensor, Tensor]: + r"""Decoder forward pass for training. + + Args: + memory (Tensor): Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + mel_specgram_truth (Tensor): Decoder ground-truth mel-specs for teacher forcing + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + memory_lengths (Tensor): Encoder output lengths for attention masking + (the same as ``text_lengths``) with shape (n_batch, ). + + Returns: + mel_specgram (Tensor): Predicted mel spectrogram + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + gate_outputs (Tensor): Predicted stop token for each timestep + with shape (n_batch, max of ``mel_specgram_lengths``). + alignments (Tensor): Sequence of attention weights from the decoder + with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``). 
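+
+        Example
+            An illustrative sketch of the teacher-forced call; every constructor value
+            and tensor shape below is an arbitrary example, not a default of this class.
+
+            >>> decoder = _Decoder(n_mels=80, n_frames_per_step=1, encoder_embedding_dim=512,
+            ...                    decoder_rnn_dim=1024, decoder_max_step=2000, decoder_dropout=0.1,
+            ...                    decoder_early_stopping=True, attention_rnn_dim=1024,
+            ...                    attention_hidden_dim=128, attention_location_n_filter=32,
+            ...                    attention_location_kernel_size=31, attention_dropout=0.1,
+            ...                    prenet_dim=256, gate_threshold=0.5)
+            >>> memory = torch.rand(2, 50, 512)      # encoder outputs
+            >>> mel_truth = torch.rand(2, 80, 120)   # ground-truth mel spectrograms
+            >>> memory_lengths = torch.tensor([50, 50])
+            >>> mel, gate, align = decoder(memory, mel_truth, memory_lengths)
+            >>> # mel: (2, 80, 120); gate: (2, 120); align: (2, 120, 50)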
+ """ + + decoder_input = self._get_initial_frame(memory).unsqueeze(0) + decoder_inputs = self._parse_decoder_inputs(mel_specgram_truth) + decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0) + decoder_inputs = self.prenet(decoder_inputs) + + mask = _get_mask_from_lengths(memory_lengths) + ( + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + processed_memory, + ) = self._initialize_decoder_states(memory) + + mel_outputs, gate_outputs, alignments = [], [], [] + while len(mel_outputs) < decoder_inputs.size(0) - 1: + decoder_input = decoder_inputs[len(mel_outputs)] + ( + mel_output, + gate_output, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + ) = self.decode( + decoder_input, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + memory, + processed_memory, + mask, + ) + + mel_outputs += [mel_output.squeeze(1)] + gate_outputs += [gate_output.squeeze(1)] + alignments += [attention_weights] + + mel_specgram, gate_outputs, alignments = self._parse_decoder_outputs( + torch.stack(mel_outputs), torch.stack(gate_outputs), torch.stack(alignments) + ) + + return mel_specgram, gate_outputs, alignments + + def _get_go_frame(self, memory: Tensor) -> Tensor: + """Gets all zeros frames to use as the first decoder input + + args: + memory (Tensor): Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + + returns: + decoder_input (Tensor): All zeros frames with shape(n_batch, ``n_mels`` * ``n_frame_per_step``). + """ + + n_batch = memory.size(0) + dtype = memory.dtype + device = memory.device + decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device) + return decoder_input + + @torch.jit.export + def infer(self, memory: Tensor, memory_lengths: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Decoder inference + + Args: + memory (Tensor): Encoder outputs + with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``). + memory_lengths (Tensor): Encoder output lengths for attention masking + (the same as ``text_lengths``) with shape (n_batch, ). + + Returns: + mel_specgram (Tensor): Predicted mel spectrogram + with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``). + mel_specgram_lengths (Tensor): the length of the predicted mel spectrogram (n_batch, )) + gate_outputs (Tensor): Predicted stop token for each timestep + with shape (n_batch, max of ``mel_specgram_lengths``). + alignments (Tensor): Sequence of attention weights from the decoder + with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``). 
+ """ + batch_size, device = memory.size(0), memory.device + + decoder_input = self._get_go_frame(memory) + + mask = _get_mask_from_lengths(memory_lengths) + ( + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + processed_memory, + ) = self._initialize_decoder_states(memory) + + mel_specgram_lengths = torch.zeros([batch_size], dtype=torch.int32, device=device) + finished = torch.zeros([batch_size], dtype=torch.bool, device=device) + mel_specgrams: List[Tensor] = [] + gate_outputs: List[Tensor] = [] + alignments: List[Tensor] = [] + for _ in range(self.decoder_max_step): + decoder_input = self.prenet(decoder_input) + ( + mel_specgram, + gate_output, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + ) = self.decode( + decoder_input, + attention_hidden, + attention_cell, + decoder_hidden, + decoder_cell, + attention_weights, + attention_weights_cum, + attention_context, + memory, + processed_memory, + mask, + ) + + mel_specgrams.append(mel_specgram.unsqueeze(0)) + gate_outputs.append(gate_output.transpose(0, 1)) + alignments.append(attention_weights) + mel_specgram_lengths[~finished] += 1 + + finished |= torch.sigmoid(gate_output.squeeze(1)) > self.gate_threshold + if self.decoder_early_stopping and torch.all(finished): + break + + decoder_input = mel_specgram + + if len(mel_specgrams) == self.decoder_max_step: + warnings.warn( + "Reached max decoder steps. The generated spectrogram might not cover " "the whole transcript." + ) + + mel_specgrams = torch.cat(mel_specgrams, dim=0) + gate_outputs = torch.cat(gate_outputs, dim=0) + alignments = torch.cat(alignments, dim=0) + + mel_specgrams, gate_outputs, alignments = self._parse_decoder_outputs(mel_specgrams, gate_outputs, alignments) + + return mel_specgrams, mel_specgram_lengths, gate_outputs, alignments + + +class Tacotron2(nn.Module): + r"""Tacotron2 model based on the implementation from + `Nvidia `_. + + The original implementation was introduced in + *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions* + [:footcite:`shen2018natural`]. + + Args: + mask_padding (bool, optional): Use mask padding (Default: ``False``). + n_mels (int, optional): Number of mel bins (Default: ``80``). + n_symbol (int, optional): Number of symbols for the input text (Default: ``148``). + n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``). + symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``). + encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``). + encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``). + encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``). + decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``). + decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``). + decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``). + decoder_early_stopping (bool, optional): Continue decoding after all samples are finished (Default: ``True``). + attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``). + attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``). 
+ attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``). + attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``). + attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``). + prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``). + postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``). + postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``). + postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``). + gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``). + """ + + def __init__( + self, + mask_padding: bool = False, + n_mels: int = 80, + n_symbol: int = 148, + n_frames_per_step: int = 1, + symbol_embedding_dim: int = 512, + encoder_embedding_dim: int = 512, + encoder_n_convolution: int = 3, + encoder_kernel_size: int = 5, + decoder_rnn_dim: int = 1024, + decoder_max_step: int = 2000, + decoder_dropout: float = 0.1, + decoder_early_stopping: bool = True, + attention_rnn_dim: int = 1024, + attention_hidden_dim: int = 128, + attention_location_n_filter: int = 32, + attention_location_kernel_size: int = 31, + attention_dropout: float = 0.1, + prenet_dim: int = 256, + postnet_n_convolution: int = 5, + postnet_kernel_size: int = 5, + postnet_embedding_dim: int = 512, + gate_threshold: float = 0.5, + ) -> None: + super().__init__() + + self.mask_padding = mask_padding + self.n_mels = n_mels + self.n_frames_per_step = n_frames_per_step + self.embedding = nn.Embedding(n_symbol, symbol_embedding_dim) + torch.nn.init.xavier_uniform_(self.embedding.weight) + self.encoder = _Encoder(encoder_embedding_dim, encoder_n_convolution, encoder_kernel_size) + self.decoder = _Decoder( + n_mels, + n_frames_per_step, + encoder_embedding_dim, + decoder_rnn_dim, + decoder_max_step, + decoder_dropout, + decoder_early_stopping, + attention_rnn_dim, + attention_hidden_dim, + attention_location_n_filter, + attention_location_kernel_size, + attention_dropout, + prenet_dim, + gate_threshold, + ) + self.postnet = _Postnet(n_mels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolution) + + def forward( + self, + tokens: Tensor, + token_lengths: Tensor, + mel_specgram: Tensor, + mel_specgram_lengths: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + r"""Pass the input through the Tacotron2 model. This is in teacher + forcing mode, which is generally used for training. + + The input ``tokens`` should be padded with zeros to length max of ``token_lengths``. + The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``. + + Args: + tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`. + token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`. + mel_specgram (Tensor): The target mel spectrogram + with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`. + + Returns: + [Tensor, Tensor, Tensor, Tensor]: + Tensor + Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + Tensor + Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + Tensor + The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`. 
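# A teacher-forcing sketch (not part of the patch) for the Tacotron2 module defined above,
# with random tensors in the shapes documented by forward(). It assumes the class is
# re-exported as torchaudio.models.Tacotron2, as in upstream torchaudio; the weights are
# untrained, so only the output shapes are meaningful.
import torch
from torchaudio.models import Tacotron2

model = Tacotron2()
tokens = torch.randint(0, 148, (2, 20))        # (n_batch, max of token_lengths)
token_lengths = torch.tensor([20, 20])
mel = torch.rand(2, 80, 120)                   # (n_batch, n_mels, max of mel_specgram_lengths)
mel_lengths = torch.tensor([120, 120])

mel_before, mel_after, gate, alignments = model(tokens, token_lengths, mel, mel_lengths)
print(mel_after.shape, gate.shape, alignments.shape)
# torch.Size([2, 80, 120]) torch.Size([2, 120]) torch.Size([2, 120, 20])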
+ Tensor + Sequence of attention weights from the decoder with + shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`. + """ + + embedded_inputs = self.embedding(tokens).transpose(1, 2) + + encoder_outputs = self.encoder(embedded_inputs, token_lengths) + mel_specgram, gate_outputs, alignments = self.decoder( + encoder_outputs, mel_specgram, memory_lengths=token_lengths + ) + + mel_specgram_postnet = self.postnet(mel_specgram) + mel_specgram_postnet = mel_specgram + mel_specgram_postnet + + if self.mask_padding: + mask = _get_mask_from_lengths(mel_specgram_lengths) + mask = mask.expand(self.n_mels, mask.size(0), mask.size(1)) + mask = mask.permute(1, 0, 2) + + mel_specgram.masked_fill_(mask, 0.0) + mel_specgram_postnet.masked_fill_(mask, 0.0) + gate_outputs.masked_fill_(mask[:, 0, :], 1e3) + + return mel_specgram, mel_specgram_postnet, gate_outputs, alignments + + @torch.jit.export + def infer(self, tokens: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Tensor, Tensor]: + r"""Using Tacotron2 for inference. The input is a batch of encoded + sentences (``tokens``) and its corresponding lengths (``lengths``). The + output is the generated mel spectrograms, its corresponding lengths, and + the attention weights from the decoder. + + The input `tokens` should be padded with zeros to length max of ``lengths``. + + Args: + tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`. + lengths (Tensor or None, optional): + The valid length of each sample in ``tokens`` with shape `(n_batch, )`. + If ``None``, it is assumed that the all the tokens are valid. Default: ``None`` + + Returns: + (Tensor, Tensor, Tensor): + Tensor + The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`. + Tensor + The length of the predicted mel spectrogram with shape `(n_batch, )`. + Tensor + Sequence of attention weights from the decoder with shape + `(n_batch, max of mel_specgram_lengths, max of lengths)`. + """ + n_batch, max_length = tokens.shape + if lengths is None: + lengths = torch.tensor([max_length]).expand(n_batch).to(tokens.device, tokens.dtype) + + assert lengths is not None # For TorchScript compiler + + embedded_inputs = self.embedding(tokens).transpose(1, 2) + encoder_outputs = self.encoder(embedded_inputs, lengths) + mel_specgram, mel_specgram_lengths, _, alignments = self.decoder.infer(encoder_outputs, lengths) + + mel_outputs_postnet = self.postnet(mel_specgram) + mel_outputs_postnet = mel_specgram + mel_outputs_postnet + + alignments = alignments.unfold(1, n_batch, n_batch).transpose(0, 2) + + return mel_outputs_postnet, mel_specgram_lengths, alignments diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wav2letter.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wav2letter.py new file mode 100644 index 0000000000000000000000000000000000000000..922287002388b2869c77a11adf9d9b18deb8e5bd --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wav2letter.py @@ -0,0 +1,71 @@ +from torch import nn, Tensor + +__all__ = [ + "Wav2Letter", +] + + +class Wav2Letter(nn.Module): + r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech + Recognition System* [:footcite:`collobert2016wav2letter`]. + + :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}` + + Args: + num_classes (int, optional): Number of classes to be classified. 
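# An inference sketch (not part of the patch) for Tacotron2.infer above. The model is
# untrained, so the generated spectrogram is noise; the point is the returned shapes:
# mel (n_batch, n_mels, T), its lengths, and alignments (n_batch, T, max of lengths).
# decoder_max_step is lowered so the sketch terminates quickly even if no stop token fires.
import torch
from torchaudio.models import Tacotron2

model = Tacotron2(decoder_max_step=50).eval()
tokens = torch.randint(0, 148, (1, 30))
lengths = torch.tensor([30])
with torch.no_grad():
    mel, mel_lengths, alignments = model.infer(tokens, lengths)
print(mel.shape, mel_lengths, alignments.shape)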
(Default: ``40``) + input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum`` + or ``mfcc`` (Default: ``waveform``). + num_features (int, optional): Number of input features that the network will receive (Default: ``1``). + """ + + def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None: + super(Wav2Letter, self).__init__() + + acoustic_num_features = 250 if input_type == "waveform" else num_features + acoustic_model = nn.Sequential( + nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0), + nn.ReLU(inplace=True), + ) + + if input_type == "waveform": + waveform_model = nn.Sequential( + nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45), + nn.ReLU(inplace=True), + ) + self.acoustic_model = nn.Sequential(waveform_model, acoustic_model) + + if input_type in ["power_spectrum", "mfcc"]: + self.acoustic_model = acoustic_model + + def forward(self, x: Tensor) -> Tensor: + r""" + Args: + x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length). + + Returns: + Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length). + """ + + x = self.acoustic_model(x) + x = nn.functional.log_softmax(x, dim=1) + return x diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wavernn.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wavernn.py new file mode 100644 index 0000000000000000000000000000000000000000..aafdfaea15e39660abbbeac214eb903ecfd21190 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/models/wavernn.py @@ -0,0 +1,402 @@ +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +__all__ = [ + "ResBlock", + "MelResNet", + "Stretch2d", + "UpsampleNetwork", + "WaveRNN", +] + + +class ResBlock(nn.Module): + r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`]. + + Args: + n_freq: the number of bins in a spectrogram. 
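# A usage sketch (not part of the patch) for the Wav2Letter model defined above, following
# the shapes in its forward() docstring. With input_type="waveform", the strided front-end
# convolution shortens the time axis (here 16000 samples -> 51 output frames).
import torch
from torchaudio.models.wav2letter import Wav2Letter

model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
waveform = torch.rand(3, 1, 16000)   # (batch_size, num_features, input_length)
log_probs = model(waveform)          # log-probabilities over classes per output frame
print(log_probs.shape)               # torch.Size([3, 40, 51])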
(Default: ``128``) + + Examples + >>> resblock = ResBlock() + >>> input = torch.rand(10, 128, 512) # a random spectrogram + >>> output = resblock(input) # shape: (10, 128, 512) + """ + + def __init__(self, n_freq: int = 128) -> None: + super().__init__() + + self.resblock_model = nn.Sequential( + nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False), + nn.BatchNorm1d(n_freq), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False), + nn.BatchNorm1d(n_freq), + ) + + def forward(self, specgram: Tensor) -> Tensor: + r"""Pass the input through the ResBlock layer. + Args: + specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time). + + Return: + Tensor shape: (n_batch, n_freq, n_time) + """ + + return self.resblock_model(specgram) + specgram + + +class MelResNet(nn.Module): + r"""MelResNet layer uses a stack of ResBlocks on spectrogram. + + Args: + n_res_block: the number of ResBlock in stack. (Default: ``10``) + n_freq: the number of bins in a spectrogram. (Default: ``128``) + n_hidden: the number of hidden dimensions of resblock. (Default: ``128``) + n_output: the number of output dimensions of melresnet. (Default: ``128``) + kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``) + + Examples + >>> melresnet = MelResNet() + >>> input = torch.rand(10, 128, 512) # a random spectrogram + >>> output = melresnet(input) # shape: (10, 128, 508) + """ + + def __init__( + self, n_res_block: int = 10, n_freq: int = 128, n_hidden: int = 128, n_output: int = 128, kernel_size: int = 5 + ) -> None: + super().__init__() + + ResBlocks = [ResBlock(n_hidden) for _ in range(n_res_block)] + + self.melresnet_model = nn.Sequential( + nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False), + nn.BatchNorm1d(n_hidden), + nn.ReLU(inplace=True), + *ResBlocks, + nn.Conv1d(in_channels=n_hidden, out_channels=n_output, kernel_size=1), + ) + + def forward(self, specgram: Tensor) -> Tensor: + r"""Pass the input through the MelResNet layer. + Args: + specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time). + + Return: + Tensor shape: (n_batch, n_output, n_time - kernel_size + 1) + """ + + return self.melresnet_model(specgram) + + +class Stretch2d(nn.Module): + r"""Upscale the frequency and time dimensions of a spectrogram. + + Args: + time_scale: the scale factor in time dimension + freq_scale: the scale factor in frequency dimension + + Examples + >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5) + + >>> input = torch.rand(10, 100, 512) # a random spectrogram + >>> output = stretch2d(input) # shape: (10, 500, 5120) + """ + + def __init__(self, time_scale: int, freq_scale: int) -> None: + super().__init__() + + self.freq_scale = freq_scale + self.time_scale = time_scale + + def forward(self, specgram: Tensor) -> Tensor: + r"""Pass the input through the Stretch2d layer. + + Args: + specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time). + + Return: + Tensor shape: (..., n_freq * freq_scale, n_time * time_scale) + """ + + return specgram.repeat_interleave(self.freq_scale, -2).repeat_interleave(self.time_scale, -1) + + +class UpsampleNetwork(nn.Module): + r"""Upscale the dimensions of a spectrogram. + + Args: + upsample_scales: the list of upsample scales. + n_res_block: the number of ResBlock in stack. (Default: ``10``) + n_freq: the number of bins in a spectrogram. 
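# A small shape sketch (not part of the patch) tying together MelResNet and Stretch2d as
# UpsampleNetwork below uses them: MelResNet trims kernel_size - 1 frames, and Stretch2d
# repeats frequency bins and time frames by the given scale factors.
import torch
from torchaudio.models.wavernn import MelResNet, Stretch2d

specgram = torch.rand(2, 128, 20)                   # (n_batch, n_freq, n_time)
resnet_out = MelResNet()(specgram)                  # default kernel_size=5 -> 20 - 4 frames
print(resnet_out.shape)                             # torch.Size([2, 128, 16])

stretched = Stretch2d(time_scale=4, freq_scale=1)(resnet_out)
print(stretched.shape)                              # torch.Size([2, 128, 64])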
(Default: ``128``) + n_hidden: the number of hidden dimensions of resblock. (Default: ``128``) + n_output: the number of output dimensions of melresnet. (Default: ``128``) + kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``) + + Examples + >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16]) + >>> input = torch.rand(10, 128, 10) # a random spectrogram + >>> output = upsamplenetwork(input) # shape: (10, 128, 1536), (10, 128, 1536) + """ + + def __init__( + self, + upsample_scales: List[int], + n_res_block: int = 10, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + kernel_size: int = 5, + ) -> None: + super().__init__() + + total_scale = 1 + for upsample_scale in upsample_scales: + total_scale *= upsample_scale + self.total_scale: int = total_scale + + self.indent = (kernel_size - 1) // 2 * total_scale + self.resnet = MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.resnet_stretch = Stretch2d(total_scale, 1) + + up_layers = [] + for scale in upsample_scales: + stretch = Stretch2d(scale, 1) + conv = nn.Conv2d( + in_channels=1, out_channels=1, kernel_size=(1, scale * 2 + 1), padding=(0, scale), bias=False + ) + torch.nn.init.constant_(conv.weight, 1.0 / (scale * 2 + 1)) + up_layers.append(stretch) + up_layers.append(conv) + self.upsample_layers = nn.Sequential(*up_layers) + + def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]: + r"""Pass the input through the UpsampleNetwork layer. + + Args: + specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time) + + Return: + Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale), + (n_batch, n_output, (n_time - kernel_size + 1) * total_scale) + where total_scale is the product of all elements in upsample_scales. + """ + + resnet_output = self.resnet(specgram).unsqueeze(1) + resnet_output = self.resnet_stretch(resnet_output) + resnet_output = resnet_output.squeeze(1) + + specgram = specgram.unsqueeze(1) + upsampling_output = self.upsample_layers(specgram) + upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent : -self.indent] + + return upsampling_output, resnet_output + + +class WaveRNN(nn.Module): + r"""WaveRNN model based on the implementation from `fatchord `_. + + The original implementation was introduced in *Efficient Neural Audio Synthesis* + [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1. + The product of `upsample_scales` must equal `hop_length`. + + Args: + upsample_scales: the list of upsample scales. + n_classes: the number of output classes. + hop_length: the number of samples between the starts of consecutive frames. + n_res_block: the number of ResBlock in stack. (Default: ``10``) + n_rnn: the dimension of RNN layer. (Default: ``512``) + n_fc: the dimension of fully connected layer. (Default: ``512``) + kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``) + n_freq: the number of bins in a spectrogram. (Default: ``128``) + n_hidden: the number of hidden dimensions of resblock. (Default: ``128``) + n_output: the number of output dimensions of melresnet. 
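# A shape-check sketch (not part of the patch) for UpsampleNetwork above: both outputs span
# (n_time - kernel_size + 1) * total_scale frames, where total_scale is the product of
# upsample_scales (4 * 4 * 16 = 256 here). WaveRNN below requires this product to equal
# hop_length.
import torch
from torchaudio.models.wavernn import UpsampleNetwork

upsample = UpsampleNetwork(upsample_scales=[4, 4, 16])
specgram = torch.rand(2, 128, 10)                   # (n_batch, n_freq, n_time)
upsampled, aux = upsample(specgram)
print(upsampled.shape, aux.shape)                   # both torch.Size([2, 128, 1536])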
(Default: ``128``) + + Example + >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200) + >>> waveform, sample_rate = torchaudio.load(file) + >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) + >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) + >>> output = wavernn(waveform, specgram) + >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes) + """ + + def __init__( + self, + upsample_scales: List[int], + n_classes: int, + hop_length: int, + n_res_block: int = 10, + n_rnn: int = 512, + n_fc: int = 512, + kernel_size: int = 5, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + ) -> None: + super().__init__() + + self.kernel_size = kernel_size + self._pad = (kernel_size - 1 if kernel_size % 2 else kernel_size) // 2 + self.n_rnn = n_rnn + self.n_aux = n_output // 4 + self.hop_length = hop_length + self.n_classes = n_classes + self.n_bits: int = int(math.log2(self.n_classes)) + + total_scale = 1 + for upsample_scale in upsample_scales: + total_scale *= upsample_scale + if total_scale != self.hop_length: + raise ValueError(f"Expected: total_scale == hop_length, but found {total_scale} != {hop_length}") + + self.upsample = UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) + + self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True) + self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True) + + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + + self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc) + self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) + self.fc3 = nn.Linear(n_fc, self.n_classes) + + def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: + r"""Pass the input through the WaveRNN model. 
+ + Args: + waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length) + specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time) + + Return: + Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes) + """ + + assert waveform.size(1) == 1, "Require the input channel of waveform is 1" + assert specgram.size(1) == 1, "Require the input channel of specgram is 1" + # remove channel dimension until the end + waveform, specgram = waveform.squeeze(1), specgram.squeeze(1) + + batch_size = waveform.size(0) + h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + # output of upsample: + # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale) + # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale) + specgram, aux = self.upsample(specgram) + specgram = specgram.transpose(1, 2) + aux = aux.transpose(1, 2) + + aux_idx = [self.n_aux * i for i in range(5)] + a1 = aux[:, :, aux_idx[0] : aux_idx[1]] + a2 = aux[:, :, aux_idx[1] : aux_idx[2]] + a3 = aux[:, :, aux_idx[2] : aux_idx[3]] + a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + + x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1) + x = self.fc(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=-1) + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=-1) + x = self.fc1(x) + x = self.relu1(x) + + x = torch.cat([x, a4], dim=-1) + x = self.fc2(x) + x = self.relu2(x) + x = self.fc3(x) + + # bring back channel dimension + return x.unsqueeze(1) + + @torch.jit.export + def infer(self, specgram: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + r"""Inference method of WaveRNN. + + This function currently only supports multinomial sampling, which assumes the + network is trained on cross entropy loss. + + Args: + specgram (Tensor): + Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`. + lengths (Tensor or None, optional): + Indicates the valid length of each audio in the batch. + Shape: `(batch, )`. + When the ``specgram`` contains spectrograms with different durations, + by providing ``lengths`` argument, the model will compute + the corresponding valid output lengths. + If ``None``, it is assumed that all the audio in ``waveforms`` + have valid length. Default: ``None``. + + Returns: + (Tensor, Optional[Tensor]): + Tensor + The inferred waveform of size `(n_batch, 1, n_time)`. + 1 stands for a single channel. + Tensor or None + If ``lengths`` argument was provided, a Tensor of shape `(batch, )` + is returned. + It indicates the valid length in time axis of the output Tensor. 
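# A minimal shape sketch (not part of the patch) for WaveRNN.forward above, with random
# tensors standing in for torchaudio.load / MelSpectrogram outputs. The product of
# upsample_scales (5 * 5 * 8 = 200) equals hop_length, and both inputs carry one channel.
import torch
from torchaudio.models.wavernn import WaveRNN

wavernn = WaveRNN(upsample_scales=[5, 5, 8], n_classes=512, hop_length=200)
n_time = 10                                           # spectrogram frames
specgram = torch.rand(2, 1, 128, n_time)              # (n_batch, 1, n_freq, n_time)
waveform = torch.rand(2, 1, (n_time - 5 + 1) * 200)   # kernel_size defaults to 5
output = wavernn(waveform, specgram)
print(output.shape)                                   # torch.Size([2, 1, 1200, 512])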
+ """ + + device = specgram.device + dtype = specgram.dtype + + specgram = torch.nn.functional.pad(specgram, (self._pad, self._pad)) + specgram, aux = self.upsample(specgram) + if lengths is not None: + lengths = lengths * self.upsample.total_scale + + output: List[Tensor] = [] + b_size, _, seq_len = specgram.size() + + h1 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype) + h2 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype) + x = torch.zeros((b_size, 1), device=device, dtype=dtype) + + aux_split = [aux[:, self.n_aux * i : self.n_aux * (i + 1), :] for i in range(4)] + + for i in range(seq_len): + + m_t = specgram[:, :, i] + + a1_t, a2_t, a3_t, a4_t = [a[:, :, i] for a in aux_split] + + x = torch.cat([x, m_t, a1_t], dim=1) + x = self.fc(x) + _, h1 = self.rnn1(x.unsqueeze(1), h1) + + x = x + h1[0] + inp = torch.cat([x, a2_t], dim=1) + _, h2 = self.rnn2(inp.unsqueeze(1), h2) + + x = x + h2[0] + x = torch.cat([x, a3_t], dim=1) + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4_t], dim=1) + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + posterior = F.softmax(logits, dim=1) + + x = torch.multinomial(posterior, 1).float() + # Transform label [0, 2 ** n_bits - 1] to waveform [-1, 1] + x = 2 * x / (2**self.n_bits - 1.0) - 1.0 + + output.append(x) + + return torch.stack(output).permute(1, 2, 0), lengths diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d96bf37e4ab10a10463f871e5702982cd6b6da --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/__init__.py @@ -0,0 +1,71 @@ +from ._tts import ( + TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, + TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, + TACOTRON2_WAVERNN_CHAR_LJSPEECH, + TACOTRON2_WAVERNN_PHONE_LJSPEECH, + Tacotron2TTSBundle, +) +from ._wav2vec2.impl import ( + HUBERT_ASR_LARGE, + HUBERT_ASR_XLARGE, + HUBERT_BASE, + HUBERT_LARGE, + HUBERT_XLARGE, + VOXPOPULI_ASR_BASE_10K_DE, + VOXPOPULI_ASR_BASE_10K_EN, + VOXPOPULI_ASR_BASE_10K_ES, + VOXPOPULI_ASR_BASE_10K_FR, + VOXPOPULI_ASR_BASE_10K_IT, + WAV2VEC2_ASR_BASE_100H, + WAV2VEC2_ASR_BASE_10M, + WAV2VEC2_ASR_BASE_960H, + WAV2VEC2_ASR_LARGE_100H, + WAV2VEC2_ASR_LARGE_10M, + WAV2VEC2_ASR_LARGE_960H, + WAV2VEC2_ASR_LARGE_LV60K_100H, + WAV2VEC2_ASR_LARGE_LV60K_10M, + WAV2VEC2_ASR_LARGE_LV60K_960H, + WAV2VEC2_BASE, + WAV2VEC2_LARGE, + WAV2VEC2_LARGE_LV60K, + WAV2VEC2_XLSR53, + Wav2Vec2ASRBundle, + Wav2Vec2Bundle, +) +from .rnnt_pipeline import EMFORMER_RNNT_BASE_LIBRISPEECH, RNNTBundle + + +__all__ = [ + "Wav2Vec2Bundle", + "Wav2Vec2ASRBundle", + "WAV2VEC2_BASE", + "WAV2VEC2_LARGE", + "WAV2VEC2_LARGE_LV60K", + "WAV2VEC2_ASR_BASE_10M", + "WAV2VEC2_ASR_BASE_100H", + "WAV2VEC2_ASR_BASE_960H", + "WAV2VEC2_ASR_LARGE_10M", + "WAV2VEC2_ASR_LARGE_100H", + "WAV2VEC2_ASR_LARGE_960H", + "WAV2VEC2_ASR_LARGE_LV60K_10M", + "WAV2VEC2_ASR_LARGE_LV60K_100H", + "WAV2VEC2_ASR_LARGE_LV60K_960H", + "WAV2VEC2_XLSR53", + "VOXPOPULI_ASR_BASE_10K_EN", + "VOXPOPULI_ASR_BASE_10K_ES", + "VOXPOPULI_ASR_BASE_10K_DE", + "VOXPOPULI_ASR_BASE_10K_FR", + "VOXPOPULI_ASR_BASE_10K_IT", + "HUBERT_BASE", + "HUBERT_LARGE", + "HUBERT_XLARGE", + "HUBERT_ASR_LARGE", + "HUBERT_ASR_XLARGE", + "Tacotron2TTSBundle", + "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH", + "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH", + "TACOTRON2_WAVERNN_CHAR_LJSPEECH", + 
"TACOTRON2_WAVERNN_PHONE_LJSPEECH", + "RNNTBundle", + "EMFORMER_RNNT_BASE_LIBRISPEECH", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/rnnt_pipeline.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/rnnt_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..d6cb403eb65c0dbae77fcea6ef2b280b30c0565d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/pipelines/rnnt_pipeline.py @@ -0,0 +1,380 @@ +import json +import math +from abc import ABC, abstractmethod +from dataclasses import dataclass +from functools import partial +from typing import Callable, List, Tuple + +import torch +import torchaudio +from torchaudio._internal import module_utils +from torchaudio.models import emformer_rnnt_base, RNNT, RNNTBeamSearch + + +__all__ = [] + +_decibel = 2 * 20 * math.log10(torch.iinfo(torch.int16).max) +_gain = pow(10, 0.05 * _decibel) + + +def _piecewise_linear_log(x): + x[x > math.e] = torch.log(x[x > math.e]) + x[x <= math.e] = x[x <= math.e] / math.e + return x + + +class _FunctionalModule(torch.nn.Module): + def __init__(self, functional): + super().__init__() + self.functional = functional + + def forward(self, input): + return self.functional(input) + + +class _GlobalStatsNormalization(torch.nn.Module): + def __init__(self, global_stats_path): + super().__init__() + + with open(global_stats_path) as f: + blob = json.loads(f.read()) + + self.register_buffer("mean", torch.tensor(blob["mean"])) + self.register_buffer("invstddev", torch.tensor(blob["invstddev"])) + + def forward(self, input): + return (input - self.mean) * self.invstddev + + +class _FeatureExtractor(ABC): + @abstractmethod + def __call__(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Generates features and length output from the given input tensor. + + Args: + input (torch.Tensor): input tensor. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor: + Features, with shape `(length, *)`. + torch.Tensor: + Length, with shape `(1,)`. + """ + + +class _TokenProcessor(ABC): + @abstractmethod + def __call__(self, tokens: List[int], **kwargs) -> str: + """Decodes given list of tokens to text sequence. + + Args: + tokens (List[int]): list of tokens to decode. + + Returns: + str: + Decoded text sequence. + """ + + +class _ModuleFeatureExtractor(torch.nn.Module, _FeatureExtractor): + """``torch.nn.Module``-based feature extraction pipeline. + + Args: + pipeline (torch.nn.Module): module that implements feature extraction logic. + """ + + def __init__(self, pipeline: torch.nn.Module) -> None: + super().__init__() + self.pipeline = pipeline + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Generates features and length output from the given input tensor. + + Args: + input (torch.Tensor): input tensor. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor: + Features, with shape `(length, *)`. + torch.Tensor: + Length, with shape `(1,)`. + """ + features = self.pipeline(input) + length = torch.tensor([features.shape[0]]) + return features, length + + +class _SentencePieceTokenProcessor(_TokenProcessor): + """SentencePiece-model-based token processor. + + Args: + sp_model_path (str): path to SentencePiece model. + """ + + def __init__(self, sp_model_path: str) -> None: + if not module_utils.is_module_available("sentencepiece"): + raise RuntimeError("SentencePiece is not available. 
Please install it.") + + import sentencepiece as spm + + self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path) + self.post_process_remove_list = { + self.sp_model.unk_id(), + self.sp_model.eos_id(), + self.sp_model.pad_id(), + } + + def __call__(self, tokens: List[int], lstrip: bool = True) -> str: + """Decodes given list of tokens to text sequence. + + Args: + tokens (List[int]): list of tokens to decode. + lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace + removed. (Default: ``True``). + + Returns: + str: + Decoded text sequence. + """ + filtered_hypo_tokens = [ + token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list + ] + output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ") + + if lstrip: + return output_string.lstrip() + else: + return output_string + + +@dataclass +class RNNTBundle: + """torchaudio.pipelines.RNNTBundle() + + Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text) + inference with an RNN-T model. + + More specifically, the class provides methods that produce the featurization pipeline, + decoder wrapping the specified RNN-T model, and output token post-processor that together + constitute a complete end-to-end ASR inference pipeline that produces a text sequence + given a raw waveform. + + It can support non-streaming (full-context) inference as well as streaming inference. + + Users should not directly instantiate objects of this class; rather, users should use the + instances (representing pre-trained models) that exist within the module, + e.g. :py:obj:`EMFORMER_RNNT_BASE_LIBRISPEECH`. + + Example + >>> import torchaudio + >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH + >>> import torch + >>> + >>> # Non-streaming inference. + >>> # Build feature extractor, decoder with RNN-T model, and token processor. + >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor() + 100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s] + >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder() + Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt" + 100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s] + >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor() + 100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s] + >>> + >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample. + >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean") + >>> waveform = next(iter(dataset))[0].squeeze() + >>> + >>> with torch.no_grad(): + >>> # Produce mel-scale spectrogram features. + >>> features, length = feature_extractor(waveform) + >>> + >>> # Generate top-10 hypotheses. + >>> hypotheses = decoder(features, length, 10) + >>> + >>> # For top hypothesis, convert predicted tokens to text. + >>> text = token_processor(hypotheses[0][0]) + >>> print(text) + he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...] + >>> + >>> + >>> # Streaming inference. 
+ >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length + >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length + >>> num_samples_segment_right_context = ( + >>> num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length + >>> ) + >>> + >>> # Build streaming inference feature extractor. + >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor() + >>> + >>> # Process same waveform as before, this time sequentially across overlapping segments + >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``. + >>> state, hypothesis = None, None + >>> for idx in range(0, len(waveform), num_samples_segment): + >>> segment = waveform[idx: idx + num_samples_segment_right_context] + >>> segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment))) + >>> with torch.no_grad(): + >>> features, length = streaming_feature_extractor(segment) + >>> hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis) + >>> hypothesis = hypotheses[0] + >>> transcript = token_processor(hypothesis[0]) + >>> if transcript: + >>> print(transcript, end=" ", flush=True) + he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...] + """ + + class FeatureExtractor(_FeatureExtractor): + pass + + class TokenProcessor(_TokenProcessor): + pass + + _rnnt_path: str + _rnnt_factory_func: Callable[[], RNNT] + _global_stats_path: str + _sp_model_path: str + _right_padding: int + _blank: int + _sample_rate: int + _n_fft: int + _n_mels: int + _hop_length: int + _segment_length: int + _right_context_length: int + + def _get_model(self) -> RNNT: + model = self._rnnt_factory_func() + path = torchaudio.utils.download_asset(self._rnnt_path) + state_dict = torch.load(path) + model.load_state_dict(state_dict) + model.eval() + return model + + @property + def sample_rate(self) -> int: + """Sample rate (in cycles per second) of input waveforms. + + :type: int + """ + return self._sample_rate + + @property + def n_fft(self) -> int: + """Size of FFT window to use. + + :type: int + """ + return self._n_fft + + @property + def n_mels(self) -> int: + """Number of mel spectrogram features to extract from input waveforms. + + :type: int + """ + return self._n_mels + + @property + def hop_length(self) -> int: + """Number of samples between successive frames in input expected by model. + + :type: int + """ + return self._hop_length + + @property + def segment_length(self) -> int: + """Number of frames in segment in input expected by model. + + :type: int + """ + return self._segment_length + + @property + def right_context_length(self) -> int: + """Number of frames in right contextual block in input expected by model. + + :type: int + """ + return self._right_context_length + + def get_decoder(self) -> RNNTBeamSearch: + """Constructs RNN-T decoder. + + Returns: + RNNTBeamSearch + """ + model = self._get_model() + return RNNTBeamSearch(model, self._blank) + + def get_feature_extractor(self) -> FeatureExtractor: + """Constructs feature extractor for non-streaming (full-context) ASR. 
+ + Returns: + FeatureExtractor + """ + local_path = torchaudio.utils.download_asset(self._global_stats_path) + return _ModuleFeatureExtractor( + torch.nn.Sequential( + torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length + ), + _FunctionalModule(lambda x: x.transpose(1, 0)), + _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)), + _GlobalStatsNormalization(local_path), + _FunctionalModule(lambda x: torch.nn.functional.pad(x, (0, 0, 0, self._right_padding))), + ) + ) + + def get_streaming_feature_extractor(self) -> FeatureExtractor: + """Constructs feature extractor for streaming (simultaneous) ASR. + + Returns: + FeatureExtractor + """ + local_path = torchaudio.utils.download_asset(self._global_stats_path) + return _ModuleFeatureExtractor( + torch.nn.Sequential( + torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length + ), + _FunctionalModule(lambda x: x.transpose(1, 0)), + _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)), + _GlobalStatsNormalization(local_path), + ) + ) + + def get_token_processor(self) -> TokenProcessor: + """Constructs token processor. + + Returns: + TokenProcessor + """ + local_path = torchaudio.utils.download_asset(self._sp_model_path) + return _SentencePieceTokenProcessor(local_path) + + +EMFORMER_RNNT_BASE_LIBRISPEECH = RNNTBundle( + _rnnt_path="models/emformer_rnnt_base_librispeech.pt", + _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=4097), + _global_stats_path="pipeline-assets/global_stats_rnnt_librispeech.json", + _sp_model_path="pipeline-assets/spm_bpe_4096_librispeech.model", + _right_padding=4, + _blank=4096, + _sample_rate=16000, + _n_fft=400, + _n_mels=80, + _hop_length=160, + _segment_length=16, + _right_context_length=4, +) +EMFORMER_RNNT_BASE_LIBRISPEECH.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference. + + The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base` + and utilizes weights trained on LibriSpeech using training script ``train.py`` + `here `__ with default arguments. + + Please refer to :py:class:`RNNTBundle` for usage instructions. 
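# A tiny numeric sketch (not part of the patch) of the _piecewise_linear_log compression
# used by the feature extractors above: values greater than e are log-compressed, values
# at or below e are scaled linearly, and the two pieces meet at 1.0 when x == e.
import math
import torch

x = torch.tensor([0.0, math.e / 2, math.e, 10.0, 1000.0])
y = x.clone()                       # _piecewise_linear_log modifies its input in place
y[y > math.e] = torch.log(y[y > math.e])
y[y <= math.e] = y[y <= math.e] / math.e
print(y)                            # tensor([0.0000, 0.5000, 1.0000, 2.3026, 6.9078])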
+ """ diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/__init__.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..527da5c7d2859c74b4207863f2c47edda0a62a05 --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/__init__.py @@ -0,0 +1,57 @@ +from ._multi_channel import MVDR, PSD, RTFMVDR, SoudenMVDR +from ._transforms import ( + AmplitudeToDB, + ComputeDeltas, + Fade, + FrequencyMasking, + GriffinLim, + InverseMelScale, + InverseSpectrogram, + LFCC, + MelScale, + MelSpectrogram, + MFCC, + MuLawDecoding, + MuLawEncoding, + PitchShift, + Resample, + RNNTLoss, + SlidingWindowCmn, + SpectralCentroid, + Spectrogram, + TimeMasking, + TimeStretch, + Vad, + Vol, +) + + +__all__ = [ + "AmplitudeToDB", + "ComputeDeltas", + "Fade", + "FrequencyMasking", + "GriffinLim", + "InverseMelScale", + "InverseSpectrogram", + "LFCC", + "MFCC", + "MVDR", + "MelScale", + "MelSpectrogram", + "MuLawDecoding", + "MuLawEncoding", + "PSD", + "PitchShift", + "RNNTLoss", + "RTFMVDR", + "Resample", + "SlidingWindowCmn", + "SoudenMVDR", + "SpectralCentroid", + "Spectrogram", + "TimeMasking", + "TimeStretch", + "Vad", + "Vol", +] diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_multi_channel.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_multi_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..1a97dbc27345c3345612f2b18efbb7b1e049bc4e --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_multi_channel.py @@ -0,0 +1,464 @@ +# -*- coding: utf-8 -*- + +import warnings +from typing import Optional, Union + +import torch +from torch import Tensor +from torchaudio import functional as F + + +__all__ = [] + + +def _get_mvdr_vector( + psd_s: torch.Tensor, + psd_n: torch.Tensor, + reference_vector: torch.Tensor, + solution: str = "ref_channel", + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> torch.Tensor: + r"""Compute the MVDR beamforming weights with ``solution`` argument. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_vector (torch.Tensor): one-hot reference channel matrix. + solution (str, optional): Solution to compute the MVDR beamforming weights. + Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``) + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. 
+ (Default: ``1e-8``) + + Returns: + torch.Tensor: the mvdr beamforming weight matrix + """ + if solution == "ref_channel": + beamform_vector = F.mvdr_weights_souden(psd_s, psd_n, reference_vector, diagonal_loading, diag_eps, eps) + else: + if solution == "stv_evd": + stv = F.rtf_evd(psd_s) + else: + stv = F.rtf_power(psd_s, psd_n, reference_vector, diagonal_loading=diagonal_loading, diag_eps=diag_eps) + beamform_vector = F.mvdr_weights_rtf(stv, psd_n, reference_vector, diagonal_loading, diag_eps, eps) + + return beamform_vector + + +class PSD(torch.nn.Module): + r"""Compute cross-channel power spectral density (PSD) matrix. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``) + normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``) + eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``) + """ + + def __init__(self, multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15): + super().__init__() + self.multi_mask = multi_mask + self.normalize = normalize + self.eps = eps + + def forward(self, specgram: torch.Tensor, mask: Optional[torch.Tensor] = None): + """ + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + mask (torch.Tensor or None, optional): Time-Frequency mask for normalization. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` or + with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + (Default: ``None``) + + Returns: + torch.Tensor: The complex-valued PSD matrix of the input spectrum. + Tensor with dimensions `(..., freq, channel, channel)` + """ + if mask is not None: + if self.multi_mask: + # Averaging mask along channel dimension + mask = mask.mean(dim=-3) # (..., freq, time) + psd = F.psd(specgram, mask, self.normalize, self.eps) + + return psd + + +class MVDR(torch.nn.Module): + """Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py + + We provide three solutions of MVDR beamforming. One is based on *reference channel selection* + [:footcite:`souden2009optimal`] (``solution=ref_channel``). + + .. math:: + \\textbf{w}_{\\text{MVDR}}(f) =\ + \\frac{{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bf{\\Phi}_{\\textbf{SS}}}}(f)}\ + {\\text{Trace}({{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f) \\bf{\\Phi}_{\\textbf{SS}}}(f))}}\\bm{u} + + where :math:`\\bf{\\Phi}_{\\textbf{SS}}` and :math:`\\bf{\\Phi}_{\\textbf{NN}}` are the covariance\ + matrices of speech and noise, respectively. :math:`\\bf{u}` is an one-hot vector to determine the\ + reference channel. + + The other two solutions are based on the steering vector (``solution=stv_evd`` or ``solution=stv_power``). + + .. math:: + \\textbf{w}_{\\text{MVDR}}(f) =\ + \\frac{{{\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bm{v}}(f)}}\ + {{\\bm{v}^{\\mathsf{H}}}(f){\\bf{\\Phi}_{\\textbf{NN}}^{-1}}(f){\\bm{v}}(f)} + + where :math:`\\bm{v}` is the acoustic transfer function or the steering vector.\ + :math:`.^{\\mathsf{H}}` denotes the Hermitian Conjugate operation. 
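# A usage sketch (not part of the patch) for the PSD module above: a multi-channel complex
# spectrogram and a Time-Frequency mask produce a (..., freq, channel, channel) covariance
# matrix. Random tensors stand in for an STFT and a mask estimate.
import torch
from torchaudio.transforms import PSD

psd_transform = PSD()
specgram = torch.complex(torch.rand(6, 201, 100), torch.rand(6, 201, 100))  # (channel, freq, time)
mask = torch.rand(201, 100)                                                 # (freq, time)
psd = psd_transform(specgram, mask)
print(psd.shape)  # torch.Size([201, 6, 6])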
+ + We apply either *eigenvalue decomposition* + [:footcite:`higuchi2016robust`] or the *power method* [:footcite:`mises1929praktische`] to get the + steering vector from the PSD matrix of speech. + + After estimating the beamforming weight, the enhanced Short-time Fourier Transform (STFT) is obtained by + + .. math:: + \\hat{\\bf{S}} = {\\bf{w}^\\mathsf{H}}{\\bf{Y}}, {\\bf{w}} \\in \\mathbb{C}^{M \\times F} + + where :math:`\\bf{Y}` and :math:`\\hat{\\bf{S}}` are the STFT of the multi-channel noisy speech and\ + the single-channel enhanced speech, respectively. + + For online streaming audio, we provide a *recursive method* [:footcite:`higuchi2017online`] to update the + PSD matrices of speech and noise, respectively. + + Args: + ref_channel (int, optional): Reference channel for beamforming. (Default: ``0``) + solution (str, optional): Solution to compute the MVDR beamforming weights. + Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``) + multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``) + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to the covariance matrix + of the noise. (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + online (bool, optional): If ``True``, updates the MVDR beamforming weights based on + the previous covarience matrices. (Default: ``False``) + + Note: + To improve the numerical stability, the input spectrogram will be converted to double precision + (``torch.complex128`` or ``torch.cdouble``) dtype for internal computation. The output spectrogram + is converted to the dtype of the input spectrogram to be compatible with other modules. + + Note: + If you use ``stv_evd`` solution, the gradient of the same input may not be identical if the + eigenvalues of the PSD matrix are not distinct (i.e. some eigenvalues are close or identical). + """ + + def __init__( + self, + ref_channel: int = 0, + solution: str = "ref_channel", + multi_mask: bool = False, + diag_loading: bool = True, + diag_eps: float = 1e-7, + online: bool = False, + ): + super().__init__() + assert solution in [ + "ref_channel", + "stv_evd", + "stv_power", + ], "Unknown solution provided. Must be one of [``ref_channel``, ``stv_evd``, ``stv_power``]." + self.ref_channel = ref_channel + self.solution = solution + self.multi_mask = multi_mask + self.diag_loading = diag_loading + self.diag_eps = diag_eps + self.online = online + self.psd = PSD(multi_mask) + + psd_s: torch.Tensor = torch.zeros(1) + psd_n: torch.Tensor = torch.zeros(1) + mask_sum_s: torch.Tensor = torch.zeros(1) + mask_sum_n: torch.Tensor = torch.zeros(1) + self.register_buffer("psd_s", psd_s) + self.register_buffer("psd_n", psd_n) + self.register_buffer("mask_sum_s", mask_sum_s) + self.register_buffer("mask_sum_n", mask_sum_n) + + def _get_updated_mvdr_vector( + self, + psd_s: torch.Tensor, + psd_n: torch.Tensor, + mask_s: torch.Tensor, + mask_n: torch.Tensor, + reference_vector: torch.Tensor, + solution: str = "ref_channel", + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, + ) -> torch.Tensor: + r"""Recursively update the MVDR beamforming vector. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. 
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + mask_s (torch.Tensor): Time-Frequency mask of the target speech. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + reference_vector (torch.Tensor): One-hot reference channel matrix. + solution (str, optional): Solution to compute the MVDR beamforming weights. + Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``) + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The MVDR beamforming weight matrix. + """ + if self.multi_mask: + # Averaging mask along channel dimension + mask_s = mask_s.mean(dim=-3) # (..., freq, time) + mask_n = mask_n.mean(dim=-3) # (..., freq, time) + if self.psd_s.ndim == 1: + self.psd_s = psd_s + self.psd_n = psd_n + self.mask_sum_s = mask_s.sum(dim=-1) + self.mask_sum_n = mask_n.sum(dim=-1) + return _get_mvdr_vector(psd_s, psd_n, reference_vector, solution, diagonal_loading, diag_eps, eps) + else: + psd_s = self._get_updated_psd_speech(psd_s, mask_s) + psd_n = self._get_updated_psd_noise(psd_n, mask_n) + self.psd_s = psd_s + self.psd_n = psd_n + self.mask_sum_s = self.mask_sum_s + mask_s.sum(dim=-1) + self.mask_sum_n = self.mask_sum_n + mask_n.sum(dim=-1) + return _get_mvdr_vector(psd_s, psd_n, reference_vector, solution, diagonal_loading, diag_eps, eps) + + def _get_updated_psd_speech(self, psd_s: torch.Tensor, mask_s: torch.Tensor) -> torch.Tensor: + r"""Update psd of speech recursively. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + mask_s (torch.Tensor): Time-Frequency mask of the target speech. + Tensor with dimensions `(..., freq, time)`. + + Returns: + torch.Tensor: The updated PSD matrix of target speech. + """ + numerator = self.mask_sum_s / (self.mask_sum_s + mask_s.sum(dim=-1)) + denominator = 1 / (self.mask_sum_s + mask_s.sum(dim=-1)) + psd_s = self.psd_s * numerator[..., None, None] + psd_s * denominator[..., None, None] + return psd_s + + def _get_updated_psd_noise(self, psd_n: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor: + r"""Update psd of noise recursively. + + Args: + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise. + Tensor with dimensions `(..., freq, time)`. + + Returns: + torch.Tensor: The updated PSD matrix of noise. 
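# A usage sketch (not part of the patch) for the MVDR module described above, with random
# tensors standing in for an STFT and speech/noise Time-Frequency masks. Shapes follow the
# module's documentation: specgram is (..., channel, freq, time), each mask is
# (..., freq, time), and the result is the single-channel enhanced spectrum.
import torch
from torchaudio.transforms import MVDR

mvdr = MVDR(ref_channel=0, solution="ref_channel")
specgram = torch.complex(torch.rand(6, 201, 100), torch.rand(6, 201, 100))  # (channel, freq, time)
mask_s = torch.rand(201, 100)   # mask for target speech
mask_n = torch.rand(201, 100)   # mask for noise
enhanced = mvdr(specgram, mask_s, mask_n)
print(enhanced.shape)  # torch.Size([201, 100])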
+ """ + numerator = self.mask_sum_n / (self.mask_sum_n + mask_n.sum(dim=-1)) + denominator = 1 / (self.mask_sum_n + mask_n.sum(dim=-1)) + psd_n = self.psd_n * numerator[..., None, None] + psd_n * denominator[..., None, None] + return psd_n + + def forward( + self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Perform MVDR beamforming. + + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)` + mask_s (torch.Tensor): Time-Frequency mask of target speech. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + mask_n (torch.Tensor or None, optional): Time-Frequency mask of noise. + Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` + or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``. + (Default: None) + + Returns: + torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`. + """ + dtype = specgram.dtype + if specgram.ndim < 3: + raise ValueError(f"Expected at least 3D tensor (..., channel, freq, time). Found: {specgram.shape}") + if not specgram.is_complex(): + raise ValueError( + f"The type of ``specgram`` tensor must be ``torch.cfloat`` or ``torch.cdouble``.\ + Found: {specgram.dtype}" + ) + if specgram.dtype == torch.cfloat: + specgram = specgram.cdouble() # Convert specgram to ``torch.cdouble``. + + if mask_n is None: + warnings.warn("``mask_n`` is not provided, use ``1 - mask_s`` as ``mask_n``.") + mask_n = 1 - mask_s + + psd_s = self.psd(specgram, mask_s) # (..., freq, time, channel, channel) + psd_n = self.psd(specgram, mask_n) # (..., freq, time, channel, channel) + + u = torch.zeros(specgram.size()[:-2], device=specgram.device, dtype=torch.cdouble) # (..., channel) + u[..., self.ref_channel].fill_(1) + + if self.online: + w_mvdr = self._get_updated_mvdr_vector( + psd_s, psd_n, mask_s, mask_n, u, self.solution, self.diag_loading, self.diag_eps + ) + else: + w_mvdr = _get_mvdr_vector(psd_s, psd_n, u, self.solution, self.diag_loading, self.diag_eps) + + specgram_enhanced = F.apply_beamforming(w_mvdr, specgram) + + return specgram_enhanced.to(dtype) + + +class RTFMVDR(torch.nn.Module): + r"""Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) module + based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the relative transfer function (RTF) matrix + or the steering vector of target speech :math:`\bm{v}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and + a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel + complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as: + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + + where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin, + :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation. + + The beamforming weight is computed by: + + .. 
math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}} + {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)} + """ + + def forward( + self, + specgram: Tensor, + rtf: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, + ) -> Tensor: + """ + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)` + rtf (torch.Tensor): The complex-valued RTF vector of target speech. + Tensor with dimensions `(..., freq, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`. + """ + w_mvdr = F.mvdr_weights_rtf(rtf, psd_n, reference_channel, diagonal_loading, diag_eps, eps) + spectrum_enhanced = F.apply_beamforming(w_mvdr, specgram) + return spectrum_enhanced + + +class SoudenMVDR(torch.nn.Module): + r"""Minimum Variance Distortionless Response (*MVDR* [:footcite:`capon1969high`]) module + based on the method proposed by *Souden et, al.* [:footcite:`souden2009optimal`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the power spectral density (PSD) matrix + of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and + a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel + complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as: + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + + where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin. + + The beamforming weight is computed by: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)} + {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u} + """ + + def forward( + self, + specgram: Tensor, + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, + ) -> torch.Tensor: + """ + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. 
+ Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`. + """ + w_mvdr = F.mvdr_weights_souden(psd_s, psd_n, reference_channel, diagonal_loading, diag_eps, eps) + spectrum_enhanced = F.apply_beamforming(w_mvdr, specgram) + return spectrum_enhanced diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_transforms.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..489901f9eabfb774bac5adf539a7345785dc264a --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/transforms/_transforms.py @@ -0,0 +1,1693 @@ +# -*- coding: utf-8 -*- + +import math +import warnings +from typing import Callable, Optional + +import torch +from torch import Tensor +from torch.nn.modules.lazy import LazyModuleMixin +from torch.nn.parameter import UninitializedParameter + +from torchaudio import functional as F +from torchaudio.functional.functional import ( + _apply_sinc_resample_kernel, + _get_sinc_resample_kernel, + _stretch_waveform, + _fix_waveform_shape, +) + +__all__ = [] + + +class Spectrogram(torch.nn.Module): + r"""Create a spectrogram from a audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float or None, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. + If None, then the complex spectrum is returned instead. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided (bool, optional): controls whether to return half of results to + avoid redundancy (Default: ``True``) + return_complex (bool, optional): + Deprecated and not used. 
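# --- Editor's note: minimal usage sketch, not part of the diff above. ---
# It illustrates the Souden-MVDR path defined earlier (PSD -> SoudenMVDR).
# The random spectrum and the complementary masks below are placeholders,
# not a recommended way to obtain Time-Frequency masks.
import torch
import torchaudio.transforms as T

batch, channel, freq, time = 1, 4, 257, 100
specgram = torch.randn(batch, channel, freq, time, dtype=torch.cdouble)
mask_s = torch.rand(batch, freq, time)   # mask of target speech (placeholder)
mask_n = 1.0 - mask_s                    # mask of noise

psd = T.PSD()
psd_s = psd(specgram, mask_s)            # (..., freq, channel, channel)
psd_n = psd(specgram, mask_n)

mvdr = T.SoudenMVDR()
enhanced = mvdr(specgram, psd_s, psd_n, reference_channel=0)  # (..., freq, time)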
+ + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = torchaudio.transforms.Spectrogram(n_fft=800) + >>> spectrogram = transform(waveform) + + """ + __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"] + + def __init__( + self, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: Optional[float] = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, + ) -> None: + super(Spectrogram, self).__init__() + self.n_fft = n_fft + # number of FFT bins. the returned STFT result will have n_fft // 2 + 1 + # number of frequencies due to onesided=True in torch.stft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + self.power = power + self.normalized = normalized + self.center = center + self.pad_mode = pad_mode + self.onesided = onesided + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Dimension (..., freq, time), where freq is + ``n_fft // 2 + 1`` where ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + return F.spectrogram( + waveform, + self.pad, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.power, + self.normalized, + self.center, + self.pad_mode, + self.onesided, + ) + + +class InverseSpectrogram(torch.nn.Module): + r"""Create an inverse spectrogram to recover an audio signal from a spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + normalized (bool, optional): Whether the spectrogram was normalized by magnitude after stft. + (Default: ``False``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether the signal in spectrogram was padded on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. 
(Default: ``"reflect"``) + onesided (bool, optional): controls whether spectrogram was used to return half of results to + avoid redundancy (Default: ``True``) + + Example + >>> batch, freq, time = 2, 257, 100 + >>> length = 25344 + >>> spectrogram = torch.randn(batch, freq, time, dtype=torch.cdouble) + >>> transform = transforms.InverseSpectrogram(n_fft=512) + >>> waveform = transform(spectrogram, length) + """ + __constants__ = ["n_fft", "win_length", "hop_length", "pad", "power", "normalized"] + + def __init__( + self, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + ) -> None: + super(InverseSpectrogram, self).__init__() + self.n_fft = n_fft + # number of FFT bins. the returned STFT result will have n_fft // 2 + 1 + # number of frequencies due to onesided=True in torch.stft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + self.normalized = normalized + self.center = center + self.pad_mode = pad_mode + self.onesided = onesided + + def forward(self, spectrogram: Tensor, length: Optional[int] = None) -> Tensor: + r""" + Args: + spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time). + length (int or None, optional): The output length of the waveform. + + Returns: + Tensor: Dimension (..., time), Least squares estimation of the original signal. + """ + return F.inverse_spectrogram( + spectrogram, + length, + self.pad, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.normalized, + self.center, + self.pad_mode, + self.onesided, + ) + + +class GriffinLim(torch.nn.Module): + r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Implementation ported from + *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`] + and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`]. + + Args: + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + n_iter (int, optional): Number of iteration for phase recovery process. (Default: ``32``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + momentum (float, optional): The momentum parameter for fast Griffin-Lim. + Setting this to 0 recovers the original Griffin-Lim method. + Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``) + length (int, optional): Array length of the expected output. 
(Default: ``None``) + rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``) + + Example + >>> batch, freq, time = 2, 257, 100 + >>> spectrogram = torch.randn(batch, freq, time) + >>> transform = transforms.GriffinLim(n_fft=512) + >>> waveform = transform(spectrogram) + """ + __constants__ = ["n_fft", "n_iter", "win_length", "hop_length", "power", "length", "momentum", "rand_init"] + + def __init__( + self, + n_fft: int = 400, + n_iter: int = 32, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: float = 2.0, + wkwargs: Optional[dict] = None, + momentum: float = 0.99, + length: Optional[int] = None, + rand_init: bool = True, + ) -> None: + super(GriffinLim, self).__init__() + + assert momentum < 1, "momentum={} > 1 can be unstable".format(momentum) + assert momentum >= 0, "momentum={} < 0".format(momentum) + + self.n_fft = n_fft + self.n_iter = n_iter + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.length = length + self.power = power + self.momentum = momentum / (1 + momentum) + self.rand_init = rand_init + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): + A magnitude-only STFT spectrogram of dimension (..., freq, frames) + where freq is ``n_fft // 2 + 1``. + + Returns: + Tensor: waveform of (..., time), where time equals the ``length`` parameter if given. + """ + return F.griffinlim( + specgram, + self.window, + self.n_fft, + self.hop_length, + self.win_length, + self.power, + self.n_iter, + self.momentum, + self.length, + self.rand_init, + ) + + +class AmplitudeToDB(torch.nn.Module): + r"""Turn a tensor from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This output depends on the maximum value in the input tensor, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + stype (str, optional): scale of input tensor (``'power'`` or ``'magnitude'``). The + power being the elementwise square of the magnitude. (Default: ``'power'``) + top_db (float or None, optional): minimum negative cut-off in decibels. A reasonable + number is 80. (Default: ``None``) + """ + __constants__ = ["multiplier", "amin", "ref_value", "db_multiplier"] + + def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None: + super(AmplitudeToDB, self).__init__() + self.stype = stype + if top_db is not None and top_db < 0: + raise ValueError("top_db must be positive value") + self.top_db = top_db + self.multiplier = 10.0 if stype == "power" else 20.0 + self.amin = 1e-10 + self.ref_value = 1.0 + self.db_multiplier = math.log10(max(self.amin, self.ref_value)) + + def forward(self, x: Tensor) -> Tensor: + r"""Numerically stable implementation from Librosa. + + https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html + + Args: + x (Tensor): Input tensor before being converted to decibel scale. + + Returns: + Tensor: Output tensor in decibel scale. 
+ """ + return F.amplitude_to_DB(x, self.multiplier, self.amin, self.db_multiplier, self.top_db) + + +class MelScale(torch.nn.Module): + r"""Turn a normal STFT into a mel frequency STFT with triangular filter banks. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) + n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``) + norm (str or None, optional): If ``'slaney'``, divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"] + + def __init__( + self, + n_mels: int = 128, + sample_rate: int = 16000, + f_min: float = 0.0, + f_max: Optional[float] = None, + n_stft: int = 201, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelScale, self).__init__() + self.n_mels = n_mels + self.sample_rate = sample_rate + self.f_max = f_max if f_max is not None else float(sample_rate // 2) + self.f_min = f_min + self.norm = norm + self.mel_scale = mel_scale + + assert f_min <= self.f_max, "Require f_min: {} < f_max: {}".format(f_min, self.f_max) + fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm, self.mel_scale) + self.register_buffer("fb", fb) + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + + # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time) + mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2) + + return mel_specgram + + +class InverseMelScale(torch.nn.Module): + r"""Estimate a STFT in normal frequency domain from mel frequency domain. + + .. devices:: CPU CUDA + + It minimizes the euclidian norm between the input mel-spectrogram and the product between + the estimated spectrogram and the filter banks using SGD. + + Args: + n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) + max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``) + tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``) + tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``) + sgdargs (dict or None, optional): Arguments for the SGD optimizer. (Default: ``None``) + norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. 
(Default: ``htk``) + """ + __constants__ = [ + "n_stft", + "n_mels", + "sample_rate", + "f_min", + "f_max", + "max_iter", + "tolerance_loss", + "tolerance_change", + "sgdargs", + ] + + def __init__( + self, + n_stft: int, + n_mels: int = 128, + sample_rate: int = 16000, + f_min: float = 0.0, + f_max: Optional[float] = None, + max_iter: int = 100000, + tolerance_loss: float = 1e-5, + tolerance_change: float = 1e-8, + sgdargs: Optional[dict] = None, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(InverseMelScale, self).__init__() + self.n_mels = n_mels + self.sample_rate = sample_rate + self.f_max = f_max or float(sample_rate // 2) + self.f_min = f_min + self.max_iter = max_iter + self.tolerance_loss = tolerance_loss + self.tolerance_change = tolerance_change + self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9} + + assert f_min <= self.f_max, "Require f_min: {} < f_max: {}".format(f_min, self.f_max) + + fb = F.melscale_fbanks(n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, norm, mel_scale) + self.register_buffer("fb", fb) + + def forward(self, melspec: Tensor) -> Tensor: + r""" + Args: + melspec (Tensor): A Mel frequency spectrogram of dimension (..., ``n_mels``, time) + + Returns: + Tensor: Linear scale spectrogram of size (..., freq, time) + """ + # pack batch + shape = melspec.size() + melspec = melspec.view(-1, shape[-2], shape[-1]) + + n_mels, time = shape[-2], shape[-1] + freq, _ = self.fb.size() # (freq, n_mels) + melspec = melspec.transpose(-1, -2) + assert self.n_mels == n_mels + + specgram = torch.rand( + melspec.size()[0], time, freq, requires_grad=True, dtype=melspec.dtype, device=melspec.device + ) + + optim = torch.optim.SGD([specgram], **self.sgdargs) + + loss = float("inf") + for _ in range(self.max_iter): + optim.zero_grad() + diff = melspec - specgram.matmul(self.fb) + new_loss = diff.pow(2).sum(axis=-1).mean() + # take sum over mel-frequency then average over other dimensions + # so that loss threshold is applied par unit timeframe + new_loss.backward() + optim.step() + specgram.data = specgram.data.clamp(min=0) + + new_loss = new_loss.item() + if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change: + break + loss = new_loss + + specgram.requires_grad_(False) + specgram = specgram.clamp(min=0).transpose(-1, -2) + + # unpack batch + specgram = specgram.view(shape[:-2] + (freq, time)) + return specgram + + +class MelSpectrogram(torch.nn.Module): + r"""Create MelSpectrogram for a raw audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This is a composition of :py:func:`torchaudio.transforms.Spectrogram` and + and :py:func:`torchaudio.transforms.MelScale`. + + Sources + * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe + * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html + * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. (Default: ``None``) + pad (int, optional): Two sided padding of signal. 
(Default: ``0``) + n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. (Default: ``True``) + norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.MelSpectrogram(sample_rate) + >>> mel_specgram = transform(waveform) # (channel, n_mels, time) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_mels", "f_min"] + + def __init__( + self, + sample_rate: int = 16000, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + f_min: float = 0.0, + f_max: Optional[float] = None, + pad: int = 0, + n_mels: int = 128, + window_fn: Callable[..., Tensor] = torch.hann_window, + power: float = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + norm: Optional[str] = None, + mel_scale: str = "htk", + ) -> None: + super(MelSpectrogram, self).__init__() + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + self.pad = pad + self.power = power + self.normalized = normalized + self.n_mels = n_mels # number of mel frequency bins + self.f_max = f_max + self.f_min = f_min + self.spectrogram = Spectrogram( + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length, + pad=self.pad, + window_fn=window_fn, + power=self.power, + normalized=self.normalized, + wkwargs=wkwargs, + center=center, + pad_mode=pad_mode, + onesided=onesided, + ) + self.mel_scale = MelScale( + self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1, norm, mel_scale + ) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). + """ + specgram = self.spectrogram(waveform) + mel_specgram = self.mel_scale(specgram) + return mel_specgram + + +class MFCC(torch.nn.Module): + r"""Create the Mel-frequency cepstrum coefficients from an audio signal. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + By default, this calculates the MFCC on the DB-scaled Mel spectrogram. + This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``) + dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``) + norm (str, optional): norm to use. (Default: ``'ortho'``) + log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``) + melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``) + + See also: + :py:func:`torchaudio.functional.melscale_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["sample_rate", "n_mfcc", "dct_type", "top_db", "log_mels"] + + def __init__( + self, + sample_rate: int = 16000, + n_mfcc: int = 40, + dct_type: int = 2, + norm: str = "ortho", + log_mels: bool = False, + melkwargs: Optional[dict] = None, + ) -> None: + super(MFCC, self).__init__() + supported_dct_types = [2] + if dct_type not in supported_dct_types: + raise ValueError("DCT type not supported: {}".format(dct_type)) + self.sample_rate = sample_rate + self.n_mfcc = n_mfcc + self.dct_type = dct_type + self.norm = norm + self.top_db = 80.0 + self.amplitude_to_DB = AmplitudeToDB("power", self.top_db) + + melkwargs = melkwargs or {} + self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate, **melkwargs) + + if self.n_mfcc > self.MelSpectrogram.n_mels: + raise ValueError("Cannot select more MFCC coefficients than # mel bins") + dct_mat = F.create_dct(self.n_mfcc, self.MelSpectrogram.n_mels, self.norm) + self.register_buffer("dct_mat", dct_mat) + self.log_mels = log_mels + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: specgram_mel_db of size (..., ``n_mfcc``, time). + """ + mel_specgram = self.MelSpectrogram(waveform) + if self.log_mels: + log_offset = 1e-6 + mel_specgram = torch.log(mel_specgram + log_offset) + else: + mel_specgram = self.amplitude_to_DB(mel_specgram) + + # (..., time, n_mels) dot (n_mels, n_mfcc) -> (..., n_nfcc, time) + mfcc = torch.matmul(mel_specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2) + return mfcc + + +class LFCC(torch.nn.Module): + r"""Create the linear-frequency cepstrum coefficients from an audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram. + This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) + n_filter (int, optional): Number of linear filters to apply. (Default: ``128``) + n_lfcc (int, optional): Number of lfc coefficients to retain. (Default: ``40``) + f_min (float, optional): Minimum frequency. (Default: ``0.``) + f_max (float or None, optional): Maximum frequency. 
(Default: ``None``) + dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``) + norm (str, optional): norm to use. (Default: ``'ortho'``) + log_lf (bool, optional): whether to use log-lf spectrograms instead of db-scaled. (Default: ``False``) + speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``) + + + See also: + :py:func:`torchaudio.functional.linear_fbanks` - The function used to + generate the filter banks. + """ + __constants__ = ["sample_rate", "n_filter", "n_lfcc", "dct_type", "top_db", "log_lf"] + + def __init__( + self, + sample_rate: int = 16000, + n_filter: int = 128, + f_min: float = 0.0, + f_max: Optional[float] = None, + n_lfcc: int = 40, + dct_type: int = 2, + norm: str = "ortho", + log_lf: bool = False, + speckwargs: Optional[dict] = None, + ) -> None: + super(LFCC, self).__init__() + supported_dct_types = [2] + if dct_type not in supported_dct_types: + raise ValueError("DCT type not supported: {}".format(dct_type)) + self.sample_rate = sample_rate + self.f_min = f_min + self.f_max = f_max if f_max is not None else float(sample_rate // 2) + self.n_filter = n_filter + self.n_lfcc = n_lfcc + self.dct_type = dct_type + self.norm = norm + self.top_db = 80.0 + self.amplitude_to_DB = AmplitudeToDB("power", self.top_db) + + speckwargs = speckwargs or {} + self.Spectrogram = Spectrogram(**speckwargs) + + if self.n_lfcc > self.Spectrogram.n_fft: + raise ValueError("Cannot select more LFCC coefficients than # fft bins") + + filter_mat = F.linear_fbanks( + n_freqs=self.Spectrogram.n_fft // 2 + 1, + f_min=self.f_min, + f_max=self.f_max, + n_filter=self.n_filter, + sample_rate=self.sample_rate, + ) + self.register_buffer("filter_mat", filter_mat) + + dct_mat = F.create_dct(self.n_lfcc, self.n_filter, self.norm) + self.register_buffer("dct_mat", dct_mat) + self.log_lf = log_lf + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Linear Frequency Cepstral Coefficients of size (..., ``n_lfcc``, time). + """ + specgram = self.Spectrogram(waveform) + + # (..., time, freq) dot (freq, n_filter) -> (..., n_filter, time) + specgram = torch.matmul(specgram.transpose(-1, -2), self.filter_mat).transpose(-1, -2) + + if self.log_lf: + log_offset = 1e-6 + specgram = torch.log(specgram + log_offset) + else: + specgram = self.amplitude_to_DB(specgram) + + # (..., time, n_filter) dot (n_filter, n_lfcc) -> (..., n_lfcc, time) + lfcc = torch.matmul(specgram.transpose(-1, -2), self.dct_mat).transpose(-1, -2) + return lfcc + + +class MuLawEncoding(torch.nn.Module): + r"""Encode signal based on mu-law companding. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This algorithm assumes the signal has been scaled to between -1 and 1 and + returns a signal encoded with values from 0 to quantization_channels - 1 + + Args: + quantization_channels (int, optional): Number of channels. 
(Default: ``256``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512) + >>> mulawtrans = transform(waveform) + + """ + __constants__ = ["quantization_channels"] + + def __init__(self, quantization_channels: int = 256) -> None: + super(MuLawEncoding, self).__init__() + self.quantization_channels = quantization_channels + + def forward(self, x: Tensor) -> Tensor: + r""" + Args: + x (Tensor): A signal to be encoded. + + Returns: + Tensor: An encoded signal. + """ + return F.mu_law_encoding(x, self.quantization_channels) + + +class MuLawDecoding(torch.nn.Module): + r"""Decode mu-law encoded signal. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This expects an input with values between 0 and ``quantization_channels - 1`` + and returns a signal scaled between -1 and 1. + + Args: + quantization_channels (int, optional): Number of channels. (Default: ``256``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512) + >>> mulawtrans = transform(waveform) + """ + __constants__ = ["quantization_channels"] + + def __init__(self, quantization_channels: int = 256) -> None: + super(MuLawDecoding, self).__init__() + self.quantization_channels = quantization_channels + + def forward(self, x_mu: Tensor) -> Tensor: + r""" + Args: + x_mu (Tensor): A mu-law encoded signal which needs to be decoded. + + Returns: + Tensor: The signal decoded. + """ + return F.mu_law_decoding(x_mu, self.quantization_channels) + + +class Resample(torch.nn.Module): + r"""Resample a signal from one frequency to another. A resampling method can be given. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + If resampling on waveforms of higher precision than float32, there may be a small loss of precision + because the kernel is cached once as float32. If high precision resampling is important for your application, + the functional form will retain higher precision, but run slower because it does not cache the kernel. + Alternatively, you could rewrite a transform that caches a higher precision kernel. + + Args: + orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``) + new_freq (int, optional): The desired frequency. (Default: ``16000``) + resampling_method (str, optional): The resampling method to use. + Options: [``sinc_interpolation``, ``kaiser_window``] (Default: ``'sinc_interpolation'``) + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + beta (float or None, optional): The shape parameter used for kaiser window. + dtype (torch.device, optional): + Determnines the precision that resampling kernel is pre-computed and cached. If not provided, + kernel is computed with ``torch.float64`` then cached as ``torch.float32``. + If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and + cached as ``torch.float64``. 
If you use resample with lower precision, then instead of providing this + providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still + carried out on ``torch.float64``. + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.Resample(sample_rate, sample_rate/10) + >>> waveform = transform(waveform) + """ + + def __init__( + self, + orig_freq: int = 16000, + new_freq: int = 16000, + resampling_method: str = "sinc_interpolation", + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + beta: Optional[float] = None, + *, + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__() + + self.orig_freq = orig_freq + self.new_freq = new_freq + self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq)) + self.resampling_method = resampling_method + self.lowpass_filter_width = lowpass_filter_width + self.rolloff = rolloff + self.beta = beta + + if self.orig_freq != self.new_freq: + kernel, self.width = _get_sinc_resample_kernel( + self.orig_freq, + self.new_freq, + self.gcd, + self.lowpass_filter_width, + self.rolloff, + self.resampling_method, + beta, + dtype=dtype, + ) + self.register_buffer("kernel", kernel) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Output signal of dimension (..., time). + """ + if self.orig_freq == self.new_freq: + return waveform + return _apply_sinc_resample_kernel(waveform, self.orig_freq, self.new_freq, self.gcd, self.kernel, self.width) + + +class ComputeDeltas(torch.nn.Module): + r"""Compute delta coefficients of a tensor, usually a spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + See `torchaudio.functional.compute_deltas` for more details. + + Args: + win_length (int, optional): The window length used for computing delta. (Default: ``5``) + mode (str, optional): Mode parameter passed to padding. (Default: ``'replicate'``) + """ + __constants__ = ["win_length"] + + def __init__(self, win_length: int = 5, mode: str = "replicate") -> None: + super(ComputeDeltas, self).__init__() + self.win_length = win_length + self.mode = mode + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): Tensor of audio of dimension (..., freq, time). + + Returns: + Tensor: Tensor of deltas of dimension (..., freq, time). + """ + return F.compute_deltas(specgram, win_length=self.win_length, mode=self.mode) + + +class TimeStretch(torch.nn.Module): + r"""Stretch stft in time without modifying pitch for a given rate. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Proposed in *SpecAugment* [:footcite:`specaugment`]. + + Args: + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + n_freq (int, optional): number of filter banks from stft. (Default: ``201``) + fixed_rate (float or None, optional): rate to speed up or slow down by. + If None is provided, rate must be passed to the forward method. (Default: ``None``) + + Example + >>> spectrogram = torchaudio.transforms.Spectrogram() + >>> stretch = torchaudio.transforms.TimeStretch() + >>> + >>> original = spectrogram(waveform) + >>> streched_1_2 = stretch(original, 1.2) + >>> streched_0_9 = stretch(original, 0.9) + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_1.png + :width: 600 + :alt: Spectrogram streched by 1.2 + + .. 
image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_2.png + :width: 600 + :alt: The original spectrogram + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_3.png + :width: 600 + :alt: Spectrogram streched by 0.9 + + """ + __constants__ = ["fixed_rate"] + + def __init__(self, hop_length: Optional[int] = None, n_freq: int = 201, fixed_rate: Optional[float] = None) -> None: + super(TimeStretch, self).__init__() + + self.fixed_rate = fixed_rate + + n_fft = (n_freq - 1) * 2 + hop_length = hop_length if hop_length is not None else n_fft // 2 + self.register_buffer("phase_advance", torch.linspace(0, math.pi * hop_length, n_freq)[..., None]) + + def forward(self, complex_specgrams: Tensor, overriding_rate: Optional[float] = None) -> Tensor: + r""" + Args: + complex_specgrams (Tensor): + A tensor of dimension `(..., freq, num_frame)` with complex dtype. + overriding_rate (float or None, optional): speed up to apply to this batch. + If no rate is passed, use ``self.fixed_rate``. (Default: ``None``) + + Returns: + Tensor: + Stretched spectrogram. The resulting tensor is of the same dtype as the input + spectrogram, but the number of frames is changed to ``ceil(num_frame / rate)``. + """ + if overriding_rate is None: + if self.fixed_rate is None: + raise ValueError("If no fixed_rate is specified, must pass a valid rate to the forward method.") + rate = self.fixed_rate + else: + rate = overriding_rate + return F.phase_vocoder(complex_specgrams, rate, self.phase_advance) + + +class Fade(torch.nn.Module): + r"""Add a fade in and/or fade out to an waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``) + fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``) + fade_shape (str, optional): Shape of fade. Must be one of: "quarter_sine", + ``"half_sine"``, ``"linear"``, ``"logarithmic"``, ``"exponential"``. + (Default: ``"linear"``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape='linear') + >>> faded_waveform = transform(waveform) + """ + + def __init__(self, fade_in_len: int = 0, fade_out_len: int = 0, fade_shape: str = "linear") -> None: + super(Fade, self).__init__() + self.fade_in_len = fade_in_len + self.fade_out_len = fade_out_len + self.fade_shape = fade_shape + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`. 
+ """ + waveform_length = waveform.size()[-1] + device = waveform.device + return self._fade_in(waveform_length, device) * self._fade_out(waveform_length, device) * waveform + + def _fade_in(self, waveform_length: int, device: torch.device) -> Tensor: + fade = torch.linspace(0, 1, self.fade_in_len, device=device) + ones = torch.ones(waveform_length - self.fade_in_len, device=device) + + if self.fade_shape == "linear": + fade = fade + + if self.fade_shape == "exponential": + fade = torch.pow(2, (fade - 1)) * fade + + if self.fade_shape == "logarithmic": + fade = torch.log10(0.1 + fade) + 1 + + if self.fade_shape == "quarter_sine": + fade = torch.sin(fade * math.pi / 2) + + if self.fade_shape == "half_sine": + fade = torch.sin(fade * math.pi - math.pi / 2) / 2 + 0.5 + + return torch.cat((fade, ones)).clamp_(0, 1) + + def _fade_out(self, waveform_length: int, device: torch.device) -> Tensor: + fade = torch.linspace(0, 1, self.fade_out_len, device=device) + ones = torch.ones(waveform_length - self.fade_out_len, device=device) + + if self.fade_shape == "linear": + fade = -fade + 1 + + if self.fade_shape == "exponential": + fade = torch.pow(2, -fade) * (1 - fade) + + if self.fade_shape == "logarithmic": + fade = torch.log10(1.1 - fade) + 1 + + if self.fade_shape == "quarter_sine": + fade = torch.sin(fade * math.pi / 2 + math.pi / 2) + + if self.fade_shape == "half_sine": + fade = torch.sin(fade * math.pi + math.pi / 2) / 2 + 0.5 + + return torch.cat((ones, fade)).clamp_(0, 1) + + +class _AxisMasking(torch.nn.Module): + r"""Apply masking to a spectrogram. + + Args: + mask_param (int): Maximum possible length of the mask. + axis (int): What dimension the mask is applied on. + iid_masks (bool): Applies iid masks to each of the examples in the batch dimension. + This option is applicable only when the input tensor is 4D. + p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0) + """ + __constants__ = ["mask_param", "axis", "iid_masks", "p"] + + def __init__(self, mask_param: int, axis: int, iid_masks: bool, p: float = 1.0) -> None: + + super(_AxisMasking, self).__init__() + self.mask_param = mask_param + self.axis = axis + self.iid_masks = iid_masks + self.p = p + + def forward(self, specgram: Tensor, mask_value: float = 0.0) -> Tensor: + r""" + Args: + specgram (Tensor): Tensor of dimension `(..., freq, time)`. + mask_value (float): Value to assign to the masked columns. + + Returns: + Tensor: Masked spectrogram of dimensions `(..., freq, time)`. + """ + # if iid_masks flag marked and specgram has a batch dimension + if self.iid_masks and specgram.dim() == 4: + return F.mask_along_axis_iid(specgram, self.mask_param, mask_value, self.axis + 1, p=self.p) + else: + return F.mask_along_axis(specgram, self.mask_param, mask_value, self.axis, p=self.p) + + +class FrequencyMasking(_AxisMasking): + r"""Apply masking to a spectrogram in the frequency domain. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Proposed in *SpecAugment* [:footcite:`specaugment`]. + + Args: + freq_mask_param (int): maximum possible length of the mask. + Indices uniformly sampled from [0, freq_mask_param). + iid_masks (bool, optional): whether to apply different masks to each + example/channel in the batch. (Default: ``False``) + This option is applicable only when the input tensor is 4D. 
+ + Example + >>> spectrogram = torchaudio.transforms.Spectrogram() + >>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80) + >>> + >>> original = spectrogram(waveform) + >>> masked = masking(original) + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png + :alt: The original spectrogram + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png + :alt: The spectrogram masked along frequency axis + """ + + def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None: + super(FrequencyMasking, self).__init__(freq_mask_param, 1, iid_masks) + + +class TimeMasking(_AxisMasking): + r"""Apply masking to a spectrogram in the time domain. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Proposed in *SpecAugment* [:footcite:`specaugment`]. + + Args: + time_mask_param (int): maximum possible length of the mask. + Indices uniformly sampled from [0, time_mask_param). + iid_masks (bool, optional): whether to apply different masks to each + example/channel in the batch. (Default: ``False``) + This option is applicable only when the input tensor is 4D. + p (float, optional): maximum proportion of time steps that can be masked. + Must be within range [0.0, 1.0]. (Default: 1.0) + + Example + >>> spectrogram = torchaudio.transforms.Spectrogram() + >>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80) + >>> + >>> original = spectrogram(waveform) + >>> masked = masking(original) + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png + :alt: The original spectrogram + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png + :alt: The spectrogram masked along time axis + """ + + def __init__(self, time_mask_param: int, iid_masks: bool = False, p: float = 1.0) -> None: + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + super(TimeMasking, self).__init__(time_mask_param, 2, iid_masks, p=p) + + +class Vol(torch.nn.Module): + r"""Add a volume to an waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + gain (float): Interpreted according to the given gain_type: + If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio. + If ``gain_type`` = ``power``, ``gain`` is a power (voltage squared). + If ``gain_type`` = ``db``, ``gain`` is in decibels. + gain_type (str, optional): Type of gain. One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``) + """ + + def __init__(self, gain: float, gain_type: str = "amplitude"): + super(Vol, self).__init__() + self.gain = gain + self.gain_type = gain_type + + if gain_type in ["amplitude", "power"] and gain < 0: + raise ValueError("If gain_type = amplitude or power, gain must be positive.") + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`. + """ + if self.gain_type == "amplitude": + waveform = waveform * self.gain + + if self.gain_type == "db": + waveform = F.gain(waveform, self.gain) + + if self.gain_type == "power": + waveform = F.gain(waveform, 10 * math.log10(self.gain)) + + return torch.clamp(waveform, -1, 1) + + +class SlidingWindowCmn(torch.nn.Module): + r""" + Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Args: + cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) + min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start). + Only applicable if center == false, ignored if center==true (int, default = 100) + center (bool, optional): If true, use a window centered on the current frame + (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) + norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false) + """ + + def __init__( + self, cmn_window: int = 600, min_cmn_window: int = 100, center: bool = False, norm_vars: bool = False + ) -> None: + super().__init__() + self.cmn_window = cmn_window + self.min_cmn_window = min_cmn_window + self.center = center + self.norm_vars = norm_vars + + def forward(self, specgram: Tensor) -> Tensor: + r""" + Args: + specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`. + + Returns: + Tensor: Tensor of spectrogram of dimension `(..., time, freq)`. + """ + cmn_specgram = F.sliding_window_cmn(specgram, self.cmn_window, self.min_cmn_window, self.center, self.norm_vars) + return cmn_specgram + + +class Vad(torch.nn.Module): + r"""Voice Activity Detector. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Attempts to trim silence and quiet background sounds from the ends of recordings of speech. + The algorithm currently uses a simple cepstral power measurement to detect voice, + so may be fooled by other things, especially music. + + The effect can trim only from the front of the audio, + so in order to trim from the back, the reverse effect must also be used. + + Args: + sample_rate (int): Sample rate of audio signal. + trigger_level (float, optional): The measurement level used to trigger activity detection. + This may need to be cahnged depending on the noise level, signal level, + and other characteristics of the input audio. (Default: 7.0) + trigger_time (float, optional): The time constant (in seconds) + used to help ignore short bursts of sound. (Default: 0.25) + search_time (float, optional): The amount of audio (in seconds) + to search for quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 1.0) + allowed_gap (float, optional): The allowed gap (in seconds) between + quiteter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 0.25) + pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve + before the trigger point and any found quieter/shorter bursts. (Default: 0.0) + boot_time (float, optional) The algorithm (internally) uses adaptive noise + estimation/reduction in order to detect the start of the wanted audio. + This option sets the time for the initial noise estimate. (Default: 0.35) + noise_up_time (float, optional) Time constant used by the adaptive noise estimator + for when the noise level is increasing. (Default: 0.1) + noise_down_time (float, optional) Time constant used by the adaptive noise estimator + for when the noise level is decreasing. (Default: 0.01) + noise_reduction_amount (float, optional) Amount of noise reduction to use in + the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35) + measure_freq (float, optional) Frequency of the algorithm’s + processing/measurements. (Default: 20.0) + measure_duration: (float or None, optional) Measurement duration. 
+ (Default: Twice the measurement period; i.e. with overlap.) + measure_smooth_time (float, optional) Time constant used to smooth + spectral measurements. (Default: 0.4) + hp_filter_freq (float, optional) "Brick-wall" frequency of high-pass filter applied + at the input to the detector algorithm. (Default: 50.0) + lp_filter_freq (float, optional) "Brick-wall" frequency of low-pass filter applied + at the input to the detector algorithm. (Default: 6000.0) + hp_lifter_freq (float, optional) "Brick-wall" frequency of high-pass lifter used + in the detector algorithm. (Default: 150.0) + lp_lifter_freq (float, optional) "Brick-wall" frequency of low-pass lifter used + in the detector algorithm. (Default: 2000.0) + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + def __init__( + self, + sample_rate: int, + trigger_level: float = 7.0, + trigger_time: float = 0.25, + search_time: float = 1.0, + allowed_gap: float = 0.25, + pre_trigger_time: float = 0.0, + boot_time: float = 0.35, + noise_up_time: float = 0.1, + noise_down_time: float = 0.01, + noise_reduction_amount: float = 1.35, + measure_freq: float = 20.0, + measure_duration: Optional[float] = None, + measure_smooth_time: float = 0.4, + hp_filter_freq: float = 50.0, + lp_filter_freq: float = 6000.0, + hp_lifter_freq: float = 150.0, + lp_lifter_freq: float = 2000.0, + ) -> None: + super().__init__() + + self.sample_rate = sample_rate + self.trigger_level = trigger_level + self.trigger_time = trigger_time + self.search_time = search_time + self.allowed_gap = allowed_gap + self.pre_trigger_time = pre_trigger_time + self.boot_time = boot_time + self.noise_up_time = noise_up_time + self.noise_down_time = noise_down_time + self.noise_reduction_amount = noise_reduction_amount + self.measure_freq = measure_freq + self.measure_duration = measure_duration + self.measure_smooth_time = measure_smooth_time + self.hp_filter_freq = hp_filter_freq + self.lp_filter_freq = lp_filter_freq + self.hp_lifter_freq = hp_lifter_freq + self.lp_lifter_freq = lp_lifter_freq + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)` + Tensor of shape `(channels, time)` is treated as a multi-channel recording + of the same event and the resulting output will be trimmed to the earliest + voice activity in any channel. + """ + return F.vad( + waveform=waveform, + sample_rate=self.sample_rate, + trigger_level=self.trigger_level, + trigger_time=self.trigger_time, + search_time=self.search_time, + allowed_gap=self.allowed_gap, + pre_trigger_time=self.pre_trigger_time, + boot_time=self.boot_time, + noise_up_time=self.noise_up_time, + noise_down_time=self.noise_down_time, + noise_reduction_amount=self.noise_reduction_amount, + measure_freq=self.measure_freq, + measure_duration=self.measure_duration, + measure_smooth_time=self.measure_smooth_time, + hp_filter_freq=self.hp_filter_freq, + lp_filter_freq=self.lp_filter_freq, + hp_lifter_freq=self.hp_lifter_freq, + lp_lifter_freq=self.lp_lifter_freq, + ) + + +class SpectralCentroid(torch.nn.Module): + r"""Compute the spectral centroid for each channel along the time axis. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The spectral centroid is defined as the weighted average of the + frequency values, weighted by their magnitude. + + Args: + sample_rate (int): Sample rate of audio signal. + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. 
(Default: ``400``) + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) + + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.SpectralCentroid(sample_rate) + >>> spectral_centroid = transform(waveform) # (channel, time) + """ + __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad"] + + def __init__( + self, + sample_rate: int, + n_fft: int = 400, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., Tensor] = torch.hann_window, + wkwargs: Optional[dict] = None, + ) -> None: + super(SpectralCentroid, self).__init__() + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 2 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + self.pad = pad + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: Spectral Centroid of size `(..., time)`. + """ + + return F.spectral_centroid( + waveform, self.sample_rate, self.pad, self.window, self.n_fft, self.hop_length, self.win_length + ) + + +class PitchShift(LazyModuleMixin, torch.nn.Module): + r"""Shift the pitch of a waveform by ``n_steps`` steps. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + sample_rate (int): Sample rate of `waveform`. + n_steps (int): The (fractional) steps to shift `waveform`. + bins_per_octave (int, optional): The number of steps per octave (Default: ``12``). + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``). + win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``). + hop_length (int or None, optional): Length of hop between STFT windows. If None, then ``win_length // 4`` + is used (Default: ``None``). + window_fn (Callable[..., Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
+ + Example + >>> waveform, sample_rate = torchaudio.load('test.wav', normalize=True) + >>> transform = transforms.PitchShift(sample_rate, 4) + >>> waveform_shift = transform(waveform) # (channel, time) + """ + __constants__ = ["sample_rate", "n_steps", "bins_per_octave", "n_fft", "win_length", "hop_length"] + + kernel: UninitializedParameter + width: int + + def __init__( + self, + sample_rate: int, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window_fn: Callable[..., Tensor] = torch.hann_window, + wkwargs: Optional[dict] = None, + ) -> None: + super().__init__() + self.n_steps = n_steps + self.bins_per_octave = bins_per_octave + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length if win_length is not None else n_fft + self.hop_length = hop_length if hop_length is not None else self.win_length // 4 + window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs) + self.register_buffer("window", window) + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + self.orig_freq = int(sample_rate / rate) + self.gcd = math.gcd(int(self.orig_freq), int(sample_rate)) + + if self.orig_freq != sample_rate: + self.width = -1 + self.kernel = UninitializedParameter(device=None, dtype=None) + + def initialize_parameters(self, input): + if self.has_uninitialized_params(): + if self.orig_freq != self.sample_rate: + with torch.no_grad(): + kernel, self.width = _get_sinc_resample_kernel( + self.orig_freq, + self.sample_rate, + self.gcd, + dtype=input.dtype, + device=input.device, + ) + self.kernel.materialize(kernel.shape) + self.kernel.copy_(kernel) + + def forward(self, waveform: Tensor) -> Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)`. + + Returns: + Tensor: The pitch-shifted audio of shape `(..., time)`. + """ + shape = waveform.size() + + waveform_stretch = _stretch_waveform( + waveform, + self.n_steps, + self.bins_per_octave, + self.n_fft, + self.win_length, + self.hop_length, + self.window, + ) + + if self.orig_freq != self.sample_rate: + waveform_shift = _apply_sinc_resample_kernel( + waveform_stretch, + self.orig_freq, + self.sample_rate, + self.gcd, + self.kernel, + self.width, + ) + else: + waveform_shift = waveform_stretch + + return _fix_waveform_shape( + waveform_shift, + shape, + ) + + +class RNNTLoss(torch.nn.Module): + """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* + [:footcite:`graves2012sequence`]. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output + dependencies. + + Args: + blank (int, optional): blank label (Default: ``-1``) + clamp (float, optional): clamp for gradients (Default: ``-1``) + reduction (string, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. 
(Default: ``'mean'``) + + Example + >>> # Hypothetical values + >>> logits = torch.tensor([[[[0.1, 0.6, 0.1, 0.1, 0.1], + >>> [0.1, 0.1, 0.6, 0.1, 0.1], + >>> [0.1, 0.1, 0.2, 0.8, 0.1]], + >>> [[0.1, 0.6, 0.1, 0.1, 0.1], + >>> [0.1, 0.1, 0.2, 0.1, 0.1], + >>> [0.7, 0.1, 0.2, 0.1, 0.1]]]], + >>> dtype=torch.float32, + >>> requires_grad=True) + >>> targets = torch.tensor([[1, 2]], dtype=torch.int) + >>> logit_lengths = torch.tensor([2], dtype=torch.int) + >>> target_lengths = torch.tensor([2], dtype=torch.int) + >>> transform = transforms.RNNTLoss(blank=0) + >>> loss = transform(logits, targets, logit_lengths, target_lengths) + >>> loss.backward() + """ + + def __init__( + self, + blank: int = -1, + clamp: float = -1.0, + reduction: str = "mean", + ): + super().__init__() + self.blank = blank + self.clamp = clamp + self.reduction = reduction + + def forward( + self, + logits: Tensor, + targets: Tensor, + logit_lengths: Tensor, + target_lengths: Tensor, + ): + """ + Args: + logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)` + containing output from joiner + targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padded + logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder + target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence + Returns: + Tensor: Loss with the reduction option applied. If ``reduction`` is ``'none'``, then size (batch), + otherwise scalar. + """ + return F.rnnt_loss(logits, targets, logit_lengths, target_lengths, self.blank, self.clamp, self.reduction) diff --git a/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/utils/download.py b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..c4a3a062ddb96d7988bef4a5c8bbdf886f51a92d --- /dev/null +++ b/my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/utils/download.py @@ -0,0 +1,89 @@ +import hashlib +import logging +from os import PathLike +from pathlib import Path +from typing import Union + +import torch + + +_LG = logging.getLogger(__name__) + + +def _get_local_path(key): + path = Path(torch.hub.get_dir()) / "torchaudio" / Path(key) + path.parent.mkdir(parents=True, exist_ok=True) + return path + + +def _download(key, path, progress): + url = f"https://download.pytorch.org/torchaudio/{key}" + torch.hub.download_url_to_file(url, path, progress=progress) + + +def _get_hash(path, hash, chunk_size=1028): + m = hashlib.sha256() + with open(path, "rb") as file: + data = file.read(chunk_size) + while data: + m.update(data) + data = file.read(chunk_size) + return m.hexdigest() + + +def download_asset( + key: str, + hash: str = "", + path: Union[str, PathLike] = "", + *, + progress: bool = True, +) -> str: + """Download and store torchaudio assets to local file system. + + If a file exists at the download path, then that path is returned with or without + hash validation. + + Args: + key (str): The asset identifier. + hash (str, optional): + The value of SHA256 hash of the asset. If provided, it is used to verify + the downloaded / cached object. If not provided, then no hash validation + is performed. This means if a file exists at the download path, then the path + is returned as-is without verifying the identity of the file. 
+ path (path-like object, optional): + By default, the downloaded asset is saved in a directory under + :py:func:`torch.hub.get_dir` and intermediate directories based on the given `key` + are created. + This argument can be used to override the target location. + When this argument is provided, all the intermediate directories have to be + created beforehand. + progress (bool): Whether to show progress bar for downloading. Default: ``True``. + + Note: + Currently the valid key values are the routes on ``download.pytorch.org/torchaudio``, + but this is an implementation detail. + + Returns: + str: The path to the asset on the local file system. + """ + # Normalize a user-provided str / PathLike to Path so that path.exists() works below. + path = Path(path) if path else _get_local_path(key) + + if path.exists(): + _LG.info("The local file (%s) exists. Skipping the download.", path) + else: + _LG.info("Downloading %s to %s", key, path) + _download(key, path, progress=progress) + + if hash: + _LG.info("Verifying the hash value.") + digest = _get_hash(path, hash) + + if digest != hash: + raise ValueError( + f"The hash value of the downloaded file ({path}), '{digest}' does not match " + f"the provided hash value, '{hash}'." + ) + + _LG.info("Hash validated.") + + return str(path)
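
Editor's note: ``download_asset`` is the only public entry point in the new ``torchaudio/utils/download.py`` module shown above. The following minimal sketch illustrates how it is meant to be called, based solely on the signature in the diff; the asset key is a hypothetical placeholder, and passing ``hash`` triggers SHA256 verification of the cached or downloaded file.

# Minimal usage sketch for download_asset (hypothetical asset key).
import torchaudio
from torchaudio.utils.download import download_asset

# With no explicit path, the file is cached under torch.hub.get_dir()/torchaudio/<key>.
local_path = download_asset("some/example-asset.wav")  # placeholder key

waveform, sample_rate = torchaudio.load(local_path)

# Supplying a known SHA256 digest makes the helper verify the file and
# raise ValueError if the digest does not match:
# local_path = download_asset("some/example-asset.wav", hash="<sha256 hex digest>")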
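Editor's note: the transforms documented in this diff (``Vol``, ``Vad``, ``SlidingWindowCmn``, ``SpectralCentroid``) are ordinary ``torch.nn.Module`` objects, so they compose directly on waveforms. The sketch below is an illustrative pipeline, not part of the diff: the input file name is a placeholder, only constructor arguments that appear above are used, and trailing silence is trimmed by reversing the signal, as the ``Vad`` docstring suggests.

# Illustrative pipeline over the transforms defined above; "speech.wav" is a placeholder path.
import torch
import torchaudio
import torchaudio.transforms as T

waveform, sample_rate = torchaudio.load("speech.wav")

# Boost the level by 6 dB; Vol clamps the result to [-1, 1].
waveform = T.Vol(gain=6.0, gain_type="db")(waveform)

# Vad trims only leading silence, so trailing silence is removed by
# reversing the signal, applying Vad again, and reversing back.
vad = T.Vad(sample_rate=sample_rate)
waveform = vad(waveform)
waveform = torch.flip(vad(torch.flip(waveform, [-1])), [-1])

# Per-frame spectral centroid, shape (..., time).
centroid = T.SpectralCentroid(sample_rate)(waveform)

# SlidingWindowCmn expects (..., time, freq), so the (..., freq, time)
# spectrogram is transposed before normalization.
specgram = T.Spectrogram()(waveform)
normalized = T.SlidingWindowCmn(cmn_window=600, norm_vars=True)(specgram.transpose(-1, -2))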