diff --git a/.gitattributes b/.gitattributes
index 2b716bbd9280bea303a16943d1aad9687eefda20..9255f7e32eab613442ed1ba133efa16f7356cac9 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -51,3 +51,14 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Plex/Trans
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ExprNodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..0460cbae69bb4331de217febe2102ddc98276943
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35bbd7708e61d6b2d4704c7139018d3eae67bca303d9fa03228b50845f6fffe6
+size 340320
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de5990d5f865148dca19d1380505714f00d679ad
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e03d22fd7cc8b4e378f65e07858c4720dcc03e0fa3553c776863e4969826cfd4
+size 145746
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b2569700f7d23448c43dd5ef7e1aa08aca06537
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c423f97f1ac36f06a8a2c6ff723696608c3e094001049a85ad421706ae558dea
+size 522167
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60261f39b9770d7458b77b9729df9f271f7e8ee7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab3f1d1811e8f1f97f96bc002bc8705a4adb7a26f43def577bf24b25263f4b32
+size 213081
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72f3b8d343a587b579984deb2ebf7487e5af6cea
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:465b72a0af764658a62bbb1d50e50b9a762ba16ddb1a6be0dd5b3b1f15c8a205
+size 254554
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b21fa6a35fad1e2aa4d277778fcc8dbdfa0643da
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f505b823a26bd0da98ceb5e93ba4f79513f56cebf4f8cb1c8ed579dcdabaac32
+size 129942
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71ade6609aaccad8856ecff8899d967fe3ee91ee
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_unix.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_unix.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9cdf7e8ffe81efb2449ef9e3e232c6b16844915
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_unix.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/RECORD
new file mode 100644
index 0000000000000000000000000000000000000000..efaf5f8ceafaeab082f59ade8a16b3d560198153
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/RECORD
@@ -0,0 +1,104 @@
+fsspec-2024.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+fsspec-2024.2.0.dist-info/LICENSE,sha256=LcNUls5TpzB5FcAIqESq1T53K0mzTN0ARFBnaRQH7JQ,1513
+fsspec-2024.2.0.dist-info/METADATA,sha256=uwzW1Braxnd_QGVI8W6J0KHi5KTiTJEm8YzSUdG-_Dc,6786
+fsspec-2024.2.0.dist-info/RECORD,,
+fsspec-2024.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+fsspec-2024.2.0.dist-info/top_level.txt,sha256=blt2pDrQDwN3Gklcw13CSPLQRd6aaOgJ8AxqrW395MI,7
+fsspec/__init__.py,sha256=2kT62GfFK-AjgS-LgwSsCo_VA2IePvsyv8Ash5oiaFA,1982
+fsspec/__pycache__/__init__.cpython-311.pyc,,
+fsspec/__pycache__/_version.cpython-311.pyc,,
+fsspec/__pycache__/archive.cpython-311.pyc,,
+fsspec/__pycache__/asyn.cpython-311.pyc,,
+fsspec/__pycache__/caching.cpython-311.pyc,,
+fsspec/__pycache__/callbacks.cpython-311.pyc,,
+fsspec/__pycache__/compression.cpython-311.pyc,,
+fsspec/__pycache__/config.cpython-311.pyc,,
+fsspec/__pycache__/conftest.cpython-311.pyc,,
+fsspec/__pycache__/core.cpython-311.pyc,,
+fsspec/__pycache__/dircache.cpython-311.pyc,,
+fsspec/__pycache__/exceptions.cpython-311.pyc,,
+fsspec/__pycache__/fuse.cpython-311.pyc,,
+fsspec/__pycache__/generic.cpython-311.pyc,,
+fsspec/__pycache__/gui.cpython-311.pyc,,
+fsspec/__pycache__/mapping.cpython-311.pyc,,
+fsspec/__pycache__/parquet.cpython-311.pyc,,
+fsspec/__pycache__/registry.cpython-311.pyc,,
+fsspec/__pycache__/spec.cpython-311.pyc,,
+fsspec/__pycache__/transaction.cpython-311.pyc,,
+fsspec/__pycache__/utils.cpython-311.pyc,,
+fsspec/_version.py,sha256=onTKKWe4fXkBjQxbTwM82SUT0H3x4U17IYrciFAryaU,500
+fsspec/archive.py,sha256=S__DzfZj-urAN3tp2W6jJ6YDiXG1fAl7FjvWUN73qIE,2386
+fsspec/asyn.py,sha256=kJ45sFFya2lZsmu2v8CVc8ZPRs8AccEzAy6Jot2ylkU,36157
+fsspec/caching.py,sha256=N45pzJdD4w5FOX_sxGvHWirggPNB66JTGP1HH6fpSck,28781
+fsspec/callbacks.py,sha256=BDIwLzK6rr_0V5ch557fSzsivCElpdqhXr5dZ9Te-EE,9210
+fsspec/compression.py,sha256=Yyd8FXw2rwWRtVoRVah_yguv-J7BUcBo4yDu6Qt52a0,4859
+fsspec/config.py,sha256=LF4Zmu1vhJW7Je9Q-cwkRc3xP7Rhyy7Xnwj26Z6sv2g,4279
+fsspec/conftest.py,sha256=fVfx-NLrH_OZS1TIpYNoPzM7efEcMoL62reHOdYeFCA,1245
+fsspec/core.py,sha256=0yCj1Z5MhbSDIQiqFs49VORl9QaGwV6hp9bXdkIoPIo,22363
+fsspec/dircache.py,sha256=YzogWJrhEastHU7vWz-cJiJ7sdtLXFXhEpInGKd4EcM,2717
+fsspec/exceptions.py,sha256=xcS7LiRrQ748kvOB9mrUR14kpjNztrHgEkZWi9M-VaI,330
+fsspec/fuse.py,sha256=66amOa6wdIbS0DMhhfAPUoOB37HPorfXD1izV0prmTY,10145
+fsspec/generic.py,sha256=NuNaP66OaphwMbuLHRFBLda78TD81isa9O4ozJqbUv0,13455
+fsspec/gui.py,sha256=XKoXZpUhRE7jOhRCJH4-jRbKhVu56aS8h9tecvPD3nc,13932
+fsspec/implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fsspec/implementations/__pycache__/__init__.cpython-311.pyc,,
+fsspec/implementations/__pycache__/arrow.cpython-311.pyc,,
+fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc,,
+fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc,,
+fsspec/implementations/__pycache__/cached.cpython-311.pyc,,
+fsspec/implementations/__pycache__/dask.cpython-311.pyc,,
+fsspec/implementations/__pycache__/data.cpython-311.pyc,,
+fsspec/implementations/__pycache__/dbfs.cpython-311.pyc,,
+fsspec/implementations/__pycache__/dirfs.cpython-311.pyc,,
+fsspec/implementations/__pycache__/ftp.cpython-311.pyc,,
+fsspec/implementations/__pycache__/git.cpython-311.pyc,,
+fsspec/implementations/__pycache__/github.cpython-311.pyc,,
+fsspec/implementations/__pycache__/http.cpython-311.pyc,,
+fsspec/implementations/__pycache__/jupyter.cpython-311.pyc,,
+fsspec/implementations/__pycache__/libarchive.cpython-311.pyc,,
+fsspec/implementations/__pycache__/local.cpython-311.pyc,,
+fsspec/implementations/__pycache__/memory.cpython-311.pyc,,
+fsspec/implementations/__pycache__/reference.cpython-311.pyc,,
+fsspec/implementations/__pycache__/sftp.cpython-311.pyc,,
+fsspec/implementations/__pycache__/smb.cpython-311.pyc,,
+fsspec/implementations/__pycache__/tar.cpython-311.pyc,,
+fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc,,
+fsspec/implementations/__pycache__/zip.cpython-311.pyc,,
+fsspec/implementations/arrow.py,sha256=_7TLuV6ZzNlpmUU_v6ud56u2wadzsKmY5qugPBxgMEs,8649
+fsspec/implementations/cache_mapper.py,sha256=iHgBA6gjzDJ7_mBboHFzpLTf55HP3UEwUOZ43xyUK4M,2429
+fsspec/implementations/cache_metadata.py,sha256=ZvyA7Y3KK-5Ct4E5pELzD6mH_5T03XqaKVT96qYDADU,8576
+fsspec/implementations/cached.py,sha256=LbbPbeUup07O0y7gXD_atFgajWM9p1vlDKu_BOyLfbo,30943
+fsspec/implementations/dask.py,sha256=CXZbJzIVOhKV8ILcxuy3bTvcacCueAbyQxmvAkbPkrk,4466
+fsspec/implementations/data.py,sha256=Oti0dKzyeadnVIedo3s8CADoh9bNM-96_6viTEYr4lo,1245
+fsspec/implementations/dbfs.py,sha256=cix9OYUveuSOx5UO5uRUwNUkYqjzyY0fkKnca1kTgZ0,15014
+fsspec/implementations/dirfs.py,sha256=inDIRSDPhI1_ud1MMBFrpZQ11VIAMJ_dZQtbE4V08Ng,11384
+fsspec/implementations/ftp.py,sha256=rp6cTog8xqjDPlKdSLKcsyP7K593_ByMabxGbNSEpTo,11655
+fsspec/implementations/git.py,sha256=vKGI-Vd5q4H2RrvhebkPc9NwlfkZ980OUGhebeCw-M0,4034
+fsspec/implementations/github.py,sha256=0kIiKkeAaROuHgdWBHVQFrzJ2ZfoDgymCehL_kJXHYA,7565
+fsspec/implementations/http.py,sha256=PkhfgUV3-T7fG2Jf-NLX9doH52snV5Wmw91uVA9k74M,29454
+fsspec/implementations/jupyter.py,sha256=B2uj7OEm7yIk-vRSsO37_ND0t0EBvn4B-Su43ibN4Pg,3811
+fsspec/implementations/libarchive.py,sha256=5_I2DiLXwQ1JC8x-K7jXu-tBwhO9dj7tFLnb0bTnVMQ,7102
+fsspec/implementations/local.py,sha256=nxiRKg9FAQHTQss9-ET8ZzDXPGhSOktgkxrg0ffMs2I,13454
+fsspec/implementations/memory.py,sha256=2iU--pOV2KCTrS-d5K8VKSygh9MPk2D7NZ_C8lMMEIw,9701
+fsspec/implementations/reference.py,sha256=0iGu8mscaQ3a5iTlRNByytQ3_-1Bj8__ARqVwyy4q2M,43871
+fsspec/implementations/sftp.py,sha256=fMY9XZcmpjszQ2tCqO_TPaJesaeD_Dv7ptYzgUPGoO0,5631
+fsspec/implementations/smb.py,sha256=k3RtzW97lJtYuw_QpP1rJRFnUBmSsw9twFjUCex0a5U,10591
+fsspec/implementations/tar.py,sha256=dam78Tp_CozybNqCY2JYgGBS3Uc9FuJUAT9oB0lolOs,4111
+fsspec/implementations/webhdfs.py,sha256=wqVfno7z0TY1HepaIvKTUUcl_bi5NkV6qWsST8t_s7Y,16745
+fsspec/implementations/zip.py,sha256=JDX-3HOI15qUl6VTBsNPuDp5RVN6s2n3Bywd4mMu0T0,4347
+fsspec/mapping.py,sha256=WFEXRWxujQwfzzkRP5tpdIE0265okAtlP97qFZGvV1k,8165
+fsspec/parquet.py,sha256=qVxDhwc960SGOt5etcYAJxCr-7HQKP01687KpDR02Gw,19463
+fsspec/registry.py,sha256=-dl7sh2tsfhMA2uxz5KQDsPFehQTgMJIbVjNq6QLoKU,11145
+fsspec/spec.py,sha256=3t96RgizRN_slIuHXnuR0bXjVUfBS1TfuDrEua4oQvE,66277
+fsspec/tests/abstract/__init__.py,sha256=i1wcFixV6QhOwdoB24c8oXjzobISNqiKVz9kl2DvAY8,10028
+fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc,,
+fsspec/tests/abstract/__pycache__/common.cpython-311.pyc,,
+fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc,,
+fsspec/tests/abstract/__pycache__/get.cpython-311.pyc,,
+fsspec/tests/abstract/__pycache__/put.cpython-311.pyc,,
+fsspec/tests/abstract/common.py,sha256=1GQwNo5AONzAnzZj0fWgn8NJPLXALehbsuGxS3FzWVU,4973
+fsspec/tests/abstract/copy.py,sha256=gU5-d97U3RSde35Vp4RxPY4rWwL744HiSrJ8IBOp9-8,19967
+fsspec/tests/abstract/get.py,sha256=vNR4HztvTR7Cj56AMo7_tx7TeYz1Jgr_2Wb8Lv-UiBY,20755
+fsspec/tests/abstract/put.py,sha256=7aih17OKB_IZZh1Mkq1eBDIjobhtMQmI8x-Pw-S_aZk,21201
+fsspec/transaction.py,sha256=jeexB-H6Aw_gN6Z7hoKKe6v8zizITq39-gyTgpipIKE,2251
+fsspec/utils.py,sha256=_VX_0VwDtoAFSjMYrxvJvnPNX9FMoHO5BlFHXJ0bHFI,23053
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/WHEEL
new file mode 100644
index 0000000000000000000000000000000000000000..98c0d20b7a64f4f998d7913e1d38a05dba20916c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.42.0)
+Root-Is-Purelib: true
+Tag: py3-none-any
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/top_level.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..968fea66e533ba30593c7fbfe750c36fae2f3cfe
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/top_level.txt
@@ -0,0 +1 @@
+fsspec
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..a2be08170e5dea4db678c7a8f3dccb7d37e8f332
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d0c8228a395e1b7975c5d22cd5fe655e5a7b7024723a69164e0c9045aee847d
+size 324168
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1411fcaefbf746364892ac2d72f6109b630472b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc6ec603b289fea3017e8bb0c8eb537328f368d775f0aee16f2837595da3258b
+size 110499
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/__pycache__/ctx_mp.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/__pycache__/ctx_mp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..598fa60a0df31c902712c040cef5ecec8db1f727
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/__pycache__/ctx_mp.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1573114afc4fbce73f2ba9d2ddc99882c00027c0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__init__.py
@@ -0,0 +1,77 @@
+from .libmpf import (prec_to_dps, dps_to_prec, repr_dps,
+  round_down, round_up, round_floor, round_ceiling, round_nearest,
+  to_pickable, from_pickable, ComplexResult,
+  fzero, fnzero, fone, fnone, ftwo, ften, fhalf, fnan, finf, fninf,
+  math_float_inf, round_int, normalize, normalize1,
+  from_man_exp, from_int, to_man_exp, to_int, mpf_ceil, mpf_floor,
+  mpf_nint, mpf_frac,
+  from_float, from_npfloat, from_Decimal, to_float, from_rational, to_rational, to_fixed,
+  mpf_rand, mpf_eq, mpf_hash, mpf_cmp, mpf_lt, mpf_le, mpf_gt, mpf_ge,
+  mpf_pos, mpf_neg, mpf_abs, mpf_sign, mpf_add, mpf_sub, mpf_sum,
+  mpf_mul, mpf_mul_int, mpf_shift, mpf_frexp,
+  mpf_div, mpf_rdiv_int, mpf_mod, mpf_pow_int,
+  mpf_perturb,
+  to_digits_exp, to_str, str_to_man_exp, from_str, from_bstr, to_bstr,
+  mpf_sqrt, mpf_hypot)
+
+from .libmpc import (mpc_one, mpc_zero, mpc_two, mpc_half,
+  mpc_is_inf, mpc_is_infnan, mpc_to_str, mpc_to_complex, mpc_hash,
+  mpc_conjugate, mpc_is_nonzero, mpc_add, mpc_add_mpf,
+  mpc_sub, mpc_sub_mpf, mpc_pos, mpc_neg, mpc_shift, mpc_abs,
+  mpc_arg, mpc_floor, mpc_ceil,  mpc_nint, mpc_frac, mpc_mul, mpc_square,
+  mpc_mul_mpf, mpc_mul_imag_mpf, mpc_mul_int,
+  mpc_div, mpc_div_mpf, mpc_reciprocal, mpc_mpf_div,
+  complex_int_pow, mpc_pow, mpc_pow_mpf, mpc_pow_int,
+  mpc_sqrt, mpc_nthroot, mpc_cbrt, mpc_exp, mpc_log, mpc_cos, mpc_sin,
+  mpc_tan, mpc_cos_pi, mpc_sin_pi, mpc_cosh, mpc_sinh, mpc_tanh,
+  mpc_atan, mpc_acos, mpc_asin, mpc_asinh, mpc_acosh, mpc_atanh,
+  mpc_fibonacci, mpf_expj, mpf_expjpi, mpc_expj, mpc_expjpi,
+  mpc_cos_sin, mpc_cos_sin_pi)
+
+from .libelefun import (ln2_fixed, mpf_ln2, ln10_fixed, mpf_ln10,
+  pi_fixed, mpf_pi, e_fixed, mpf_e, phi_fixed, mpf_phi,
+  degree_fixed, mpf_degree,
+  mpf_pow, mpf_nthroot, mpf_cbrt, log_int_fixed, agm_fixed,
+  mpf_log, mpf_log_hypot, mpf_exp, mpf_cos_sin, mpf_cos, mpf_sin, mpf_tan,
+  mpf_cos_sin_pi, mpf_cos_pi, mpf_sin_pi, mpf_cosh_sinh,
+  mpf_cosh, mpf_sinh, mpf_tanh, mpf_atan, mpf_atan2, mpf_asin,
+  mpf_acos, mpf_asinh, mpf_acosh, mpf_atanh, mpf_fibonacci)
+
+from .libhyper import (NoConvergence, make_hyp_summator,
+  mpf_erf, mpf_erfc, mpf_ei, mpc_ei, mpf_e1, mpc_e1, mpf_expint,
+  mpf_ci_si, mpf_ci, mpf_si, mpc_ci, mpc_si, mpf_besseljn,
+  mpc_besseljn, mpf_agm, mpf_agm1, mpc_agm, mpc_agm1,
+  mpf_ellipk, mpc_ellipk, mpf_ellipe, mpc_ellipe)
+
+from .gammazeta import (catalan_fixed, mpf_catalan,
+  khinchin_fixed, mpf_khinchin, glaisher_fixed, mpf_glaisher,
+  apery_fixed, mpf_apery, euler_fixed, mpf_euler, mertens_fixed,
+  mpf_mertens, twinprime_fixed, mpf_twinprime,
+  mpf_bernoulli, bernfrac, mpf_gamma_int,
+  mpf_factorial, mpc_factorial, mpf_gamma, mpc_gamma,
+  mpf_loggamma, mpc_loggamma, mpf_rgamma, mpc_rgamma,
+  mpf_harmonic, mpc_harmonic, mpf_psi0, mpc_psi0,
+  mpf_psi, mpc_psi, mpf_zeta_int, mpf_zeta, mpc_zeta,
+  mpf_altzeta, mpc_altzeta, mpf_zetasum, mpc_zetasum)
+
+from .libmpi import (mpi_str,
+  mpi_from_str, mpi_to_str,
+  mpi_eq, mpi_ne,
+  mpi_lt, mpi_le, mpi_gt, mpi_ge,
+  mpi_add, mpi_sub, mpi_delta, mpi_mid,
+  mpi_pos, mpi_neg, mpi_abs, mpi_mul, mpi_div, mpi_exp,
+  mpi_log, mpi_sqrt, mpi_pow_int, mpi_pow, mpi_cos_sin,
+  mpi_cos, mpi_sin, mpi_tan, mpi_cot,
+  mpi_atan, mpi_atan2,
+  mpci_pos, mpci_neg, mpci_add, mpci_sub, mpci_mul, mpci_div, mpci_pow,
+  mpci_abs, mpci_pow, mpci_exp, mpci_log, mpci_cos, mpci_sin,
+  mpi_gamma, mpci_gamma, mpi_loggamma, mpci_loggamma,
+  mpi_rgamma, mpci_rgamma, mpi_factorial, mpci_factorial)
+
+from .libintmath import (trailing, bitcount, numeral, bin_to_radix,
+  isqrt, isqrt_small, isqrt_fast, sqrt_fixed, sqrtrem, ifib, ifac,
+  list_primes, isprime, moebius, gcd, eulernum, stirling1, stirling2)
+
+from .backend import (gmpy, sage, BACKEND, STRICT, MPZ, MPZ_TYPE,
+  MPZ_ZERO, MPZ_ONE, MPZ_TWO, MPZ_THREE, MPZ_FIVE, int_types,
+  HASH_MODULUS, HASH_BITS)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libelefun.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libelefun.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b237a35594f26f29abf244b8aa8bad49ae13dc2
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libelefun.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libhyper.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libhyper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0f53d6b0c7006eb4a5fcf8f1f7cb6b5b5c214f8
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libhyper.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libmpc.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libmpc.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..904ba5a435ee661e1f2ecc6f769d3d80131f1bb1
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libmpc.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libelefun.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libelefun.py
new file mode 100644
index 0000000000000000000000000000000000000000..3de2e5aaef02296ed03dec3df3021b56823f3728
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libelefun.py
@@ -0,0 +1,1428 @@
+"""
+This module implements computation of elementary transcendental
+functions (powers, logarithms, trigonometric and hyperbolic
+functions, inverse trigonometric and hyperbolic) for real
+floating-point numbers.
+
+For complex and interval implementations of the same functions,
+see libmpc and libmpi.
+
+"""
+
+import math
+from bisect import bisect
+
+from .backend import xrange
+from .backend import MPZ, MPZ_ZERO, MPZ_ONE, MPZ_TWO, MPZ_FIVE, BACKEND
+
+from .libmpf import (
+    round_floor, round_ceiling, round_down, round_up,
+    round_nearest, round_fast,
+    ComplexResult,
+    bitcount, bctable, lshift, rshift, giant_steps, sqrt_fixed,
+    from_int, to_int, from_man_exp, to_fixed, to_float, from_float,
+    from_rational, normalize,
+    fzero, fone, fnone, fhalf, finf, fninf, fnan,
+    mpf_cmp, mpf_sign, mpf_abs,
+    mpf_pos, mpf_neg, mpf_add, mpf_sub, mpf_mul, mpf_div, mpf_shift,
+    mpf_rdiv_int, mpf_pow_int, mpf_sqrt,
+    reciprocal_rnd, negative_rnd, mpf_perturb,
+    isqrt_fast
+)
+
+from .libintmath import ifib
+
+
+#-------------------------------------------------------------------------------
+# Tuning parameters
+#-------------------------------------------------------------------------------
+
+# Cutoff for evaluating exp via cosh+sinh. That approach halves the
+# number of series terms but needs a square root, which is
+# expensive with the pure-Python square root code.
+EXP_COSH_CUTOFF = 600 if BACKEND == 'python' else 400
+# Cutoff for using more than 2 series
+EXP_SERIES_U_CUTOFF = 1500
+
+# The cos/sin cache precision is also basically determined by the
+# cost of sqrt, so it depends on the backend as well
+COS_SIN_CACHE_PREC = 400 if BACKEND == 'python' else 200
+COS_SIN_CACHE_STEP = 8
+cos_sin_cache = {}
+
+# Number of integer logarithms to keep cached (for zeta sums)
+MAX_LOG_INT_CACHE = 2000
+log_int_cache = {}
+
+# Taylor series with caching is used for log up to this precision
+LOG_TAYLOR_PREC = 2500
+# Log values are cached in steps of size 2^-N
+LOG_TAYLOR_SHIFT = 9
+log_taylor_cache = {}
+# prec/size ratio of x giving the fastest convergence in the AGM formula
+LOG_AGM_MAG_PREC_RATIO = 20
+
+# Same as for log
+ATAN_TAYLOR_PREC = 3000
+# Cached in steps of size 2^-N
+ATAN_TAYLOR_SHIFT = 7
+atan_taylor_cache = {}
+
+
+# Each entry is ~= next power of two + 20
+cache_prec_steps = [22, 22]
+for k in xrange(1, bitcount(LOG_TAYLOR_PREC)+1):
+    cache_prec_steps.extend(
+        [min(2**k, LOG_TAYLOR_PREC) + 20] * 2**(k-1))
+
+
+#----------------------------------------------------------------------------#
+#                                                                            #
+#                   Elementary mathematical constants                        #
+#                                                                            #
+#----------------------------------------------------------------------------#
+
+def constant_memo(f):
+    """
+    Memoizing decorator for mathematical constants. The decorated
+    function must accept the precision prec as its argument and
+    return a fixed-point value with that precision. The most
+    precise value computed so far is stored on f and is shifted
+    down whenever a request can be served from it.
+    """
+    f.memo_prec, f.memo_val = -1, None
+    def g(prec, **kwargs):
+        have = f.memo_prec
+        if have < prec:
+            # Cache miss: recompute with about 5% + 10 extra bits
+            have = int(prec*1.05+10)
+            f.memo_val = f(have, **kwargs)
+            f.memo_prec = have
+        # Discard the surplus low bits to return exactly prec bits
+        return f.memo_val >> (have-prec)
+    g.__name__, g.__doc__ = f.__name__, f.__doc__
+    return g
+
+def def_mpf_constant(fixed):
+    """
+    Build an mpf-returning function for a mathematical constant from
+    a function that returns the constant's fixed-point value.
+
+    Assumed: the constant is positive with magnitude ~= 1, and the
+    fixed-point function rounds to floor.
+    """
+    def f(prec, rnd=round_fast):
+        wp = prec + 20  # working precision: 20 extra bits
+        # The fixed value is floored, so add one unit to round upward
+        man = fixed(wp)
+        man = man + 1 if rnd in (round_up, round_ceiling) else man
+        return normalize(0, man, -wp, bitcount(man), prec, rnd)
+    f.__doc__ = fixed.__doc__
+    return f
+
+def bsp_acot(q, a, b, hyperbolic):
+    """Binary-splitting helper for acot_fixed: returns a triple (p, q, r)
+    of integers for the index range [a, b) of the acot/acoth series."""
+    if b - a != 1:
+        mid = (a+b)//2
+        pl, ql, rl = bsp_acot(q, a, mid, hyperbolic)
+        pr, qr, rr = bsp_acot(q, mid, b, hyperbolic)
+        return qr*pl + rl*pr, ql*qr, rl*rr
+    odd = MPZ(2*a + 3)
+    sign = MPZ_ONE if (hyperbolic or a&1) else -MPZ_ONE
+    return sign, odd * q**2, odd
+
+# The acoth(x) series converges like the geometric series for x^2;
+# number of terms: N = ceil(p*log(2)/(2*log(x)))
+def acot_fixed(a, prec, hyperbolic):
+    """
+    Evaluate acot(a), or acoth(a) if hyperbolic is set, for an integer a
+    as a fixed-point number with prec bits, using binary splitting; see
+    http://numbers.computation.free.fr/Constants/Algorithms/splitting.html
+    """
+    num, den, _ = bsp_acot(a, 0, int(0.35 * prec/math.log(a) + 20), hyperbolic)
+    return ((num+den)<<prec)//(den*a)
+
+def machin(coefs, prec, hyperbolic=False):
+    """
+    Evaluate a Machin-like formula in fixed-point arithmetic: for
+    coefs = [(c, n), ...] this returns the sum of c*acot(n), or of
+    c*acoth(n) when hyperbolic is true, scaled by 2**prec. The
+    terms are computed with 10 guard bits that are shifted away.
+    """
+    guard = 10
+    wp = prec + guard
+    terms = (MPZ(c) * acot_fixed(MPZ(n), wp, hyperbolic) for c, n in coefs)
+    total = sum(terms, MPZ_ZERO)
+    return total >> guard
+
+# Logarithms of integers are needed for various computations involving
+# logarithms, powers, radix conversion, etc
+
+@constant_memo
+def ln2_fixed(prec):
+    """
+    Fixed-point ln(2), from the hyperbolic Machin-type formula
+    18*acoth(26) - 2*acoth(4801) + 8*acoth(8749) (binary splitting).
+    """
+    return machin(((18, 26), (-2, 4801), (8, 8749)), prec, hyperbolic=True)
+
+@constant_memo
+def ln10_fixed(prec):
+    """
+    Fixed-point ln(10), from a hyperbolic Machin-type formula.
+    """
+    return machin(((46, 31), (34, 49), (20, 161)), prec, hyperbolic=True)
+
+
+r"""
+For computation of pi, we use the Chudnovsky series:
+
+             oo
+             ___        k
+      1     \       (-1)  (6 k)! (A + B k)
+    ----- =  )     -----------------------
+    12 pi   /___               3  3k+3/2
+                    (3 k)! (k!)  C
+            k = 0
+
+where A, B, and C are certain integer constants. This series adds roughly
+14 digits per term. Note that C^(3/2) can be extracted so that the
+series contains only rational terms. This makes binary splitting very
+efficient.
+
+The recurrence formulas for the binary splitting were taken from
+ftp://ftp.gmplib.org/pub/src/gmp-chudnovsky.c
+
+Previously, Machin's formula was used at low precision and the AGM iteration
+was used at high precision. However, the Chudnovsky series is essentially as
+fast as the Machin formula at low precision and in practice about 3x faster
+than the AGM at high precision (despite theoretically having a worse
+asymptotic complexity), so there is no reason not to use it in all cases.
+
+"""
+
+# Integer constants of Chudnovsky's series: CHUD_A + CHUD_B*k is the
+# linear factor of each term, CHUD_C the base of the power in the
+# denominator, and CHUD_D the divisor applied to the final quotient.
+CHUD_A, CHUD_B = MPZ(13591409), MPZ(545140134)
+CHUD_C, CHUD_D = MPZ(640320), MPZ(12)
+
+def bs_chudnovsky(a, b, level, verbose):
+    """
+    Binary-splitting evaluation of the Chudnovsky series terms from
+    a to b. The result is a triple (g, p, q): p/q is the partial sum
+    as an exact fraction, and g is an auxiliary product that parent
+    calls reuse when merging two halves. level is the recursion
+    depth; it only limits the progress output printed if verbose.
+    """
+    if b-a == 1:
+        # Single term
+        g = MPZ((6*b-5)*(2*b-1)*(6*b-1))
+        sign = (-1)**b
+        return g, b**3 * CHUD_C**3 // 24, sign * g * (CHUD_A+CHUD_B*b)
+    # Only the top few recursion levels are reported
+    if verbose and level < 4:
+        print("  binary splitting", a, b)
+    mid = (a+b)//2
+    gl, pl, ql = bs_chudnovsky(a, mid, level+1, verbose)
+    gr, pr, qr = bs_chudnovsky(mid, b, level+1, verbose)
+    # Merge the two halves
+    return gl*gr, pl*pr, ql*pr + qr*gl
+
+@constant_memo
+def pi_fixed(prec, verbose=False, verbose_base=None):
+    """
+    Compute floor(pi * 2**prec) as a big integer.
+
+    The value comes from Chudnovsky's series, summed by binary
+    splitting (see comments in libelefun.py for details). With
+    ``verbose`` set, progress information is printed. ``verbose_base``
+    is accepted but not used by this function.
+    """
+    # prec/3.3219280948 converts bits to decimal digits; the series
+    # gives 14.18 digits per term. Two extra terms are added.
+    nterms = int(prec/3.3219280948/14.181647462 + 2)
+    if verbose:
+        print("binary splitting with N =", nterms)
+    g, p, q = bs_chudnovsky(0, nterms, 0, verbose)
+    # sqrt(C), scaled by 2**prec
+    root_c = isqrt_fast(CHUD_C<<(2*prec))
+    numerator = p*CHUD_C*root_c
+    denominator = (q+CHUD_A*p)*CHUD_D
+    return numerator//denominator
+
+def degree_fixed(prec):
+    """
+    Fixed-point value of one degree in radians, obtained by
+    floor-dividing pi_fixed(prec) by 180.
+    """
+    pi_value = pi_fixed(prec)
+    return pi_value//180
+
+def bspe(a, b):
+    """
+    Binary splitting for the series of exp(1)-1 over the range
+    a, b. The partial sum is returned as an exact fraction (p, q).
+    """
+    if b-a != 1:
+        mid = (a+b)//2
+        p_lo, q_lo = bspe(a, mid)
+        p_hi, q_hi = bspe(mid, b)
+        # Combine the two partial fractions
+        return p_lo*q_hi+p_hi, q_lo*q_hi
+    # Single term
+    return MPZ_ONE, MPZ(b)
+
+@constant_memo
+def e_fixed(prec):
+    """
+    Computes exp(1) as a fixed-point number with ``prec`` bits, using
+    the ordinary Taylor series for exp summed by binary splitting.
+    For a description of the algorithm, see:
+
+        http://numbers.computation.free.fr/Constants/
+            Algorithms/splitting.html
+    """
+    # Number of terms: a slight overestimate of the N needed for
+    # 1/N! < 2**(-prec). This could be tightened for large N.
+    nterms = int(1.1*prec/math.log(prec) + 20)
+    num, den = bspe(0,nterms)
+    # bspe sums exp(1)-1, so add 1 (= den/den) before scaling
+    return ((num+den)<<prec)//den
+
+@constant_memo
+def phi_fixed(prec):
+    """
+    Computes the golden ratio, (1+sqrt(5))/2, working with 10 guard
+    bits internally.
+    """
+    wp = prec + 10
+    root5 = isqrt_fast(MPZ_FIVE<<(2*wp))
+    one = MPZ_ONE << wp
+    # One shift drops the 10 guard bits and performs the division by 2
+    return (root5 + one) >> 11
+
+# Constant functions built from the fixed-point routines with
+# def_mpf_constant; elsewhere in this file they are called as
+# f(prec) or f(prec, rnd), e.g. mpf_pi(prec, rnd).
+mpf_phi    = def_mpf_constant(phi_fixed)
+mpf_pi     = def_mpf_constant(pi_fixed)
+mpf_e      = def_mpf_constant(e_fixed)
+mpf_degree = def_mpf_constant(degree_fixed)
+mpf_ln2    = def_mpf_constant(ln2_fixed)
+mpf_ln10   = def_mpf_constant(ln10_fixed)
+
+
+@constant_memo
+def ln_sqrt2pi_fixed(prec):
+    """
+    Fixed-point value of ln(sqrt(2*pi)), computed with 10 guard bits
+    from the identity ln(sqrt(2*pi)) = ln(2*pi)/2.
+    """
+    wp = prec + 10
+    two_pi = mpf_shift(mpf_pi(wp), 1)
+    log_two_pi = mpf_log(two_pi, wp)
+    # Converting with one bit less than prec takes care of the
+    # division by 2 in the identity above
+    return to_fixed(log_two_pi, prec-1)
+
+@constant_memo
+def sqrtpi_fixed(prec):
+    """
+    Fixed-point square root of pi: sqrt_fixed applied to pi_fixed(prec).
+    """
+    pi_value = pi_fixed(prec)
+    return sqrt_fixed(pi_value, prec)
+
+# Constant functions for sqrt(pi) and ln(sqrt(2*pi)), built from the
+# fixed-point routines above with def_mpf_constant
+mpf_sqrtpi   = def_mpf_constant(sqrtpi_fixed)
+mpf_ln_sqrt2pi   = def_mpf_constant(ln_sqrt2pi_fixed)
+
+
+#----------------------------------------------------------------------------#
+#                                                                            #
+#                                    Powers                                  #
+#                                                                            #
+#----------------------------------------------------------------------------#
+
+def mpf_pow(s, t, prec, rnd=round_fast):
+    """
+    Compute s**t. Raises ComplexResult if s is negative and t is
+    fractional.
+    """
+    # s and t are unpacked as (sign, man, exp, bc) tuples; the result is
+    # requested with prec bits and rounding mode rnd.
+    ssign, sman, sexp, sbc = s
+    tsign, tman, texp, tbc = t
+    # A negative exponent field in t is treated as "t is fractional"
+    if ssign and texp < 0:
+        raise ComplexResult("negative number raised to a fractional power")
+    # Integer exponent: rebuild the signed integer and use mpf_pow_int
+    if texp >= 0:
+        return mpf_pow_int(s, (-1)**tsign * (tman<<texp), prec, rnd)
+    # s**(n/2) = sqrt(s)**n
+    if texp == -1:
+        if tman == 1:
+            # t = +-1/2: a plain square root, or its reciprocal computed
+            # from a root taken with 10 extra bits
+            if tsign:
+                return mpf_div(fone, mpf_sqrt(s, prec+10,
+                    reciprocal_rnd[rnd]), prec, rnd)
+            return mpf_sqrt(s, prec, rnd)
+        else:
+            # t = +-tman/2: integer power of the square root, which is
+            # taken with 10 extra bits
+            if tsign:
+                return mpf_pow_int(mpf_sqrt(s, prec+10,
+                    reciprocal_rnd[rnd]), -tman, prec, rnd)
+            return mpf_pow_int(mpf_sqrt(s, prec+10, rnd), tman, prec, rnd)
+    # General formula: s**t = exp(t*log(s))
+    # TODO: handle rnd direction of the logarithm carefully
+    c = mpf_log(s, prec+10, rnd)
+    return mpf_exp(mpf_mul(t, c), prec, rnd)
+
+def int_pow_fixed(y, n, prec):
+    """n-th power of a fixed point number with precision prec
+
+       Returns the power in the form man, exp,
+       man * 2**exp ~= y**n
+    """
+    # Squaring is returned exactly, with a zero exponent
+    if n == 2:
+        return (y*y), 0
+    bc = bitcount(y)
+    exp = 0
+    # Intermediate values are truncated to at most workprec bits
+    workprec = 2 * (prec + 4*bitcount(n) + 4)
+    # pm, pe, pbc: mantissa, exponent and bit count of the running
+    # product, initialised from fone
+    _, pm, pe, pbc = fone
+    # Binary exponentiation: y holds the repeated squares (with
+    # exponent exp and bit count bc); it is multiplied into the running
+    # product whenever the current low bit of n is set.
+    while 1:
+        if n & 1:
+            pm = pm*y
+            pe = pe+exp
+            # Update the bit count of the product from an estimate plus
+            # a bctable lookup (presumably a small-integer bit-count
+            # table -- confirm against its definition)
+            pbc += bc - 2
+            pbc = pbc + bctable[int(pm >> pbc)]
+            # Drop low bits beyond workprec, compensating in pe
+            if pbc > workprec:
+                pm = pm >> (pbc-workprec)
+                pe += pbc - workprec
+                pbc = workprec
+            n -= 1
+            if not n:
+                break
+        # Square y and apply the same bit-count update and truncation
+        y = y*y
+        exp = exp+exp
+        bc = bc + bc - 2
+        bc = bc + bctable[int(y >> bc)]
+        if bc > workprec:
+            y = y >> (bc-workprec)
+            exp += bc - workprec
+            bc = workprec
+        n = n // 2
+    return pm, pe
+
+# mpf_nthroot(s, n, prec, rnd) computes the real n-th root of a
+# positive mpf tuple s.
+# To compute the root we start from a 50-bit estimate for r
+# generated with ordinary floating-point arithmetic, and then refine
+# the value to full accuracy using the iteration
+
+#            1  /                     y       \
+#   r     = --- | (n-1)  * r   +  ----------  |
+#    n+1     n  \           n     r_n**(n-1)  /
+
+# which is simply Newton's method applied to the equation r**n = y.
+# With giant_steps(start, prec+extra) = [p0,...,pm, prec+extra]
+# and y = man * 2**-shift  one has
+# (man * 2**exp)**(1/n) =
+# y**(1/n) * 2**(start-prec/n) * 2**(p0-start) * ... * 2**(prec+extra-pm) *
+# 2**((exp+shift-(n-1)*prec)/n - extra)
+# The last factor is accounted for by the exponent exp1 in mpf_nthroot.
+
+def nthroot_fixed(y, n, prec, exp1):
+    # Fixed-point Newton iteration for the n-th root of the integer y.
+    # Returns the refined root r as an integer. The exp1 argument is
+    # not used inside this function.
+    start = 50
+    # Initial estimate from ordinary floating-point arithmetic; if the
+    # float power overflows, the estimate is computed with mpf
+    # arithmetic at `start` bits instead.
+    try:
+        y1 = rshift(y, prec - n*start)
+        r = MPZ(int(y1**(1.0/n)))
+    except OverflowError:
+        y1 = from_int(y1, start)
+        fn = from_int(n)
+        fn = mpf_rdiv_int(1, fn, start)
+        r = mpf_pow(y1, fn, start)
+        r = to_int(r)
+    extra = 10
+    extra1 = n
+    prevp = start
+    # Each pass raises the working precision from prevp to p and applies
+    # r <- ((n-1)*r + y/r**(n-1)) / n in fixed point
+    for p in giant_steps(start, prec+extra):
+        # pm * 2**pe ~= r**(n-1)
+        pm, pe = int_pow_fixed(r, n-1, prevp)
+        r2 = rshift(pm, (n-1)*prevp - p - pe - extra1)
+        B = lshift(y, 2*p-prec+extra1)//r2
+        r = (B + (n-1) * lshift(r, p-prevp))//n
+        prevp = p
+    return r
+
+def mpf_nthroot(s, n, prec, rnd=round_fast):
+    """nth-root of a positive number
+
+    Use the Newton method when faster, otherwise use x**(1/n)
+    """
+    sign, man, exp, bc = s
+    if sign:
+        raise ComplexResult("nth root of a negative number")
+    # Zero mantissa: s is nan, zero or (by elimination) +infinity
+    if not man:
+        if s == fnan:
+            return fnan
+        if s == fzero:
+            if n > 0:
+                return fzero
+            if n == 0:
+                return fone
+            return finf
+        # Infinity
+        if not n:
+            return fnan
+        if n < 0:
+            return fzero
+        return finf
+    flag_inverse = False
+    # Trivial degrees, and negative degrees (root of the reciprocal)
+    if n < 2:
+        if n == 0:
+            return fone
+        if n == 1:
+            return mpf_pos(s, prec, rnd)
+        if n == -1:
+            return mpf_div(fone, s, prec, rnd)
+        # n < 0
+        # Compute the |n|-th root with a few extra bits and the
+        # reciprocal rounding mode; the result is inverted at the end.
+        rnd = reciprocal_rnd[rnd]
+        flag_inverse = True
+        extra_inverse = 5
+        prec += extra_inverse
+        n = -n
+    # For these combinations of n and prec, go through mpf_pow with the
+    # exponent 1/n instead of the Newton iteration
+    if n > 20 and (n >= 20000 or prec < int(233 + 28.3 * n**0.62)):
+        prec2 = prec + 10
+        fn = from_int(n)
+        nth = mpf_rdiv_int(1, fn, prec2)
+        r = mpf_pow(s, nth, prec2, rnd)
+        s = normalize(r[0], r[1], r[2], r[3], prec, rnd)
+        if flag_inverse:
+            return mpf_div(fone, s, prec-extra_inverse, rnd)
+        else:
+            return s
+    # Convert to a fixed-point number with prec2 bits.
+    prec2 = prec + 2*n - (prec%n)
+    # a few tests indicate that
+    # for 10 < n < 10**4 a bit more precision is needed
+    if n > 10:
+        prec2 += prec2//10
+        prec2 = prec2 - prec2%n
+    # Mantissa may have more bits than we need. Trim it down.
+    shift = bc - prec2
+    # Adjust exponents to make prec2 and exp+shift multiples of n.
+    sign1 = 0
+    es = exp+shift
+    if es < 0:
+        sign1 = 1
+        es = -es
+    if sign1:
+        shift += es%n
+    else:
+        shift -= es%n
+    man = rshift(man, shift)
+    extra = 10
+    # Exponent of the root returned by nthroot_fixed
+    exp1 = ((exp+shift-(n-1)*prec2)//n) - extra
+    # For some rounding directions the fixed-point input is bumped by
+    # one unit before the root is taken
+    rnd_shift = 0
+    if flag_inverse:
+        if rnd == 'u' or rnd == 'c':
+            rnd_shift = 1
+    else:
+        if rnd == 'd' or rnd == 'f':
+            rnd_shift = 1
+    man = nthroot_fixed(man+rnd_shift, n, prec2, exp1)
+    s = from_man_exp(man, exp1, prec, rnd)
+    if flag_inverse:
+        return mpf_div(fone, s, prec-extra_inverse, rnd)
+    else:
+        return s
+
+def mpf_cbrt(s, prec, rnd=round_fast):
+    """cubic root of a positive number, computed as its 3rd root
+    by mpf_nthroot"""
+    degree = 3
+    return mpf_nthroot(s, degree, prec, rnd)
+
+#----------------------------------------------------------------------------#
+#                                                                            #
+#                                Logarithms                                  #
+#                                                                            #
+#----------------------------------------------------------------------------#
+
+
+def log_int_fixed(n, prec, ln2=None):
+    """
+    Fast computation of log(n), caching the value for small n,
+    intended for zeta sums.
+    """
+    # Reuse a cached value if it was stored with at least prec bits,
+    # shifting away the surplus bits
+    if n in log_int_cache:
+        value, vprec = log_int_cache[n]
+        if vprec >= prec:
+            return value >> (vprec - prec)
+    # Work with 10 guard bits
+    wp = prec + 10
+    if wp <= LOG_TAYLOR_SHIFT:
+        # The caller may supply ln2; otherwise it is computed here
+        if ln2 is None:
+            ln2 = ln2_fixed(wp)
+        # Write n = (n / 2**r) * 2**r with r = bitcount(n), so that
+        # log(n) = log(n / 2**r) + r*log(2)
+        r = bitcount(n)
+        x = n << (wp-r)
+        v = log_taylor_cached(x, wp) + r*ln2
+    else:
+        v = to_fixed(mpf_log(from_int(n), wp+5), wp)
+    # Only values of n below MAX_LOG_INT_CACHE are stored
+    if n < MAX_LOG_INT_CACHE:
+        log_int_cache[n] = (v, wp)
+    return v >> (wp-prec)
+
+def agm_fixed(a, b, prec):
+    """
+    Fixed-point computation of agm(a,b), assuming
+    a, b both close to unit magnitude.
+
+    a and b are integers in a common fixed-point scale. prec is kept
+    for interface compatibility; the iteration itself does not use it.
+
+    Each pass replaces (a, b) by their arithmetic mean and the integer
+    square root of their product. After at least five full passes, the
+    loop stops as soon as the arithmetic mean differs from a by less
+    than 8 units, and returns a (the value before that last step).
+    """
+    i = 0
+    # The only exit is the return inside the loop, so no statement is
+    # needed after it.
+    while True:
+        anew = (a+b)>>1
+        if i > 4 and abs(a-anew) < 8:
+            return a
+        b = isqrt_fast(a*b)
+        a = anew
+        i += 1
+
+def log_agm(x, prec):
+    """
+    Fixed-point computation of -log(x) = log(1/x), suitable
+    for large precision. It is required that 0 < x < 1. The
+    algorithm used is the Sasaki-Kanada formula
+
+        -log(x) = pi/agm(theta2(x)^2,theta3(x)^2). [1]
+
+    For faster convergence in the theta functions, x should
+    be chosen closer to 0.
+
+    Guard bits must be added by the caller.
+
+    HYPOTHESIS: if x = 2^(-n), n bits need to be added to
+    account for the truncation to a fixed-point number,
+    and this is the only significant cancellation error.
+
+    The number of bits lost to roundoff is small and can be
+    considered constant.
+
+    [1] Richard P. Brent, "Fast Algorithms for High-Precision
+        Computation of Elementary Functions (extended abstract)",
+        http://wwwmaths.anu.edu.au/~brent/pd/RNC7-Brent.pdf
+
+    """
+    # x is an integer scaled by 2**prec; x2 is x**2 in the same scale
+    x2 = (x*x) >> prec
+    # Compute jtheta2(x)**2
+    # Sum the series term by term until a term underflows to zero
+    s = a = b = x2
+    while a:
+        b = (b*x2) >> prec
+        a = (a*b) >> prec
+        s += a
+    s += (MPZ_ONE<<prec)
+    # Square the sum (the reduced shift also multiplies by 4), then
+    # multiply by sqrt(x)
+    s = (s*s)>>(prec-2)
+    s = (s*isqrt_fast(x<<prec))>>prec
+    # Compute jtheta3(x)**2
+    t = a = b = x
+    while a:
+        b = (b*x2) >> prec
+        a = (a*b) >> prec
+        t += a
+    # 1 + 2*(series), then square
+    t = (MPZ_ONE<<prec) + (t<<1)
+    t = (t*t)>>prec
+    # Final formula
+    p = agm_fixed(s, t, prec)
+    return (pi_fixed(prec) << prec) // p
+
+def log_taylor(x, prec, r=0):
+    """
+    Fixed-point calculation of log(x). It is assumed that x is close
+    enough to 1 for the Taylor series to converge quickly. Convergence
+    can be improved by specifying r > 0 to compute
+    log(x^(1/2^r))*2^r, at the cost of performing r square roots.
+
+    The caller must provide sufficient guard bits.
+    """
+    # Take r successive fixed-point square roots of x
+    for i in xrange(r):
+        x = isqrt_fast(x<<prec)
+    one = MPZ_ONE << prec
+    # v = (x-1)/(x+1); the sum below is v + v^3/3 + v^5/5 + ...,
+    # evaluated on |v| with the sign restored at the end
+    v = ((x-one)<<prec)//(x+one)
+    sign = v < 0
+    if sign:
+        v = -v
+    v2 = (v*v) >> prec
+    v4 = (v2*v2) >> prec
+    # The odd powers are split into two interleaved sums advanced by
+    # v^4 per step: s0 takes v, v^5/5, v^9/9, ... and s1 takes
+    # v/3, v^5/7, ... which is multiplied by v^2 after the loop.
+    s0 = v
+    s1 = v//3
+    v = (v*v4) >> prec
+    k = 5
+    while v:
+        s0 += v // k
+        k += 2
+        s1 += v // k
+        v = (v*v4) >> prec
+        k += 2
+    s1 = (s1*v2) >> prec
+    # Factor 2 of the series and factor 2^r undoing the square roots
+    s = (s0+s1) << (1+r)
+    if sign:
+        return -s
+    return s
+
+def log_taylor_cached(x, prec):
+    """
+    Fixed-point computation of log(x), assuming x in (0.5, 2)
+    and prec <= LOG_TAYLOR_PREC.
+    """
+    # n: the leading bits of x (x with all but LOG_TAYLOR_SHIFT
+    # fractional bits dropped); it selects the cached point a
+    n = x >> (prec-LOG_TAYLOR_SHIFT)
+    # Precision at which cache entries for this prec are stored
+    cached_prec = cache_prec_steps[prec]
+    dprec = cached_prec - prec
+    if (n, cached_prec) in log_taylor_cache:
+        a, log_a = log_taylor_cache[n, cached_prec]
+    else:
+        # Compute and store (a, log(a)) at the cache precision
+        a = n << (cached_prec - LOG_TAYLOR_SHIFT)
+        log_a = log_taylor(a, cached_prec, 8)
+        log_taylor_cache[n, cached_prec] = (a, log_a)
+    # Bring the cached values down to the requested precision
+    a >>= dprec
+    log_a >>= dprec
+    # u = (x-a)/a and v = u/(2+u) = (x-a)/(x+a); then
+    # log(x) = log(a) + 2*(v + v^3/3 + v^5/5 + ...)
+    u = ((x - a) << prec) // a
+    v = (u << prec) // ((MPZ_TWO << prec) + u)
+    v2 = (v*v) >> prec
+    v4 = (v2*v2) >> prec
+    # Two interleaved partial sums, each advanced by v^4 per step;
+    # s1 is multiplied by v^2 after the loop
+    s0 = v
+    s1 = v//3
+    v = (v*v4) >> prec
+    k = 5
+    while v:
+        s0 += v//k
+        k += 2
+        s1 += v//k
+        v = (v*v4) >> prec
+        k += 2
+    s1 = (s1*v2) >> prec
+    s = (s0+s1) << 1
+    return log_a + s
+
+def mpf_log(x, prec, rnd=round_fast):
+    """
+    Compute the natural logarithm of the mpf value x. If x is negative,
+    ComplexResult is raised.
+    """
+    sign, man, exp, bc = x
+    #------------------------------------------------------------------
+    # Handle special values
+    if not man:
+        if x == fzero: return fninf
+        if x == finf: return finf
+        if x == fnan: return fnan
+    if sign:
+        raise ComplexResult("logarithm of a negative number")
+    # Working precision: 20 guard bits, possibly increased below
+    wp = prec + 20
+    #------------------------------------------------------------------
+    # Handle log(2^n) = n*log(2).
+    # Here we catch the only possible exact value, log(1) = 0
+    if man == 1:
+        if not exp:
+            return fzero
+        return from_man_exp(exp*ln2_fixed(wp), -wp, prec, rnd)
+    mag = exp+bc
+    abs_mag = abs(mag)
+    #------------------------------------------------------------------
+    # Handle x = 1+eps, where log(x) ~ x-1. We need to check for
+    # cancellation when moving to fixed-point math and compensate
+    # by increasing the precision. Note that abs_mag in (0, 1) <=>
+    # 0.5 < x < 2 and x != 1
+    if abs_mag <= 1:
+        # Calculate t = x-1 to measure distance from 1 in bits
+        # tsign is 1 when mag == 0 (x < 1) and 0 when mag == 1 (x > 1)
+        tsign = 1-abs_mag
+        if tsign:
+            tman = (MPZ_ONE<<bc) - man
+        else:
+            tman = man - (MPZ_ONE<<(bc-1))
+        tbc = bitcount(tman)
+        cancellation = bc - tbc
+        if cancellation > wp:
+            # x is so close to 1 that the result is taken to be t = x-1,
+            # passed through mpf_perturb for rounding
+            t = normalize(tsign, tman, abs_mag-bc, tbc, tbc, 'n')
+            return mpf_perturb(t, tsign, prec, rnd)
+        else:
+            wp += cancellation
+        # TODO: if close enough to 1, we could use Taylor series
+        # even in the AGM precision range, since the Taylor series
+        # converges rapidly
+    #------------------------------------------------------------------
+    # Another special case:
+    # n*log(2) is a good enough approximation
+    if abs_mag > 10000:
+        if bitcount(abs_mag) > wp:
+            return from_man_exp(exp*ln2_fixed(wp), -wp, prec, rnd)
+    #------------------------------------------------------------------
+    # General case.
+    # Perform argument reduction using log(x) = log(x*2^n) - n*log(2):
+    # If we are in the Taylor precision range, choose magnitude 0 or 1.
+    # If we are in the AGM precision range, choose magnitude -m for
+    # some large m; benchmarking on one machine showed m = prec/20 to be
+    # optimal between 1000 and 100,000 digits.
+    if wp <= LOG_TAYLOR_PREC:
+        # Log of the mantissa scaled to magnitude 0, plus mag*log(2)
+        m = log_taylor_cached(lshift(man, wp-bc), wp)
+        if mag:
+            m += mag*ln2_fixed(wp)
+    else:
+        optimal_mag = -wp//LOG_AGM_MAG_PREC_RATIO
+        n = optimal_mag - mag
+        x = mpf_shift(x, n)
+        # Extra bits to compensate for the small shifted argument
+        wp += (-optimal_mag)
+        # log_agm returns -log of its argument; undo the shift by n
+        m = -log_agm(to_fixed(x, wp), wp)
+        m -= n*ln2_fixed(wp)
+    return from_man_exp(m, -wp, prec, rnd)
+
+def mpf_log_hypot(a, b, prec, rnd):
+    """
+    Computes log(sqrt(a^2+b^2)) accurately.
+    """
+    # Index 1 of an mpf tuple is the mantissa; a zero mantissa marks
+    # inf/nan/0.
+    # If either a or b is inf/nan/0, assume it to be a
+    if not b[1]:
+        a, b = b, a
+    # a is inf/nan/0
+    if not a[1]:
+        # both are inf/nan/0
+        if not b[1]:
+            if a == b == fzero:
+                return fninf
+            if fnan in (a, b):
+                return fnan
+            # at least one term is (+/- inf)^2
+            return finf
+        # only a is inf/nan/0
+        if a == fzero:
+            # log(sqrt(0+b^2)) = log(|b|)
+            return mpf_log(mpf_abs(b), prec, rnd)
+        if a == fnan:
+            return fnan
+        return finf
+    # Exact
+    a2 = mpf_mul(a,a)
+    b2 = mpf_mul(b,b)
+    extra = 20
+    # Not exact
+    h2 = mpf_add(a2, b2, prec+extra)
+    # Measure how close h2 is to 1 (where the logarithm cancels) by
+    # computing h2 - 1 at low precision; indices 2 and 3 are the
+    # exponent and bit count, so their sum is the magnitude.
+    cancelled = mpf_add(h2, fnone, 10)
+    mag_cancelled = cancelled[2]+cancelled[3]
+    # Just redo the sum exactly if necessary (could be smarter
+    # and avoid memory allocation when a or b is precisely 1
+    # and the other is tiny...)
+    if cancelled == fzero or mag_cancelled < -extra//2:
+        h2 = mpf_add(a2, b2, prec+extra-min(a2[2],b2[2]))
+    # log(sqrt(h2)) = log(h2)/2
+    return mpf_shift(mpf_log(h2, prec, rnd), -1)
+
+
+#----------------------------------------------------------------------
+# Inverse tangent
+#
+
+def atan_newton(x, prec):
+    # Fixed-point atan(x) by Newton iteration on tan(r) = x, where x is
+    # an integer scaled by 2**prec. Returns the result in that scale.
+    #
+    # Starting value from double-precision math.atan; for prec >= 100
+    # only the leading bits of x are converted to a float.
+    if prec >= 100:
+        r = math.atan(int((x>>(prec-53)))/2.0**53)
+    else:
+        r = math.atan(int(x)/2.0**prec)
+    prevp = 50
+    # Convert the float estimate to a fixed-point integer with prevp bits
+    r = MPZ(int(r * 2.0**53) >> (53-prevp))
+    extra_p = 50
+    for wp in giant_steps(prevp, prec):
+        # Each step works with extra_p guard bits
+        wp += extra_p
+        r = r << (wp-prevp)
+        cos, sin = cos_sin_fixed(r, wp)
+        tan = (sin << wp) // cos
+        # Newton correction: (tan(r) - x) / (1 + tan(r)**2)
+        a = ((tan-rshift(x, prec-wp)) << wp) // ((MPZ_ONE<<wp) + ((tan**2)>>wp))
+        r = r - a
+        prevp = wp
+    # Rescale from the last working precision to prec
+    return rshift(r, prevp-prec)
+
+def atan_taylor_get_cached(n, prec):
+    # Returns the pair (a, atan(a)) for table index n as fixed-point
+    # integers with prec bits, computing and caching it on first use.
+    #
+    # Taylor series with caching wins up to huge precisions
+    # To avoid unnecessary precomputation at low precision, we
+    # do it in steps
+    # Round to next power of 2
+    # (plus 20 bits)
+    prec2 = (1<<(bitcount(prec-1))) + 20
+    dprec = prec2 - prec
+    if (n, prec2) in atan_taylor_cache:
+        a, atan_a = atan_taylor_cache[n, prec2]
+    else:
+        # a = n / 2**ATAN_TAYLOR_SHIFT, in fixed point with prec2 bits
+        a = n << (prec2 - ATAN_TAYLOR_SHIFT)
+        atan_a = atan_newton(a, prec2)
+        atan_taylor_cache[n, prec2] = (a, atan_a)
+    # Truncate the cached values to the requested precision
+    return (a >> dprec), (atan_a >> dprec)
+
+def atan_taylor(x, prec):
+    # Fixed-point atan(x) from a cached value atan(a) at a nearby point
+    # a plus a Taylor series for the remainder.
+    #
+    # n: leading bits of x, used as the cache index
+    n = (x >> (prec-ATAN_TAYLOR_SHIFT))
+    a, atan_a = atan_taylor_get_cached(n, prec)
+    d = x - a
+    # v = d / (1 + a**2 + a*d) = (x-a)/(1+a*x), so that
+    # atan(x) = atan(a) + atan(v)
+    s0 = v = (d << prec) // ((a**2 >> prec) + (a*d >> prec) + (MPZ_ONE << prec))
+    v2 = (v**2 >> prec)
+    v4 = (v2 * v2) >> prec
+    # atan(v) = v - v^3/3 + v^5/5 - ...: s0 collects the positive
+    # terms, s1 the negative ones (with a factor v^2 applied after the
+    # loop), each advanced by v^4 per step
+    s1 = v//3
+    v = (v * v4) >> prec
+    k = 5
+    while v:
+        s0 += v // k
+        k += 2
+        s1 += v // k
+        v = (v * v4) >> prec
+        k += 2
+    s1 = (s1 * v2) >> prec
+    s = s0 - s1
+    return atan_a + s
+
+def atan_inf(sign, prec, rnd):
+    """
+    Limit of atan at infinity: pi/2 when sign is zero, otherwise
+    -pi/2 (computed with the negated rounding mode, then negated).
+    """
+    if sign:
+        half_pi = mpf_shift(mpf_pi(prec, negative_rnd[rnd]), -1)
+        return mpf_neg(half_pi)
+    return mpf_shift(mpf_pi(prec, rnd), -1)
+
+def mpf_atan(x, prec, rnd=round_fast):
+    # Inverse tangent of the mpf value x, with prec bits and rounding
+    # mode rnd.
+    sign, man, exp, bc = x
+    # Special values: 0, +inf, -inf; anything else with a zero mantissa
+    # gives nan
+    if not man:
+        if x == fzero: return fzero
+        if x == finf: return atan_inf(0, prec, rnd)
+        if x == fninf: return atan_inf(1, prec, rnd)
+        return fnan
+    mag = exp + bc
+    # Essentially infinity
+    if mag > prec+20:
+        return atan_inf(sign, prec, rnd)
+    # Essentially ~ x
+    if -mag > prec+20:
+        return mpf_perturb(x, 1-sign, prec, rnd)
+    # Working precision grows with the magnitude of x in either direction
+    wp = prec + 30 + abs(mag)
+    # For large x, use atan(x) = pi/2 - atan(1/x)
+    if mag >= 2:
+        x = mpf_rdiv_int(1, x, wp)
+        reciprocal = True
+    else:
+        reciprocal = False
+    # Work on |x| in fixed point; the sign is restored at the end
+    t = to_fixed(x, wp)
+    if sign:
+        t = -t
+    if wp < ATAN_TAYLOR_PREC:
+        a = atan_taylor(t, wp)
+    else:
+        a = atan_newton(t, wp)
+    if reciprocal:
+        # pi_fixed(wp)>>1 is pi/2 in fixed point
+        a = ((pi_fixed(wp)>>1)+1) - a
+    if sign:
+        a = -a
+    return from_man_exp(a, -wp, prec, rnd)
+
+# TODO: cleanup the special cases
+def mpf_atan2(y, x, prec, rnd=round_fast):
+    # Two-argument inverse tangent of the mpf values y and x.
+    xsign, xman, xexp, xbc = x
+    ysign, yman, yexp, ybc = y
+    # y has a zero mantissa: zero, infinity or nan
+    if not yman:
+        if y == fzero and x != fnan:
+            # 0 for x >= 0, pi for x < 0
+            if mpf_sign(x) >= 0:
+                return fzero
+            return mpf_pi(prec, rnd)
+        if y in (finf, fninf):
+            if x in (finf, fninf):
+                return fnan
+            # pi/2
+            if y == finf:
+                return mpf_shift(mpf_pi(prec, rnd), -1)
+            # -pi/2
+            return mpf_neg(mpf_shift(mpf_pi(prec, negative_rnd[rnd]), -1))
+        return fnan
+    # Negative y: compute for -y with the negated rounding mode and
+    # negate the result
+    if ysign:
+        return mpf_neg(mpf_atan2(mpf_neg(y), x, prec, negative_rnd[rnd]))
+    # x has a zero mantissa (y is a positive finite number here)
+    if not xman:
+        if x == fnan:
+            return fnan
+        if x == finf:
+            return fzero
+        if x == fninf:
+            return mpf_pi(prec, rnd)
+        if y == fzero:
+            return fzero
+        return mpf_shift(mpf_pi(prec, rnd), -1)
+    # General case: atan(y/x) with 4 guard bits, plus pi when x < 0
+    tquo = mpf_atan(mpf_div(y, x, prec+4), prec+4)
+    if xsign:
+        return mpf_add(mpf_pi(prec+4), tquo, prec, rnd)
+    else:
+        return mpf_pos(tquo, prec, rnd)
+
+def mpf_asin(x, prec, rnd=round_fast):
+    """
+    Inverse sine of the mpf value x, evaluated through the identity
+    asin(x) = 2*atan(x/(1+sqrt(1-x**2))) with 15 guard bits.
+
+    Raises ComplexResult when the magnitude of x is positive and x is
+    neither 1 nor -1.
+    """
+    sign, man, exp, bc = x
+    outside = bc+exp > 0 and x not in (fone, fnone)
+    if outside:
+        raise ComplexResult("asin(x) is real only for -1 <= x <= 1")
+    wp = prec + 15
+    x_squared = mpf_mul(x, x)
+    root = mpf_sqrt(mpf_sub(fone, x_squared, wp), wp)
+    denominator = mpf_add(fone, root, wp)
+    quotient = mpf_div(x, denominator, wp)
+    half_angle = mpf_atan(quotient, prec, rnd)
+    return mpf_shift(half_angle, 1)
+
+def mpf_acos(x, prec, rnd=round_fast):
+    """
+    Inverse cosine of the mpf value x, evaluated through the identity
+    acos(x) = 2*atan(sqrt(1-x**2)/(1+x)) with 15 guard bits.
+
+    Raises ComplexResult when the magnitude of x is positive and x is
+    neither 1 nor -1. For x == -1 the result is pi.
+    """
+    sign, man, exp, bc = x
+    if bc + exp > 0:
+        if x not in (fone, fnone):
+            raise ComplexResult("acos(x) is real only for -1 <= x <= 1")
+        if x == fnone:
+            # The identity would divide by 1+x = 0 here
+            return mpf_pi(prec, rnd)
+    wp = prec + 15
+    x_squared = mpf_mul(x, x)
+    root = mpf_sqrt(mpf_sub(fone, x_squared, wp), wp)
+    denominator = mpf_add(fone, x, wp)
+    quotient = mpf_div(root, denominator, wp)
+    half_angle = mpf_atan(quotient, prec, rnd)
+    return mpf_shift(half_angle, 1)
+
+def mpf_asinh(x, prec, rnd=round_fast):
+    # Inverse hyperbolic sine of the mpf value x.
+    wp = prec + 20
+    sign, man, exp, bc = x
+    mag = exp+bc
+    if mag < -8:
+        # Tiny x: below the working precision the result is x itself,
+        # passed through mpf_perturb for rounding; otherwise add
+        # -mag bits of working precision
+        if mag < -wp:
+            return mpf_perturb(x, 1-sign, prec, rnd)
+        wp += (-mag)
+    # asinh(x) = log(x+sqrt(x**2+1))
+    # use reflection symmetry to avoid cancellation
+    q = mpf_sqrt(mpf_add(mpf_mul(x, x), fone, wp), wp)
+    q = mpf_add(mpf_abs(x), q, wp)
+    # The formula is evaluated for |x|; for negative x the result is
+    # negated, using the negated rounding mode for the logarithm
+    if sign:
+        return mpf_neg(mpf_log(q, prec, negative_rnd[rnd]))
+    else:
+        return mpf_log(q, prec, rnd)
+
+def mpf_acosh(x, prec, rnd=round_fast):
+    """
+    Inverse hyperbolic cosine of the mpf value x, evaluated as
+    acosh(x) = log(x+sqrt(x**2-1)) with 15 guard bits.
+
+    Raises ComplexResult when x compares less than 1.
+    """
+    wp = prec + 15
+    below_one = mpf_cmp(x, fone) == -1
+    if below_one:
+        raise ComplexResult("acosh(x) is real only for x >= 1")
+    x_squared = mpf_mul(x,x)
+    root = mpf_sqrt(mpf_add(x_squared, fnone, wp), wp)
+    total = mpf_add(x, root, wp)
+    return mpf_log(total, prec, rnd)
+
+def mpf_atanh(x, prec, rnd=round_fast):
+    # atanh(x) = log((1+x)/(1-x))/2
+    sign, man, exp, bc = x
+    # Zero mantissa with a nonzero exponent field: nan is returned
+    # unchanged, any other such value raises
+    if (not man) and exp:
+        if x in (fzero, fnan):
+            return x
+        raise ComplexResult("atanh(x) is real only for -1 <= x <= 1")
+    mag = bc + exp
+    if mag > 0:
+        # x = +-1 gives +-infinity; any other value with positive
+        # magnitude raises
+        if mag == 1 and man == 1:
+            return [finf, fninf][sign]
+        raise ComplexResult("atanh(x) is real only for -1 <= x <= 1")
+    wp = prec + 15
+    if mag < -8:
+        # Tiny x: below the working precision the result is x itself,
+        # passed through mpf_perturb for rounding; otherwise add
+        # -mag bits of working precision
+        if mag < -wp:
+            return mpf_perturb(x, sign, prec, rnd)
+        wp += (-mag)
+    a = mpf_add(x, fone, wp)
+    b = mpf_sub(fone, x, wp)
+    # Halve the logarithm of the quotient
+    return mpf_shift(mpf_log(mpf_div(a, b, wp), prec, rnd), -1)
+
+def mpf_fibonacci(x, prec, rnd=round_fast):
+    # Fibonacci number for the mpf argument x.
+    sign, man, exp, bc = x
+    # Zero mantissa: -inf gives nan, every other such value is
+    # returned unchanged
+    if not man:
+        if x == fninf:
+            return fnan
+        return x
+    # F(2^n) ~= 2^(2^n)
+    size = abs(exp+bc)
+    if exp >= 0:
+        # Exact
+        # Integer argument that is small enough: use the integer
+        # routine ifib and round the result
+        if size < 10 or size <= bitcount(prec):
+            return from_int(ifib(to_int(x)), prec, rnd)
+    # Use the modified Binet formula
+    # (phi**x - cos(pi*x)/phi**x) / (2*phi - 1)
+    wp = prec + size + 20
+    a = mpf_phi(wp)
+    b = mpf_add(mpf_shift(a, 1), fnone, wp)
+    u = mpf_pow(a, x, wp)
+    v = mpf_cos_pi(x, wp)
+    v = mpf_div(v, u, wp)
+    u = mpf_sub(u, v, wp)
+    u = mpf_div(u, b, prec, rnd)
+    return u
+
+
+#-------------------------------------------------------------------------------
+# Exponential-type functions
+#-------------------------------------------------------------------------------
+
+def exponential_series(x, prec, type=0):
+    """
+    Taylor series for cosh/sinh or cos/sin.
+
+    type = 0 -- returns exp(x)  (slightly faster than cosh+sinh)
+    type = 1 -- returns (cosh(x), sinh(x))
+    type = 2 -- returns (cos(x), sin(x))
+    """
+    # Work with |x| and remember the sign
+    if x < 0:
+        x = -x
+        sign = 1
+    else:
+        sign = 0
+    # r: number of halvings of the argument, undone at the end by r
+    # squarings / double-angle steps
+    r = int(0.5*prec**0.5)
+    xmag = bitcount(x) - prec
+    r = max(0, xmag + r)
+    # Guard bits
+    extra = 10 + 2*max(r,-xmag)
+    wp = prec + extra
+    # Rescale x to wp bits while dividing it by 2**r
+    x <<= (extra - r)
+    one = MPZ_ONE << wp
+    # alt: alternating signs, i.e. the cos series rather than cosh
+    alt = (type == 2)
+    if prec < EXP_SERIES_U_CUTOFF:
+        # Sum the even-power series in two interleaved parts advanced
+        # by x^4 per step; s1 is multiplied by x^2 after the loop
+        x2 = a = (x*x) >> wp
+        x4 = (x2*x2) >> wp
+        s0 = s1 = MPZ_ZERO
+        k = 2
+        while a:
+            a //= (k-1)*k; s0 += a; k += 2
+            a //= (k-1)*k; s1 += a; k += 2
+            a = (a*x4) >> wp
+        s1 = (x2*s1) >> wp
+        if alt:
+            c = s1 - s0 + one
+        else:
+            c = s1 + s0 + one
+    else:
+        # Same idea with u interleaved partial sums; sums[i] is
+        # multiplied by the matching power of x^2 after the loop
+        u = int(0.3*prec**0.35)
+        x2 = a = (x*x) >> wp
+        xpowers = [one, x2]
+        for i in xrange(1, u):
+            xpowers.append((xpowers[-1]*x2)>>wp)
+        sums = [MPZ_ZERO] * u
+        k = 2
+        while a:
+            for i in xrange(u):
+                a //= (k-1)*k
+                if alt and k & 2: sums[i] -= a
+                else:             sums[i] += a
+                k += 2
+            a = (a*xpowers[-1]) >> wp
+        for i in xrange(1, u):
+            sums[i] = (sums[i]*xpowers[i]) >> wp
+        c = sum(sums) + one
+    if type == 0:
+        # c is cosh; sinh = sqrt(cosh^2 - 1) and exp(+-x) = cosh +- sinh
+        s = isqrt_fast(c*c - (one<<wp))
+        if sign:
+            v = c - s
+        else:
+            v = c + s
+        # Undo the r halvings by squaring r times
+        for i in xrange(r):
+            v = (v*v) >> wp
+        return v >> extra
+    else:
+        # Repeatedly apply the double-angle formula
+        # cosh(2*x) = 2*cosh(x)^2 - 1
+        # cos(2*x) = 2*cos(x)^2 - 1
+        pshift = wp-1
+        for i in xrange(r):
+            c = ((c*c) >> pshift) - one
+        # With the abs, this is the same for sinh and sin
+        s = isqrt_fast(abs((one<<wp) - c*c))
+        if sign:
+            s = -s
+        return (c>>extra), (s>>extra)
+
+def exp_basecase(x, prec):
+    """
+    Compute exp(x) as a fixed-point number. Works for any x,
+    but for speed should have |x| < 1. For an arbitrary number,
+    use exp(x) = exp(x-m*log(2)) * 2^m where m = floor(x/log(2)).
+    """
+    # Above the cutoff, delegate to exponential_series
+    if prec > EXP_COSH_CUTOFF:
+        return exponential_series(x, prec, 0)
+    # Widen the precision by r bits. x itself is not rescaled, so
+    # relative to the widened precision it represents x / 2**r; the r
+    # squarings at the end undo that division.
+    r = int(prec**0.5)
+    prec += r
+    # s0 collects the even-order terms and s1 the odd-order terms
+    # divided by x (multiplied back after the loop)
+    s0 = s1 = (MPZ_ONE << prec)
+    k = 2
+    a = x2 = (x*x) >> prec
+    while a:
+        a //= k; s0 += a; k += 1
+        a //= k; s1 += a; k += 1
+        a = (a*x2) >> prec
+    s1 = (s1*x) >> prec
+    s = s0 + s1
+    u = r
+    while r:
+        s = (s*s) >> prec
+        r -= 1
+    # Drop the extra bits again
+    return s >> u
+
+def exp_expneg_basecase(x, prec):
+    """
+    Fixed-point computation of the pair exp(x), exp(-x).
+
+    Above EXP_COSH_CUTOFF both values are formed from cosh and sinh;
+    otherwise exp(x) comes from exp_basecase and exp(-x) is its
+    fixed-point reciprocal.
+    """
+    if prec <= EXP_COSH_CUTOFF:
+        pos = exp_basecase(x, prec)
+        neg = (MPZ_ONE << (prec+prec)) // pos
+        return pos, neg
+    cosh, sinh = exponential_series(x, prec, 1)
+    return cosh+sinh, cosh-sinh
+
+def cos_sin_basecase(x, prec):
+    """
+    Compute cos(x), sin(x) as fixed-point numbers, assuming x
+    in [0, pi/2). For an arbitrary number, use x' = x - m*(pi/2)
+    where m = floor(x/(pi/2)) along with quarter-period symmetries.
+    """
+    # Above the cache precision, delegate to exponential_series
+    if prec > COS_SIN_CACHE_PREC:
+        return exponential_series(x, prec, 2)
+    # t: the leading bits of x (COS_SIN_CACHE_STEP fractional bits),
+    # used as the index of the cached point
+    precs = prec - COS_SIN_CACHE_STEP
+    t = x >> precs
+    n = int(t)
+    if n not in cos_sin_cache:
+        # Compute cos/sin of the cached point with 10 guard bits and
+        # store them at COS_SIN_CACHE_PREC bits
+        w = t<<(10+COS_SIN_CACHE_PREC-COS_SIN_CACHE_STEP)
+        cos_t, sin_t = exponential_series(w, 10+COS_SIN_CACHE_PREC, 2)
+        cos_sin_cache[n] = (cos_t>>10), (sin_t>>10)
+    cos_t, sin_t = cos_sin_cache[n]
+    # Bring the cached values down to prec bits
+    offset = COS_SIN_CACHE_PREC - prec
+    cos_t >>= offset
+    sin_t >>= offset
+    # Remainder of x after removing the cached point
+    x -= t << precs
+    # Taylor series for cos and sin of the remainder, generated
+    # together from the running term a
+    cos = MPZ_ONE << prec
+    sin = x
+    k = 2
+    a = -((x*x) >> prec)
+    while a:
+        a //= k; cos += a; k += 1; a = (a*x) >> prec
+        a //= k; sin += a; k += 1; a = -((a*x) >> prec)
+    # Angle-addition formulas combine remainder and cached point
+    return ((cos*cos_t-sin*sin_t) >> prec), ((sin*cos_t+cos*sin_t) >> prec)
+
+def mpf_exp(x, prec, rnd=round_fast):
+    # Exponential function of the mpf value x.
+    sign, man, exp, bc = x
+    if man:
+        mag = bc + exp
+        wp = prec + 14
+        # From here on man is the signed mantissa
+        if sign:
+            man = -man
+        # TODO: the best cutoff depends on both x and the precision.
+        # Integer argument at high precision: integer power of e
+        if prec > 600 and exp >= 0:
+            # Need about log2(exp(n)) ~= 1.45*mag extra precision
+            e = mpf_e(wp+int(1.45*mag))
+            return mpf_pow_int(e, man<<exp, prec, rnd)
+        # Tiny argument: the result is 1, passed through mpf_perturb
+        # for rounding
+        if mag < -wp:
+            return mpf_perturb(fone, sign, prec, rnd)
+        # |x| >= 2
+        if mag > 1:
+            # For large arguments: exp(2^mag*(1+eps)) =
+            # exp(2^mag)*exp(2^mag*eps) = exp(2^mag)*(1 + 2^mag*eps + ...)
+            # so about mag extra bits is required.
+            wpmod = wp + mag
+            offset = exp + wpmod
+            if offset >= 0:
+                t = man << offset
+            else:
+                t = man >> (-offset)
+            # Write x = n*log(2) + t and bring t back to wp bits
+            lg2 = ln2_fixed(wpmod)
+            n, t = divmod(t, lg2)
+            n = int(n)
+            t >>= mag
+        else:
+            # No reduction: convert x to fixed point with wp bits
+            offset = exp + wp
+            if offset >= 0:
+                t = man << offset
+            else:
+                t = man >> (-offset)
+            n = 0
+        man = exp_basecase(t, wp)
+        # exp(x) = exp(t) * 2**n
+        return from_man_exp(man, n-wp, prec, rnd)
+    # Zero mantissa: exp(0) = 1, exp(-inf) = 0; any other such value
+    # is returned unchanged
+    if not exp:
+        return fone
+    if x == fninf:
+        return fzero
+    return x
+
+
+def mpf_cosh_sinh(x, prec, rnd=round_fast, tanh=0):
+    """Simultaneously compute (cosh(x), sinh(x)) for real x"""
+    # When tanh is true, a single value tanh(x) is returned instead of
+    # the (cosh, sinh) pair.
+    sign, man, exp, bc = x
+    # Infinities and nan
+    if (not man) and exp:
+        if tanh:
+            if x == finf: return fone
+            if x == fninf: return fnone
+            return fnan
+        if x == finf: return (finf, finf)
+        if x == fninf: return (finf, fninf)
+        return fnan, fnan
+    mag = exp+bc
+    wp = prec+14
+    if mag < -4:
+        # Extremely close to 0, sinh(x) ~= x and cosh(x) ~= 1
+        if mag < -wp:
+            if tanh:
+                return mpf_perturb(x, 1-sign, prec, rnd)
+            cosh = mpf_perturb(fone, 0, prec, rnd)
+            sinh = mpf_perturb(x, sign, prec, rnd)
+            return cosh, sinh
+        # Fix for cancellation when computing sinh
+        wp += (-mag)
+    # Does exp(-2*x) vanish?
+    if mag > 10:
+        if 3*(1<<(mag-1)) > wp:
+            # XXX: rounding
+            # tanh is +-1 (perturbed); cosh and |sinh| are both
+            # exp(|x|)/2
+            if tanh:
+                return mpf_perturb([fone,fnone][sign], 1-sign, prec, rnd)
+            c = s = mpf_shift(mpf_exp(mpf_abs(x), prec, rnd), -1)
+            if sign:
+                s = mpf_neg(s)
+            return c, s
+    # |x| > 1
+    if mag > 1:
+        # Write |x| = n*log(2) + t, using mag extra bits for the
+        # reduction, then bring t back to wp bits
+        wpmod = wp + mag
+        offset = exp + wpmod
+        if offset >= 0:
+            t = man << offset
+        else:
+            t = man >> (-offset)
+        lg2 = ln2_fixed(wpmod)
+        n, t = divmod(t, lg2)
+        n = int(n)
+        t >>= mag
+    else:
+        # No reduction: convert |x| to fixed point with wp bits
+        offset = exp + wp
+        if offset >= 0:
+            t = man << offset
+        else:
+            t = man >> (-offset)
+        n = 0
+    # a = exp(t), b = exp(-t)
+    a, b = exp_expneg_basecase(t, wp)
+    # TODO: optimize division precision
+    # With the common factor 2**(n-1) pulled out (applied through the
+    # exponent below), exp(-|x|) contributes b * 2**(-2*n)
+    cosh = a + (b>>(2*n))
+    sinh = a - (b>>(2*n))
+    if sign:
+        sinh = -sinh
+    if tanh:
+        man = (sinh << wp) // cosh
+        return from_man_exp(man, -wp, prec, rnd)
+    else:
+        cosh = from_man_exp(cosh, n-wp-1, prec, rnd)
+        sinh = from_man_exp(sinh, n-wp-1, prec, rnd)
+        return cosh, sinh
+
+
+def mod_pi2(man, exp, mag, wp):
+    # Reduce man*2**exp to a standard interval; returns (t, n, wp): fixed-point remainder, integer quotient, updated precision
+    if mag > 0:
+        i = 0
+        while 1:  # retry with more guard bits until the remainder is not suspiciously small
+            cancellation_prec = 20 << i  # 20, 40, 80, ... extra bits per attempt
+            wpmod = wp + mag + cancellation_prec
+            pi2 = pi_fixed(wpmod-1)  # presumably pi/2 at wpmod fractional bits (pi at one bit less) -- confirm against pi_fixed
+            pi4 = pi2 >> 1
+            offset = wpmod + exp
+            if offset >= 0:
+                t = man << offset
+            else:
+                t = man >> (-offset)
+            n, y = divmod(t, pi2)
+            if y > pi4:  # measure the distance of the remainder from the nearer end of [0, pi2]
+                small = pi2 - y
+            else:
+                small = y
+            if small >> (wp+mag-10):  # accept only if that distance still has bits above position wp+mag-10
+                n = int(n)
+                t = y >> mag
+                wp = wpmod - mag
+                break
+            i += 1
+    else:
+        wp += (-mag)  # mag <= 0: no reduction needed, only raise the precision by -mag bits
+        offset = exp + wp
+        if offset >= 0:
+            t = man << offset
+        else:
+            t = man >> (-offset)
+        n = 0
+    return t, n, wp
+
+
+def mpf_cos_sin(x, prec, rnd=round_fast, which=0, pi=False):
+    """
+    Compute cos and/or sin (or tan) of x, rounded to prec bits with rounding mode rnd; which selects the result:
+    0 -- return cos(x), sin(x)
+    1 -- return cos(x)
+    2 -- return sin(x)
+    3 -- return tan(x)
+
+    if pi=True, compute for pi*x
+    """
+    sign, man, exp, bc = x
+    if not man:  # zero mantissa: x is zero when exp == 0, otherwise a special value that maps to fnan
+        if exp:
+            c, s = fnan, fnan
+        else:
+            c, s = fone, fzero
+        if which == 0: return c, s
+        if which == 1: return c
+        if which == 2: return s
+        if which == 3: return s  # tan takes the same special value as sin here (fzero or fnan)
+
+    mag = bc + exp
+    wp = prec + 10  # working precision: 10 guard bits on top of the requested precision
+
+    # Extremely small? Then cos ~ 1 and sin ~ tan ~ x, each nudged by mpf_perturb so directed rounding stays correct
+    if mag < 0:
+        if mag < -wp:
+            if pi:
+                x = mpf_mul(x, mpf_pi(wp))
+            c = mpf_perturb(fone, 1, prec, rnd)
+            s = mpf_perturb(x, 1-sign, prec, rnd)
+            if which == 0: return c, s
+            if which == 1: return c
+            if which == 2: return s
+            if which == 3: return mpf_perturb(x, sign, prec, rnd)
+    if pi:
+        if exp >= -1:  # x is an integer or half-integer, so results are exact (cases below assume a normalized, odd mantissa)
+            if exp == -1:
+                c = fzero
+                s = (fone, fnone)[bool(man & 2) ^ sign]
+            elif exp == 0:
+                c, s = (fnone, fzero)
+            else:
+                c, s = (fone, fzero)
+            if which == 0: return c, s
+            if which == 1: return c
+            if which == 2: return s
+            if which == 3: return mpf_div(s, c, prec, rnd)
+        # Subtract nearest half-integer (= mod by pi/2)
+        n = ((man >> (-exp-2)) + 1) >> 1
+        man = man - (n << (-exp-1))
+        mag2 = bitcount(man) + exp
+        wp = prec + 10 - mag2
+        offset = exp + wp
+        if offset >= 0:
+            t = man << offset
+        else:
+            t = man >> (-offset)
+        t = (t*pi_fixed(wp)) >> wp  # multiply the reduced fixed-point argument by pi
+    else:
+        t, n, wp = mod_pi2(man, exp, mag, wp)
+    c, s = cos_sin_basecase(t, wp)
+    m = n & 3  # quotient mod 4 selects how the reduced (c, s) pair is rotated
+    if   m == 1: c, s = -s, c
+    elif m == 2: c, s = -c, -s
+    elif m == 3: c, s = s, -c
+    if sign:
+        s = -s  # only the sine is negated for negative input
+    if which == 0:
+        c = from_man_exp(c, -wp, prec, rnd)
+        s = from_man_exp(s, -wp, prec, rnd)
+        return c, s
+    if which == 1:
+        return from_man_exp(c, -wp, prec, rnd)
+    if which == 2:
+        return from_man_exp(s, -wp, prec, rnd)
+    if which == 3:
+        return from_rational(s, c, prec, rnd)  # tan as the ratio of the two fixed-point integers
+
+def mpf_cos(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 1)  # which=1: cos(x) only
+def mpf_sin(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 2)  # which=2: sin(x) only
+def mpf_tan(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 3)  # which=3: tan(x)
+def mpf_cos_sin_pi(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 0, 1)  # which=0, pi=1: (cos, sin) of pi*x
+def mpf_cos_pi(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 1, 1)  # which=1, pi=1: cos of pi*x
+def mpf_sin_pi(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 2, 1)  # which=2, pi=1: sin of pi*x
+def mpf_cosh(x, prec, rnd=round_fast): return mpf_cosh_sinh(x, prec, rnd)[0]  # first element of the (cosh, sinh) pair
+def mpf_sinh(x, prec, rnd=round_fast): return mpf_cosh_sinh(x, prec, rnd)[1]  # second element of the (cosh, sinh) pair
+def mpf_tanh(x, prec, rnd=round_fast): return mpf_cosh_sinh(x, prec, rnd, tanh=1)  # tanh=1: single tanh(x) value
+
+
+# Low-overhead fixed-point versions
+
+def cos_sin_fixed(x, prec, pi2=None):
+    """Fixed-point (cos, sin) of x at prec bits; pi2 may supply the precomputed modulus (default pi_fixed(prec-1))."""
+    modulus = pi_fixed(prec-1) if pi2 is None else pi2
+    quotient, rem = divmod(x, modulus)
+    cos_r, sin_r = cos_sin_basecase(rem, prec)
+    # Rotate the reduced result according to the quotient modulo 4.
+    quadrant = int(quotient) & 3
+    if quadrant == 0: return cos_r, sin_r
+    if quadrant == 1: return -sin_r, cos_r
+    if quadrant == 2: return -cos_r, -sin_r
+    return sin_r, -cos_r
+
+def exp_fixed(x, prec, ln2=None):
+    """Fixed-point exp(x) at prec bits; ln2 may supply the precomputed modulus (default ln2_fixed(prec))."""
+    modulus = ln2_fixed(prec) if ln2 is None else ln2
+    # Split x = shift*modulus + rem, evaluate the base case on rem, then scale by 2**shift.
+    quotient, rem = divmod(x, modulus)
+    shift = int(quotient)
+    value = exp_basecase(rem, prec)
+    if shift < 0:
+        return value >> (-shift)
+    return value << shift
+
+
+if BACKEND == 'sage':  # under the Sage backend, rebind selected module-level functions to sage.libs.mpmath.ext_libmp versions
+    try:
+        import sage.libs.mpmath.ext_libmp as _lbmp
+        mpf_sqrt = _lbmp.mpf_sqrt
+        mpf_exp = _lbmp.mpf_exp
+        mpf_log = _lbmp.mpf_log
+        mpf_cos = _lbmp.mpf_cos
+        mpf_sin = _lbmp.mpf_sin
+        mpf_pow = _lbmp.mpf_pow
+        exp_fixed = _lbmp.exp_fixed
+        cos_sin_fixed = _lbmp.cos_sin_fixed
+        log_int_fixed = _lbmp.log_int_fixed
+    except (ImportError, AttributeError):  # the first failure stops the rebinding; names assigned before it stay rebound
+        print("Warning: Sage imports in libelefun failed")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libhyper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libhyper.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f52d59710be77819066aea5c1cf4b0883f72d7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libhyper.py
@@ -0,0 +1,1150 @@
+"""
+This module implements computation of hypergeometric and related
+functions. In particular, it provides code for generic summation
+of hypergeometric series. Optimized versions for various special
+cases are also provided.
+"""
+
+import operator
+import math
+
+from .backend import MPZ_ZERO, MPZ_ONE, BACKEND, xrange, exec_
+
+from .libintmath import gcd
+
+from .libmpf import (\
+    ComplexResult, round_fast, round_nearest,
+    negative_rnd, bitcount, to_fixed, from_man_exp, from_int, to_int,
+    from_rational,
+    fzero, fone, fnone, ftwo, finf, fninf, fnan,
+    mpf_sign, mpf_add, mpf_abs, mpf_pos,
+    mpf_cmp, mpf_lt, mpf_le, mpf_gt, mpf_min_max,
+    mpf_perturb, mpf_neg, mpf_shift, mpf_sub, mpf_mul, mpf_div,
+    sqrt_fixed, mpf_sqrt, mpf_rdiv_int, mpf_pow_int,
+    to_rational,
+)
+
+from .libelefun import (\
+    mpf_pi, mpf_exp, mpf_log, pi_fixed, mpf_cos_sin, mpf_cos, mpf_sin,
+    mpf_sqrt, agm_fixed,
+)
+
+from .libmpc import (\
+    mpc_one, mpc_sub, mpc_mul_mpf, mpc_mul, mpc_neg, complex_int_pow,
+    mpc_div, mpc_add_mpf, mpc_sub_mpf,
+    mpc_log, mpc_add, mpc_pos, mpc_shift,
+    mpc_is_infnan, mpc_zero, mpc_sqrt, mpc_abs,
+    mpc_mpf_div, mpc_square, mpc_exp
+)
+
+from .libintmath import ifac
+from .gammazeta import mpf_gamma_int, mpf_euler, euler_fixed
+
+class NoConvergence(Exception):
+    """Raised when a series in this module fails to converge within its allowed number of terms."""
+
+
+#-----------------------------------------------------------------------#
+#                                                                       #
+#                     Generic hypergeometric series                     #
+#                                                                       #
+#-----------------------------------------------------------------------#
+
+"""
+TODO:
+
+1. proper mpq parsing
+2. imaginary z special-cased (also: rational, integer?)
+3. more clever handling of series that don't converge because of stupid
+   upwards rounding
+4. checking for cancellation
+
+"""
+
+def make_hyp_summator(key):
+    """
+    Returns (source, function): the generated Python source text and the compiled function that sums a generalized
+    hypergeometric series for given parameter types (integer 'Z', rational 'Q', real 'R', complex 'C').
+    key is the tuple (p, q, param_types, ztype); the first p entries of param_types are the numerator ("A") parameters.
+    """
+    p, q, param_types, ztype = key
+
+    pstring = "".join(param_types)
+    fname = "hypsum_%i_%i_%s_%s_%s" % (p, q, pstring[:p], pstring[p:], ztype)
+    #print "generating hypsum", fname
+
+    have_complex_param = 'C' in param_types
+    have_complex_arg = ztype == 'C'
+    have_complex = have_complex_param or have_complex_arg
+
+    source = []  # lines of the generated function body, appended through add()
+    add = source.append
+
+    aint = []
+    arat = []
+    bint = []
+    brat = []
+    areal = []
+    breal = []
+    acomplex = []
+    bcomplex = []
+
+    #add("wp = prec + 40")
+    add("MAX = kwargs.get('maxterms', wp*100)")
+    add("HIGH = MPZ_ONE<<epsshift")
+    add("LOW = -HIGH")
+
+    # Setup code: the running sum (SRE/SIM) and the running term (PRE/PIM) both start at fixed-point one
+    add("SRE = PRE = one = (MPZ_ONE << wp)")
+    if have_complex:
+        add("SIM = PIM = MPZ_ZERO")
+
+    if have_complex_arg:
+        add("xsign, xm, xe, xbc = z[0]")
+        add("if xsign: xm = -xm")
+        add("ysign, ym, ye, ybc = z[1]")
+        add("if ysign: ym = -ym")
+    else:
+        add("xsign, xm, xe, xbc = z")
+        add("if xsign: xm = -xm")
+
+    add("offset = xe + wp")
+    add("if offset >= 0:")
+    add("    ZRE = xm << offset")
+    add("else:")
+    add("    ZRE = xm >> (-offset)")
+    if have_complex_arg:
+        add("offset = ye + wp")
+        add("if offset >= 0:")
+        add("    ZIM = ym << offset")
+        add("else:")
+        add("    ZIM = ym >> (-offset)")
+
+    for i, flag in enumerate(param_types):  # emit one unpacking stanza per parameter, keyed by its type flag
+        W = ["A", "B"][i >= p]  # indices below p name "A" variables, the rest name "B" variables
+        if flag == 'Z':
+            ([aint,bint][i >= p]).append(i)
+            add("%sINT_%i = coeffs[%i]" % (W, i, i))
+        elif flag == 'Q':
+            ([arat,brat][i >= p]).append(i)
+            add("%sP_%i, %sQ_%i = coeffs[%i]._mpq_" % (W, i, W, i, i))
+        elif flag == 'R':
+            ([areal,breal][i >= p]).append(i)
+            add("xsign, xm, xe, xbc = coeffs[%i]._mpf_" % i)
+            add("if xsign: xm = -xm")
+            add("offset = xe + wp")
+            add("if offset >= 0:")
+            add("    %sREAL_%i = xm << offset" % (W, i))
+            add("else:")
+            add("    %sREAL_%i = xm >> (-offset)" % (W, i))
+        elif flag == 'C':
+            ([acomplex,bcomplex][i >= p]).append(i)
+            add("__re, __im = coeffs[%i]._mpc_" % i)
+            add("xsign, xm, xe, xbc = __re")
+            add("if xsign: xm = -xm")
+            add("ysign, ym, ye, ybc = __im")
+            add("if ysign: ym = -ym")
+
+            add("offset = xe + wp")
+            add("if offset >= 0:")
+            add("    %sCRE_%i = xm << offset" % (W, i))
+            add("else:")
+            add("    %sCRE_%i = xm >> (-offset)" % (W, i))
+            add("offset = ye + wp")
+            add("if offset >= 0:")
+            add("    %sCIM_%i = ym << offset" % (W, i))
+            add("else:")
+            add("    %sCIM_%i = ym >> (-offset)" % (W, i))
+        else:
+            raise ValueError
+
+    l_areal = len(areal)
+    l_breal = len(breal)
+    cancellable_real = min(l_areal, l_breal)  # real A/B parameters paired up so one multiply-divide handles both
+    noncancellable_real_num = areal[cancellable_real:]
+    noncancellable_real_den = breal[cancellable_real:]
+
+    # LOOP: one iteration per series term, with a hard upper bound of 10**8 iterations
+    add("for n in xrange(1,10**8):")
+
+    add("    if n in magnitude_check:")
+    add("        p_mag = bitcount(abs(PRE))")
+    if have_complex:
+        add("        p_mag = max(p_mag, bitcount(abs(PIM)))")
+    add("        magnitude_check[n] = wp-p_mag")
+
+    # Real factors
+    multiplier = " * ".join(["AINT_#".replace("#", str(i)) for i in aint] + \
+                            ["AP_#".replace("#", str(i)) for i in arat] + \
+                            ["BQ_#".replace("#", str(i)) for i in brat])
+
+    divisor    = " * ".join(["BINT_#".replace("#", str(i)) for i in bint] + \
+                            ["BP_#".replace("#", str(i)) for i in brat] + \
+                            ["AQ_#".replace("#", str(i)) for i in arat] + ["n"])
+
+    if multiplier:
+        add("    mul = " + multiplier)
+    add("    div = " + divisor)
+
+    # Check for singular terms: a zero divisor ends the sum quietly only if the multiplier is zero as well
+    add("    if not div:")
+    if multiplier:
+        add("        if not mul:")
+        add("            break")
+    add("        raise ZeroDivisionError")
+
+    # Update product
+    if have_complex:
+
+        # TODO: when there are several real parameters and just a few complex
+        # (maybe just the complex argument), we only need to do about
+        # half as many ops if we accumulate the real factor in a single real variable
+        for k in range(cancellable_real): add("    PRE = PRE * AREAL_%i // BREAL_%i" % (areal[k], breal[k]))
+        for i in noncancellable_real_num: add("    PRE = (PRE * AREAL_#) >> wp".replace("#", str(i)))
+        for i in noncancellable_real_den: add("    PRE = (PRE << wp) // BREAL_#".replace("#", str(i)))
+        for k in range(cancellable_real): add("    PIM = PIM * AREAL_%i // BREAL_%i" % (areal[k], breal[k]))
+        for i in noncancellable_real_num: add("    PIM = (PIM * AREAL_#) >> wp".replace("#", str(i)))
+        for i in noncancellable_real_den: add("    PIM = (PIM << wp) // BREAL_#".replace("#", str(i)))
+
+        if multiplier:
+            if have_complex_arg:
+                add("    PRE, PIM = (mul*(PRE*ZRE-PIM*ZIM))//div, (mul*(PIM*ZRE+PRE*ZIM))//div")
+                add("    PRE >>= wp")
+                add("    PIM >>= wp")
+            else:
+                add("    PRE = ((mul * PRE * ZRE) >> wp) // div")
+                add("    PIM = ((mul * PIM * ZRE) >> wp) // div")
+        else:
+            if have_complex_arg:
+                add("    PRE, PIM = (PRE*ZRE-PIM*ZIM)//div, (PIM*ZRE+PRE*ZIM)//div")
+                add("    PRE >>= wp")
+                add("    PIM >>= wp")
+            else:
+                add("    PRE = ((PRE * ZRE) >> wp) // div")
+                add("    PIM = ((PIM * ZRE) >> wp) // div")
+
+        for i in acomplex:
+            add("    PRE, PIM = PRE*ACRE_#-PIM*ACIM_#, PIM*ACRE_#+PRE*ACIM_#".replace("#", str(i)))
+            add("    PRE >>= wp")
+            add("    PIM >>= wp")
+
+        for i in bcomplex:
+            add("    mag = BCRE_#*BCRE_#+BCIM_#*BCIM_#".replace("#", str(i)))
+            add("    re = PRE*BCRE_# + PIM*BCIM_#".replace("#", str(i)))
+            add("    im = PIM*BCRE_# - PRE*BCIM_#".replace("#", str(i)))
+            add("    PRE = (re << wp) // mag".replace("#", str(i)))
+            add("    PIM = (im << wp) // mag".replace("#", str(i)))
+
+    else:
+        for k in range(cancellable_real): add("    PRE = PRE * AREAL_%i // BREAL_%i" % (areal[k], breal[k]))
+        for i in noncancellable_real_num: add("    PRE = (PRE * AREAL_#) >> wp".replace("#", str(i)))
+        for i in noncancellable_real_den: add("    PRE = (PRE << wp) // BREAL_#".replace("#", str(i)))
+        if multiplier:
+            add("    PRE = ((PRE * mul * ZRE) >> wp) // div")
+        else:
+            add("    PRE = ((PRE * ZRE) >> wp) // div")
+
+    # Add product to sum; the loop ends once the term lies strictly between LOW and HIGH
+    if have_complex:
+        add("    SRE += PRE")
+        add("    SIM += PIM")
+        add("    if (HIGH > PRE > LOW) and (HIGH > PIM > LOW):")
+        add("        break")
+    else:
+        add("    SRE += PRE")
+        add("    if HIGH > PRE > LOW:")
+        add("        break")
+
+    #add("    from mpmath import nprint, log, ldexp")
+    #add("    nprint([n, log(abs(PRE),2), ldexp(PRE,-wp)])")
+
+    add("    if n > MAX:")
+    add("        raise NoConvergence('Hypergeometric series converges too slowly. Try increasing maxterms.')")
+
+    # +1 all parameters for next loop
+    for i in aint:     add("    AINT_# += 1".replace("#", str(i)))
+    for i in bint:     add("    BINT_# += 1".replace("#", str(i)))
+    for i in arat:     add("    AP_# += AQ_#".replace("#", str(i)))
+    for i in brat:     add("    BP_# += BQ_#".replace("#", str(i)))
+    for i in areal:    add("    AREAL_# += one".replace("#", str(i)))
+    for i in breal:    add("    BREAL_# += one".replace("#", str(i)))
+    for i in acomplex: add("    ACRE_# += one".replace("#", str(i)))
+    for i in bcomplex: add("    BCRE_# += one".replace("#", str(i)))
+
+    if have_complex:
+        add("a = from_man_exp(SRE, -wp, prec, 'n')")
+        add("b = from_man_exp(SIM, -wp, prec, 'n')")
+
+        add("if SRE:")
+        add("    if SIM:")
+        add("        magn = max(a[2]+a[3], b[2]+b[3])")
+        add("    else:")
+        add("        magn = a[2]+a[3]")
+        add("elif SIM:")
+        add("    magn = b[2]+b[3]")
+        add("else:")
+        add("    magn = -wp+1")
+
+        add("return (a, b), True, magn")
+    else:
+        add("a = from_man_exp(SRE, -wp, prec, 'n')")
+
+        add("if SRE:")
+        add("    magn = a[2]+a[3]")
+        add("else:")
+        add("    magn = -wp+1")
+
+        add("return a, False, magn")
+
+    source = "\n".join(("    " + line) for line in source)  # indent every generated line into the function body
+    source = ("def %s(coeffs, z, prec, wp, epsshift, magnitude_check, **kwargs):\n" % fname) + source
+
+    namespace = {}
+
+    exec_(source, globals(), namespace)  # compile against this module's globals; the new function lands in namespace
+
+    #print source
+    return source, namespace[fname]
+
+
+if BACKEND == 'sage':
+
+    def make_hyp_summator(key):
+        """
+        Sage-backend replacement: returns ("(none)", function), where the function forwards its arguments,
+        together with the parameter types unpacked from key, to sage.libs.mpmath.ext_main.hypsum_internal.
+        """
+        from sage.libs.mpmath.ext_main import hypsum_internal
+        p, q, param_types, ztype = key
+        def _hypsum(coeffs, z, prec, wp, epsshift, magnitude_check, **kwargs):
+            return hypsum_internal(p, q, param_types, ztype, coeffs, z,
+                prec, wp, epsshift, magnitude_check, kwargs)
+
+        return "(none)", _hypsum
+
+
+#-----------------------------------------------------------------------#
+#                                                                       #
+#                              Error functions                          #
+#                                                                       #
+#-----------------------------------------------------------------------#
+
+# TODO: mpf_erf should call mpf_erfc when appropriate (currently
+#    only the converse delegation is implemented)
+
+def mpf_erf(x, prec, rnd=round_fast):  # erf(x) for an mpf tuple x, rounded to prec bits with rounding mode rnd
+    sign, man, exp, bc = x
+    if not man:  # zero mantissa: zero, +inf and -inf get explicit results, anything else yields fnan
+        if x == fzero: return fzero
+        if x == finf: return fone
+        if x== fninf: return fnone
+        return fnan
+    size = exp + bc
+    lg = math.log
+    # The approximation erf(x) = 1 is accurate to > x^2 * log(e,2) bits
+    if size > 3 and 2*(size-1) + 0.528766 > lg(prec,2):  # 0.528766 is presumably log2(log2(e)) -- confirm
+        if sign:
+            return mpf_perturb(fnone, 0, prec, rnd)
+        else:
+            return mpf_perturb(fone, 1, prec, rnd)
+    # erf(x) ~ 2*x/sqrt(pi) close to 0
+    if size < -prec:
+        # 2*x
+        x = mpf_shift(x,1)
+        c = mpf_sqrt(mpf_pi(prec+20), prec+20)
+        # TODO: interval rounding
+        return mpf_div(x, c, prec, rnd)
+    wp = prec + abs(size) + 25  # working precision grows with the distance of x's magnitude from zero
+    # Taylor series for erf, fixed-point summation
+    t = abs(to_fixed(x, wp))
+    t2 = (t*t) >> wp
+    s, term, k = t, 12345, 1  # 12345 is only a nonzero placeholder so the loop body runs at least once
+    while term:
+        t = ((t * t2) >> wp) // k
+        term = t // (2*k+1)
+        if k & 1:  # alternating series: odd k subtracts, even k adds
+            s -= term
+        else:
+            s += term
+        k += 1
+    s = (s << (wp+1)) // sqrt_fixed(pi_fixed(wp), wp)  # the wp+1 shift folds in the factor 2 of 2/sqrt(pi)
+    if sign:
+        s = -s
+    return from_man_exp(s, -wp, prec, rnd)
+
+# If possible, we use the asymptotic series for erfc.
+# This is an alternating divergent asymptotic series, so
+# the error is at most equal to the first omitted term.
+# Here we check if the smallest term is small enough
+# for a given x and precision
+def erfc_check_series(x, prec):
+    """Return True when to_int(x)**2 * 1.44 exceeds prec, i.e. the asymptotic erfc series is deemed usable."""
+    # The comparison already yields the boolean that is returned.
+    truncated = to_int(x)
+    return truncated**2 * 1.44 > prec
+
+def mpf_erfc(x, prec, rnd=round_fast):  # erfc(x) for an mpf tuple x, rounded to prec bits with rounding mode rnd
+    sign, man, exp, bc = x
+    if not man:  # zero mantissa: zero, +inf and -inf get explicit results, anything else yields fnan
+        if x == fzero: return fone
+        if x == finf: return fzero
+        if x == fninf: return ftwo
+        return fnan
+    wp = prec + 20
+    mag = bc+exp
+    # Preserve full accuracy when exponent grows huge
+    wp += max(0, 2*mag)
+    regular_erf = sign or mag < 2  # negative or small-magnitude x: compute directly as 1 - erf(x)
+    if regular_erf or not erfc_check_series(x, wp):
+        if regular_erf:
+            return mpf_sub(fone, mpf_erf(x, prec+10, negative_rnd[rnd]), prec, rnd)
+        # 1-erf(x) ~ exp(-x^2), increase prec to deal with cancellation
+        n = to_int(x)+1
+        return mpf_sub(fone, mpf_erf(x, prec + int(n**2*1.44) + 10), prec, rnd)
+    s = term = MPZ_ONE << wp  # asymptotic series: sum and term both start at fixed-point one
+    term_prev = 0
+    t = (2 * to_fixed(x, wp) ** 2) >> wp  # fixed-point 2*x**2, the divisor applied to every term
+    k = 1
+    while 1:
+        term = ((term * (2*k - 1)) << wp) // t
+        if k > 4 and term > term_prev or not term:  # parsed as (k > 4 and term > term_prev) or (not term)
+            break
+        if k & 1:  # alternating series: odd k subtracts, even k adds
+            s -= term
+        else:
+            s += term
+        term_prev = term
+        #print k, to_str(from_man_exp(term, -wp, 50), 10)
+        k += 1
+    s = (s << wp) // sqrt_fixed(pi_fixed(wp), wp)  # divide the series sum by sqrt(pi)
+    s = from_man_exp(s, -wp, wp)
+    z = mpf_exp(mpf_neg(mpf_mul(x,x,wp),wp),wp)  # exp(-x**2) at working precision
+    y = mpf_div(mpf_mul(z, s, wp), x, prec, rnd)
+    return y
+
+
+#-----------------------------------------------------------------------#
+#                                                                       #
+#                         Exponential integrals                         #
+#                                                                       #
+#-----------------------------------------------------------------------#
+
+def ei_taylor(x, prec):
+    # Fixed-point sum of x**k/(k*k!) for k >= 1 at prec fractional bits; stops when the term truncates to zero.
+    total = power = x
+    k = 2
+    while power:
+        power = ((power*x) >> prec) // k
+        total, k = total + power // k, k + 1
+    return total
+
+def complex_ei_taylor(zre, zim, prec):
+    # Fixed-point sum of z**k/(k*k!) for k >= 1 with z = zre + i*zim; stops once |re| + |im| of the term is <= 5.
+    term_re, term_im = zre, zim
+    sum_re, sum_im = zre, zim
+    k = 2
+    while abs(term_re) + abs(term_im) > 5:
+        prod_re = term_re*zre - term_im*zim
+        prod_im = term_re*zim + term_im*zre
+        term_re, term_im = (prod_re//k) >> prec, (prod_im//k) >> prec
+        sum_re, sum_im, k = sum_re + term_re//k, sum_im + term_im//k, k + 1
+    return sum_re, sum_im
+
+def ei_asymptotic(x, prec):
+    # Fixed-point sum of k!/x**k for k >= 0 at prec fractional bits, accumulated until the term truncates to zero.
+    unit = MPZ_ONE << prec
+    recip = term = (unit << prec) // x
+    total = unit + recip
+    k = 2
+    while term:
+        term = (k*term*recip) >> prec
+        total, k = total + term, k + 1
+    return total
+
+def complex_ei_asymptotic(zre, zim, prec):
+    # Complex fixed-point sum of k!/z**k for k >= 0 with z = zre + i*zim, returned as a (real, imag) integer pair.
+    unit = MPZ_ONE << prec
+    norm_sq = (zre*zre + zim*zim) >> prec
+    # Reciprocal 1/z = conj(z)/|z|**2; it doubles as the first-order term of the series.
+    inv_re = (zre << prec) // norm_sq
+    inv_im = ((-zim) << prec) // norm_sq
+    term_re, term_im = inv_re, inv_im
+    sum_re, sum_im = unit + inv_re, inv_im
+    k = 2
+    # Keep adding terms while |re| + |im| of the term exceeds 1000; give up once k passes prec.
+    while abs(term_re) + abs(term_im) > 1000:
+        next_re = ((term_re*inv_re - term_im*inv_im)*k) >> prec
+        next_im = ((term_re*inv_im + term_im*inv_re)*k) >> prec
+        term_re, term_im = next_re, next_im
+        sum_re, sum_im, k = sum_re + term_re, sum_im + term_im, k + 1
+        if k > prec:
+            raise NoConvergence
+    return sum_re, sum_im
+
+def mpf_ei(x, prec, rnd=round_fast, e1=False):  # Ei(x) for real x; with e1=True the argument is negated on entry and the result on exit
+    if e1:
+        x = mpf_neg(x)
+    sign, man, exp, bc = x
+    if e1 and not sign:  # after negation, a clear sign bit means the caller's x was not positive
+        if x == fzero:
+            return finf
+        raise ComplexResult("E1(x) for x < 0")
+    if man:
+        xabs = 0, man, exp, bc  # |x| rebuilt as a tuple with the sign field cleared
+        xmag = exp+bc
+        wp = prec + 20
+        can_use_asymp = xmag > wp
+        if not can_use_asymp:
+            if exp >= 0:
+                xabsint = man << exp
+            else:
+                xabsint = man >> (-exp)
+            can_use_asymp = xabsint > int(wp*0.693) + 10  # 0.693 is presumably ln(2) -- confirm
+        if can_use_asymp:
+            if xmag > wp:
+                v = fone  # magnitude beyond the working precision: the series factor is taken as exactly one
+            else:
+                v = from_man_exp(ei_asymptotic(to_fixed(x, wp), wp), -wp)
+            v = mpf_mul(v, mpf_exp(x, wp), wp)
+            v = mpf_div(v, x, prec, rnd)
+        else:
+            wp += 2*int(to_int(xabs))  # extra working bits proportional to the integer part of |x|
+            u = to_fixed(x, wp)
+            v = ei_taylor(u, wp) + euler_fixed(wp)
+            t1 = from_man_exp(v,-wp)
+            t2 = mpf_log(xabs,wp)
+            v = mpf_add(t1, t2, prec, rnd)  # Taylor sum plus the Euler constant, plus log|x|
+    else:
+        if x == fzero: v = fninf
+        elif x == finf: v = finf
+        elif x == fninf: v = fzero
+        else: v = fnan
+    if e1:
+        v = mpf_neg(v)
+    return v
+
+def mpc_ei(z, prec, rnd=round_fast, e1=False):  # Ei(z) for complex z = (a, b); with e1=True z is negated on entry and the result on exit
+    if e1:
+        z = mpc_neg(z)
+    a, b = z
+    asign, aman, aexp, abc = a
+    bsign, bman, bexp, bbc = b
+    if b == fzero:  # purely real argument: delegate to mpf_ei
+        if e1:
+            x = mpf_neg(mpf_ei(a, prec, rnd))
+            if not asign:
+                y = mpf_neg(mpf_pi(prec, rnd))  # imaginary part -pi when the (negated) real part has a clear sign bit
+            else:
+                y = fzero
+            return x, y
+        else:
+            return mpf_ei(a, prec, rnd), fzero
+    if a != fzero:
+        if not aman or not bman:  # a zero mantissa at this point is treated as non-finite and yields (fnan, fnan)
+            return (fnan, fnan)
+    wp = prec + 40
+    amag = aexp+abc
+    bmag = bexp+bbc
+    zmag = max(amag, bmag)
+    can_use_asymp = zmag > wp
+    if not can_use_asymp:
+        zabsint = abs(to_int(a)) + abs(to_int(b))
+        can_use_asymp = zabsint > int(wp*0.693) + 20  # 0.693 is presumably ln(2) -- confirm
+    try:
+        if can_use_asymp:
+            if zmag > wp:
+                v = fone, fzero  # magnitude beyond the working precision: the series factor is taken as exactly one
+            else:
+                zre = to_fixed(a, wp)
+                zim = to_fixed(b, wp)
+                vre, vim = complex_ei_asymptotic(zre, zim, wp)
+                v = from_man_exp(vre, -wp), from_man_exp(vim, -wp)
+            v = mpc_mul(v, mpc_exp(z, wp), wp)
+            v = mpc_div(v, z, wp)
+            if e1:
+                v = mpc_neg(v, prec, rnd)
+            else:
+                x, y = v
+                if bsign:  # pi is subtracted from or added to the imaginary part according to the sign of b
+                    v = mpf_pos(x, prec, rnd), mpf_sub(y, mpf_pi(wp), prec, rnd)
+                else:
+                    v = mpf_pos(x, prec, rnd), mpf_add(y, mpf_pi(wp), prec, rnd)
+            return v
+    except NoConvergence:  # asymptotic series gave up: fall through to the Taylor series below
+        pass
+    #wp += 2*max(0,zmag)
+    wp += 2*int(to_int(mpc_abs(z, 5)))  # extra working bits proportional to the integer part of |z| (estimated at 5 bits)
+    zre = to_fixed(a, wp)
+    zim = to_fixed(b, wp)
+    vre, vim = complex_ei_taylor(zre, zim, wp)
+    vre += euler_fixed(wp)
+    v = from_man_exp(vre,-wp), from_man_exp(vim,-wp)
+    if e1:
+        u = mpc_log(mpc_neg(z),wp)  # z was negated on entry, so this is the log of the caller's argument
+    else:
+        u = mpc_log(z,wp)
+    v = mpc_add(v, u, prec, rnd)
+    if e1:
+        v = mpc_neg(v)
+    return v
+
+def mpf_e1(x, prec, rnd=round_fast):
+    return mpf_ei(x, prec, rnd, e1=True)  # E1 variant: mpf_ei with its e1 flag switched on
+
+def mpc_e1(x, prec, rnd=round_fast):
+    return mpc_ei(x, prec, rnd, e1=True)  # E1 variant: mpc_ei with its e1 flag switched on
+
+def mpf_expint(n, x, prec, rnd=round_fast, gamma=False):
+    """
+    E_n(x), n an integer, x real
+
+    With gamma=True, computes Gamma(n,x)   (upper incomplete gamma function)
+
+    Returns (real, None) if real, otherwise (real, imag)
+    The imaginary part is an optional branch cut term
+
+    """
+    sign, man, exp, bc = x
+    if not man:  # zero mantissa: x is zero or a special value, handled case by case below
+        if gamma:
+            if x == fzero:
+                # Actually gamma function pole
+                if n <= 0:
+                    return finf, None
+                return mpf_gamma_int(n, prec, rnd), None
+            if x == finf:
+                return fzero, None
+            # TODO: could return finite imaginary value at -inf
+            return fnan, fnan
+        else:
+            if x == fzero:
+                if n > 1:
+                    return from_rational(1, n-1, prec, rnd), None
+                else:
+                    return finf, None
+            if x == finf:
+                return fzero, None
+            return fnan, fnan
+    n_orig = n
+    if gamma:
+        n = 1-n  # the gamma case is evaluated through the same code paths with index 1-n
+    wp = prec + 20
+    xmag = exp + bc
+    # Beware of near-poles
+    if xmag < -10:
+        raise NotImplementedError  # arguments with magnitude below -10 are not handled by this routine
+    nmag = bitcount(abs(n))
+    have_imag = n > 0 and sign  # an imaginary term is produced only for negative x with positive index
+    negx = mpf_neg(x)
+    # Skip series if direct convergence
+    if n == 0 or 2*nmag - xmag < -wp:
+        if gamma:
+            v = mpf_exp(negx, wp)
+            re = mpf_mul(v, mpf_pow_int(x, n_orig-1, wp), prec, rnd)
+        else:
+            v = mpf_exp(negx, wp)
+            re = mpf_div(v, x, prec, rnd)
+    else:
+        # Finite number of terms, or...
+        can_use_asymptotic_series = -3*wp < n <= 0
+        # ...large enough?
+        if not can_use_asymptotic_series:
+            xi = abs(to_int(x))
+            m = min(max(1, xi-n), 2*wp)
+            siz = -n*nmag + (m+n)*bitcount(abs(m+n)) - m*xmag - (144*m//100)
+            tol = -wp-10
+            can_use_asymptotic_series = siz < tol
+        if can_use_asymptotic_series:
+            r = ((-MPZ_ONE) << (wp+wp)) // to_fixed(x, wp)  # fixed-point -1/x
+            m = n
+            t = r*m
+            s = MPZ_ONE << wp
+            while m and t:  # stops when the factor m reaches zero (finite sum) or the term truncates to zero
+                s += t
+                m += 1
+                t = (m*r*t) >> wp
+            v = mpf_exp(negx, wp)
+            if gamma:
+                # ~ exp(-x) * x^(n-1) * (1 + ...)
+                v = mpf_mul(v, mpf_pow_int(x, n_orig-1, wp), wp)
+            else:
+                # ~ exp(-x)/x * (1 + ...)
+                v = mpf_div(v, x, wp)
+            re = mpf_mul(v, from_man_exp(s, -wp), prec, rnd)
+        elif n == 1:
+            re = mpf_neg(mpf_ei(negx, prec, rnd))
+        elif n > 0 and n < 3*wp:
+            T1 = mpf_neg(mpf_ei(negx, wp))
+            if gamma:
+                if n_orig & 1:
+                    T1 = mpf_neg(T1)
+            else:
+                T1 = mpf_mul(T1, mpf_pow_int(negx, n-1, wp), wp)
+            r = t = to_fixed(x, wp)
+            facs = [1] * (n-1)
+            for k in range(1,n-1):
+                facs[k] = facs[k-1] * k  # facs[k] = k!
+            facs = facs[::-1]  # reversed so that facs[k] = (n-2-k)!
+            s = facs[0] << wp
+            for k in range(1, n-1):
+                if k & 1:  # alternating signs in the finite polynomial part
+                    s -= facs[k] * t
+                else:
+                    s += facs[k] * t
+                t = (t*r) >> wp
+            T2 = from_man_exp(s, -wp, wp)
+            T2 = mpf_mul(T2, mpf_exp(negx, wp))  # no precision argument is passed to this multiplication
+            if gamma:
+                T2 = mpf_mul(T2, mpf_pow_int(x, n_orig, wp), wp)
+            R = mpf_add(T1, T2)  # no precision argument is passed to this addition
+            re = mpf_div(R, from_int(ifac(n-1)), prec, rnd)
+        else:
+            raise NotImplementedError  # reached for n >= 3*wp, or n <= -3*wp when the size test above fails
+    if have_imag:
+        M = from_int(-ifac(n-1))
+        if gamma:
+            im = mpf_div(mpf_pi(wp), M, prec, rnd)
+            if n_orig & 1:
+                im = mpf_neg(im)
+        else:
+            im = mpf_div(mpf_mul(mpf_pi(wp), mpf_pow_int(negx, n_orig-1, wp), wp), M, prec, rnd)
+        return re, im
+    else:
+        return re, None
+
+def mpf_ci_si_taylor(x, wp, which=0):
+    """
+    which=0 - Ci(x) - (euler+log(x)), summed from its Taylor series at working precision wp
+    which=1 - Si(x), summed the same way (any nonzero which takes this branch)
+    """
+    x = to_fixed(x, wp)
+    x2 = -(x*x) >> wp  # unary minus binds tighter than >>, so this is (-(x*x)) >> wp, i.e. fixed-point -x**2
+    if which == 0:
+        s, t, k = 0, (MPZ_ONE<<wp), 2  # sum starts at 0, term at fixed-point one, first coefficient index 2
+    else:
+        s, t, k = x, x, 3  # sum and term start at x, first coefficient index 3
+    while t:  # run until the term truncates to zero
+        t = (t*x2//(k*(k-1)))>>wp
+        s += t//k
+        k += 2
+    return from_man_exp(s, -wp)
+
+def mpc_ci_si_taylor(re, im, wp, which=0):
+    """Taylor sum for complex z = re + i*im: which=0 -> Ci(z) - (euler+log(z)), otherwise Si(z); returns (real, imag)."""
+    # Only designed for small -- but not too small (relative accuracy) -- arguments. Anything else raises
+    # NotImplementedError; that includes re and im both having a zero mantissa, which used to leave `mag`
+    # unbound and fail with UnboundLocalError instead.
+    mags = [part[2]+part[3] for part in (re, im) if part[1]]
+    if not mags:
+        raise NotImplementedError
+    mag = max(mags)
+    if mag > 2 or mag < -wp:
+        raise NotImplementedError
+    wp += (2-mag)
+    zre = to_fixed(re, wp)
+    zim = to_fixed(im, wp)
+    # Fixed-point -z**2: every step multiplies the running term by -z**2/(k*(k-1))
+    z2re = (zim*zim-zre*zre)>>wp
+    z2im = (-2*zre*zim)>>wp
+    if which == 0:
+        sre, sim, tre, tim, k = 0, 0, (MPZ_ONE<<wp), 0, 2
+    else:
+        sre, sim, tre, tim, k = zre, zim, zre, zim, 3
+    # Stop once both components of the term are at most 2 units in the last place
+    while max(abs(tre), abs(tim)) > 2:
+        f = k*(k-1)
+        tre, tim = ((tre*z2re-tim*z2im)//f)>>wp, ((tre*z2im+tim*z2re)//f)>>wp
+        # each series coefficient carries an extra division by k
+        sre += tre//k
+        sim += tim//k
+        k += 2
+    return from_man_exp(sre, -wp), from_man_exp(sim, -wp)
+
+def mpf_ci_si(x, prec, rnd=round_fast, which=2):
+    """
+    Calculation of Ci(x), Si(x) for real x.
+
+    which = 0 -- returns (Ci(x), -)
+    which = 1 -- returns (Si(x), -)
+    which = 2 -- returns (Ci(x), Si(x))
+
+    Note: if x < 0, Ci(x) needs an additional imaginary term, pi*i.
+    """
+    wp = prec + 20
+    sign, man, exp, bc = x
+    ci, si = None, None
+    if not man:
+        if x == fzero:
+            return (fninf, fzero)
+        if x == fnan:
+            return (x, x)
+        ci = fzero
+        if which != 0:
+            if x == finf:
+                si = mpf_shift(mpf_pi(prec, rnd), -1)
+            if x == fninf:
+                si = mpf_neg(mpf_shift(mpf_pi(prec, negative_rnd[rnd]), -1))
+        return (ci, si)
+    # For small x: Ci(x) ~ euler + log(x), Si(x) ~ x
+    mag = exp+bc
+    if mag < -wp:
+        if which != 0:
+            si = mpf_perturb(x, 1-sign, prec, rnd)
+        if which != 1:
+            y = mpf_euler(wp)
+            xabs = mpf_abs(x)
+            ci = mpf_add(y, mpf_log(xabs, wp), prec, rnd)
+        return ci, si
+    # For huge x: Ci(x) ~ sin(x)/x, Si(x) ~ pi/2
+    elif mag > wp:
+        if which != 0:
+            if sign:
+                si = mpf_neg(mpf_pi(prec, negative_rnd[rnd]))
+            else:
+                si = mpf_pi(prec, rnd)
+            si = mpf_shift(si, -1)
+        if which != 1:
+            ci = mpf_div(mpf_sin(x, wp), x, prec, rnd)
+        return ci, si
+    else:
+        wp += abs(mag)
+    # Use an asymptotic series? The smallest value of n!/x^n
+    # occurs for n ~ x, where the magnitude is ~ exp(-x).
+    asymptotic = mag-1 > math.log(wp, 2)
+    # Case 1: convergent series near 0
+    if not asymptotic:
+        if which != 0:
+            si = mpf_pos(mpf_ci_si_taylor(x, wp, 1), prec, rnd)
+        if which != 1:
+            ci = mpf_ci_si_taylor(x, wp, 0)
+            ci = mpf_add(ci, mpf_euler(wp), wp)
+            ci = mpf_add(ci, mpf_log(mpf_abs(x), wp), prec, rnd)
+        return ci, si
+    x = mpf_abs(x)
+    # Case 2: asymptotic series for x >> 1
+    xf = to_fixed(x, wp)
+    xr = (MPZ_ONE<<(2*wp)) // xf   # 1/x
+    s1 = (MPZ_ONE << wp)
+    s2 = xr
+    t = xr
+    k = 2
+    while t:
+        t = -t
+        t = (t*xr*k)>>wp
+        k += 1
+        s1 += t
+        t = (t*xr*k)>>wp
+        k += 1
+        s2 += t
+    s1 = from_man_exp(s1, -wp)
+    s2 = from_man_exp(s2, -wp)
+    s1 = mpf_div(s1, x, wp)
+    s2 = mpf_div(s2, x, wp)
+    cos, sin = mpf_cos_sin(x, wp)
+    # Ci(x) = sin(x)*s1-cos(x)*s2
+    # Si(x) = pi/2-cos(x)*s1-sin(x)*s2
+    if which != 0:
+        si = mpf_add(mpf_mul(cos, s1), mpf_mul(sin, s2), wp)
+        si = mpf_sub(mpf_shift(mpf_pi(wp), -1), si, wp)
+        if sign:
+            si = mpf_neg(si)
+        si = mpf_pos(si, prec, rnd)
+    if which != 1:
+        ci = mpf_sub(mpf_mul(sin, s1), mpf_mul(cos, s2), prec, rnd)
+    return ci, si
+
+def mpf_ci(x, prec, rnd=round_fast):  # real-valued Ci(x); rejects x < 0
+    if mpf_sign(x) < 0:  # Ci of a negative real carries an imaginary term
+        raise ComplexResult
+    return mpf_ci_si(x, prec, rnd, 0)[0]  # which=0: take the Ci slot of the pair
+
+def mpf_si(x, prec, rnd=round_fast):  # real-valued Si(x)
+    return mpf_ci_si(x, prec, rnd, 1)[1]  # which=1: take the Si slot of the pair
+
+def mpc_ci(z, prec, rnd=round_fast):  # Ci(z) for an mpc pair z = (re, im)
+    re, im = z
+    if im == fzero:  # real argument: reuse the real routine
+        ci = mpf_ci_si(re, prec, rnd, 0)[0]
+        if mpf_sign(re) < 0:
+            return (ci, mpf_pi(prec, rnd))  # negative real axis adds pi*i
+        return (ci, fzero)
+    wp = prec + 20
+    cre, cim = mpc_ci_si_taylor(re, im, wp, 0)  # may raise NotImplementedError
+    cre = mpf_add(cre, mpf_euler(wp), wp)
+    ci = mpc_add((cre, cim), mpc_log(z, wp), prec, rnd)  # series + euler + log(z)
+    return ci
+
+def mpc_si(z, prec, rnd=round_fast):  # Si(z) for an mpc pair z = (re, im)
+    re, im = z
+    if im == fzero:  # real argument: reuse the real routine
+        return (mpf_ci_si(re, prec, rnd, 1)[1], fzero)
+    wp = prec + 20
+    z = mpc_ci_si_taylor(re, im, wp, 1)  # may raise NotImplementedError
+    return mpc_pos(z, prec, rnd)  # round the wp-bit series value to prec
+
+
+#-----------------------------------------------------------------------#
+#                                                                       #
+#                             Bessel functions                          #
+#                                                                       #
+#-----------------------------------------------------------------------#
+
+# A Bessel function of the first kind of integer order, J_n(x), is
+# given by the power series
+
+#             oo
+#             ___         k         2 k + n
+#            \        (-1)     / x \
+#    J_n(x) = )    ----------- | - |
+#            /___  k! (k + n)! \ 2 /
+#            k = 0
+
+# Simplifying the quotient between two successive terms gives the
+# ratio x^2 / (-4*k*(k+n)). Hence, we only need one full-precision
+# multiplication and one division by a small integer per term.
+# The complex version is very similar, the only difference being
+# that the multiplication is actually 4 multiplies.
+
+# In the general case, we have
+# J_v(x) = (x/2)**v / v! * 0F1(v+1, (-1/4)*x**2)
+
+# TODO: for extremely large x, we could use an asymptotic
+# trigonometric approximation.
+
+# TODO: recompute at higher precision if the fixed-point mantissa
+# is very small
+
+def mpf_besseljn(n, x, prec, rounding=round_fast):
+    prec += 50
+    negate = n < 0 and n & 1
+    mag = x[2]+x[3]
+    n = abs(n)
+    wp = prec + 20 + n*bitcount(n)
+    if mag < 0:
+        wp -= n * mag
+    x = to_fixed(x, wp)
+    x2 = (x**2) >> wp
+    if not n:
+        s = t = MPZ_ONE << wp
+    else:
+        s = t = (x**n // ifac(n)) >> ((n-1)*wp + n)
+    k = 1
+    while t:
+        t = ((t * x2) // (-4*k*(k+n))) >> wp
+        s += t
+        k += 1
+    if negate:
+        s = -s
+    return from_man_exp(s, -wp, prec, rounding)
+
+def mpc_besseljn(n, z, prec, rounding=round_fast):
+    negate = n < 0 and n & 1
+    n = abs(n)
+    origprec = prec
+    zre, zim = z
+    mag = max(zre[2]+zre[3], zim[2]+zim[3])
+    prec += 20 + n*bitcount(n) + abs(mag)
+    if mag < 0:
+        prec -= n * mag
+    zre = to_fixed(zre, prec)
+    zim = to_fixed(zim, prec)
+    z2re = (zre**2 - zim**2) >> prec
+    z2im = (zre*zim) >> (prec-1)
+    if not n:
+        sre = tre = MPZ_ONE << prec
+        sim = tim = MPZ_ZERO
+    else:
+        re, im = complex_int_pow(zre, zim, n)
+        sre = tre = (re // ifac(n)) >> ((n-1)*prec + n)
+        sim = tim = (im // ifac(n)) >> ((n-1)*prec + n)
+    k = 1
+    while abs(tre) + abs(tim) > 3:
+        p = -4*k*(k+n)
+        tre, tim = tre*z2re - tim*z2im, tim*z2re + tre*z2im
+        tre = (tre // p) >> prec
+        tim = (tim // p) >> prec
+        sre += tre
+        sim += tim
+        k += 1
+    if negate:
+        sre = -sre
+        sim = -sim
+    re = from_man_exp(sre, -prec, origprec, rounding)
+    im = from_man_exp(sim, -prec, origprec, rounding)
+    return (re, im)
+
+def mpf_agm(a, b, prec, rnd=round_fast):
+    """
+    Computes the arithmetic-geometric mean agm(a,b) for
+    nonnegative mpf values a, b.
+    """
+    asign, aman, aexp, abc = a
+    bsign, bman, bexp, bbc = b
+    if asign or bsign:
+        raise ComplexResult("agm of a negative number")
+    # Handle inf, nan or zero in either operand
+    if not (aman and bman):
+        if a == fnan or b == fnan:
+            return fnan
+        if a == finf:
+            if b == fzero:
+                return fnan
+            return finf
+        if b == finf:
+            if a == fzero:
+                return fnan
+            return finf
+        # agm(0,x) = agm(x,0) = 0
+        return fzero
+    wp = prec + 20
+    amag = aexp+abc
+    bmag = bexp+bbc
+    mag_delta = amag - bmag
+    # Reduce to roughly the same magnitude using floating-point AGM
+    abs_mag_delta = abs(mag_delta)
+    if abs_mag_delta > 10:
+        while abs_mag_delta > 10:
+            a, b = mpf_shift(mpf_add(a,b,wp),-1), \
+                mpf_sqrt(mpf_mul(a,b,wp),wp)
+            abs_mag_delta //= 2
+        asign, aman, aexp, abc = a
+        bsign, bman, bexp, bbc = b
+        amag = aexp+abc
+        bmag = bexp+bbc
+        mag_delta = amag - bmag
+    #print to_float(a), to_float(b)
+    # Use agm(a,b) = agm(x*a,x*b)/x to obtain a, b ~= 1
+    min_mag = min(amag,bmag)
+    max_mag = max(amag,bmag)
+    n = 0
+    # If too small, we lose precision when going to fixed-point
+    if min_mag < -8:
+        n = -min_mag
+    # If too large, we waste time using fixed-point with large numbers
+    elif max_mag > 20:
+        n = -max_mag
+    if n:
+        a = mpf_shift(a, n)
+        b = mpf_shift(b, n)
+    #print to_float(a), to_float(b)
+    af = to_fixed(a, wp)
+    bf = to_fixed(b, wp)
+    g = agm_fixed(af, bf, wp)
+    return from_man_exp(g, -wp-n, prec, rnd)
+
+def mpf_agm1(a, prec, rnd=round_fast):
+    """
+    Computes the arithmetic-geometric mean agm(1,a) for a nonnegative
+    mpf value a.
+    """
+    return mpf_agm(fone, a, prec, rnd)
+
+def mpc_agm(a, b, prec, rnd=round_fast):
+    """
+    Complex AGM of the mpc values a and b, rounded to prec bits using rnd.
+
+    TODO:
+    * check that convergence works as intended
+    * optimize
+    * select a nonarbitrary branch
+    """
+    if mpc_is_infnan(a) or mpc_is_infnan(b):
+        return fnan, fnan
+    if mpc_zero in (a, b):
+        return fzero, fzero
+    if mpc_neg(a) == b:
+        return fzero, fzero
+    wp = prec+20
+    eps = mpf_shift(fone, -wp+10)
+    while 1:
+        a1 = mpc_shift(mpc_add(a, b, wp), -1)
+        b1 = mpc_sqrt(mpc_mul(a, b, wp), wp)
+        a, b = a1, b1
+        size = mpf_min_max([mpc_abs(a,10), mpc_abs(b,10)])[1]
+        err = mpc_abs(mpc_sub(a, b, 10), 10)
+        if size == fzero or mpf_lt(err, mpf_mul(eps, size)):
+            return mpc_pos(a, prec, rnd)  # iterate carries wp bits; honor prec/rnd
+
+def mpc_agm1(a, prec, rnd=round_fast):
+    return mpc_agm(mpc_one, a, prec, rnd)
+
+def mpf_ellipk(x, prec, rnd=round_fast):
+    if not x[1]:  # zero mantissa: special values are handled first
+        if x == fzero:
+            return mpf_shift(mpf_pi(prec, rnd), -1)  # K(0) = pi/2
+        if x == fninf:
+            return fzero
+        if x == fnan:
+            return x
+    if x == fone:
+        return finf
+    # TODO: for |x| << 1/2, one could fall back to
+    # pi/2 * hyp2f1_rat((1,2),(1,2),(1,1), x)
+    wp = prec + 15
+    # Use K(x) = pi/2/agm(1,a) where a = sqrt(1-x)
+    # The sqrt raises ComplexResult if x > 1 (1-x is then negative)
+    a = mpf_sqrt(mpf_sub(fone, x, wp), wp)
+    v = mpf_agm1(a, wp)
+    r = mpf_div(mpf_pi(wp), v, prec, rnd)
+    return mpf_shift(r, -1)  # r = pi/agm(1,a); halve it
+
+def mpc_ellipk(z, prec, rnd=round_fast):
+    re, im = z
+    if im == fzero:
+        if re == finf:
+            return mpc_zero
+        if mpf_le(re, fone):
+            return mpf_ellipk(re, prec, rnd), fzero
+    wp = prec + 15
+    a = mpc_sqrt(mpc_sub(mpc_one, z, wp), wp)
+    v = mpc_agm1(a, wp)
+    r = mpc_mpf_div(mpf_pi(wp), v, prec, rnd)
+    return mpc_shift(r, -1)
+
+def mpf_ellipe(x, prec, rnd=round_fast):
+    # http://functions.wolfram.com/EllipticIntegrals/
+    # EllipticK/20/01/0001/
+    # E = (1-m)*(K'(m)*2*m + K(m))
+    sign, man, exp, bc = x
+    if not man:
+        if x == fzero:
+            return mpf_shift(mpf_pi(prec, rnd), -1)
+        if x == fninf:
+            return finf
+        if x == fnan:
+            return x
+        if x == finf:
+            raise ComplexResult
+    if x == fone:
+        return fone
+    wp = prec+20
+    mag = exp+bc
+    if mag < -wp:
+        return mpf_shift(mpf_pi(prec, rnd), -1)
+    # Compute a finite difference for K'
+    p = max(mag, 0) - wp
+    h = mpf_shift(fone, p)
+    K = mpf_ellipk(x, 2*wp)
+    Kh = mpf_ellipk(mpf_sub(x, h), 2*wp)
+    Kdiff = mpf_shift(mpf_sub(K, Kh), -p)
+    t = mpf_sub(fone, x)
+    b = mpf_mul(Kdiff, mpf_shift(x,1), wp)
+    return mpf_mul(t, mpf_add(K, b), prec, rnd)
+
+def mpc_ellipe(z, prec, rnd=round_fast):
+    re, im = z
+    if im == fzero:
+        if re == finf:
+            return (fzero, finf)
+        if mpf_le(re, fone):
+            return mpf_ellipe(re, prec, rnd), fzero
+    wp = prec + 15
+    mag = mpc_abs(z, 1)
+    p = max(mag[2]+mag[3], 0) - wp
+    h = mpf_shift(fone, p)
+    K = mpc_ellipk(z, 2*wp)
+    Kh = mpc_ellipk(mpc_add_mpf(z, h, 2*wp), 2*wp)
+    Kdiff = mpc_shift(mpc_sub(Kh, K, wp), -p)
+    t = mpc_sub(mpc_one, z, wp)
+    b = mpc_mul(Kdiff, mpc_shift(z,1), wp)
+    return mpc_mul(t, mpc_add(K, b, wp), prec, rnd)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libintmath.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libintmath.py
new file mode 100644
index 0000000000000000000000000000000000000000..7880546e135639208d136488408b102ad41682a2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libintmath.py
@@ -0,0 +1,584 @@
+"""
+Utility functions for integer math.
+
+TODO: rename, cleanup, perhaps move the gmpy wrapper code
+here from settings.py
+
+"""
+
+import math
+from bisect import bisect
+
+from .backend import xrange
+from .backend import BACKEND, gmpy, sage, sage_utils, MPZ, MPZ_ONE, MPZ_ZERO
+
+small_trailing = [0] * 256
+for j in range(1,8):
+    small_trailing[1<<j::1<<(j+1)] = [j] * (1<<(7-j))
+
+def giant_steps(start, target, n=2):
+    """
+    Build an increasing list of integers, roughly
+
+    [start, n*start, ..., target/n^2, target/n, target]
+
+    where every step is rounded conservatively, so the ratio of two
+    neighbouring entries always stays a little below n.
+
+    For n = 2 the entries are suitable working precisions for a
+    quadratically convergent method such as Newton iteration; n = 3
+    matches cubic convergence (Halley's method), and so on.
+
+        >>> giant_steps(50,1000)
+        [66, 128, 253, 502, 1000]
+        >>> giant_steps(50,1000,4)
+        [65, 252, 1000]
+
+    """
+    steps = [target]
+    while steps[-1] > start*n:
+        steps.append(steps[-1]//n + 2)
+    return list(reversed(steps))
+
+def rshift(x, n):
+    """Shift the integer x right by n bits using floor rounding. In
+    contrast to the bare expression (x >> n), a negative n is accepted
+    and turns the operation into a left shift by -n bits."""
+    if n < 0: return x << (-n)
+    return x >> n
+
+def lshift(x, n):
+    """Shift the integer x left by n bits. In contrast to the bare
+    expression (x << n), a negative n is accepted and turns the
+    operation into a right shift by -n bits with floor rounding."""
+    if n < 0: return x >> (-n)
+    return x << n
+
+if BACKEND == 'sage':
+    import operator
+    rshift = operator.rshift
+    lshift = operator.lshift
+
+def python_trailing(n):
+    """Count the number of trailing zero bits in abs(n)."""
+    if not n:
+        return 0
+    low_byte = n & 0xff
+    if low_byte:
+        return small_trailing[low_byte]
+    t = 8
+    n >>= 8
+    while not n & 0xff:
+        n >>= 8
+        t += 8
+    return t + small_trailing[n & 0xff]
+
+if BACKEND == 'gmpy':
+    if gmpy.version() >= '2':
+        def gmpy_trailing(n):
+            """Count the number of trailing zero bits in abs(n) using gmpy."""
+            if n: return MPZ(n).bit_scan1()
+            else: return 0
+    else:
+        def gmpy_trailing(n):
+            """Count the number of trailing zero bits in abs(n) using gmpy."""
+            if n: return MPZ(n).scan1()
+            else: return 0
+
+# Small powers of 2
+powers = [1<<_ for _ in range(300)]
+
+def python_bitcount(n):
+    """Calculate bit size of the nonnegative integer n."""
+    bc = bisect(powers, n)  # powers[i] == 1<<i, so this counts the powers of two <= n
+    if bc != 300:
+        return bc
+    bc = int(math.log(n, 2)) - 4  # beyond the table: shift n down to a short leading part
+    return bc + bctable[n>>bc]  # bctable[m] is bitcount(m) for m < 1024
+
+def gmpy_bitcount(n):
+    """Calculate bit size of the nonnegative integer n."""
+    if n: return MPZ(n).numdigits(2)
+    else: return 0
+
+#def sage_bitcount(n):
+#    if n: return MPZ(n).nbits()
+#    else: return 0
+
+def sage_trailing(n):
+    return MPZ(n).trailing_zero_bits()
+
+if BACKEND == 'gmpy':
+    bitcount = gmpy_bitcount
+    trailing = gmpy_trailing
+elif BACKEND == 'sage':
+    sage_bitcount = sage_utils.bitcount
+    bitcount = sage_bitcount
+    trailing = sage_trailing
+else:
+    bitcount = python_bitcount
+    trailing = python_trailing
+
+if BACKEND == 'gmpy' and 'bit_length' in dir(gmpy):
+    bitcount = gmpy.bit_length
+
+# Used to avoid slow function calls as far as possible
+trailtable = [trailing(n) for n in range(256)]
+bctable = [bitcount(n) for n in range(1024)]
+
+# TODO: speed up for bases 2, 4, 8, 16, ...
+
+def bin_to_radix(x, xbits, base, bdigits):
+    """Changes radix of a fixed-point number; i.e., converts the value
+    x * 2**-xbits to the integer floor(x * 2**-xbits * base**bdigits)."""
+    return x * (MPZ(base)**bdigits) >> xbits
+
+stddigits = '0123456789abcdefghijklmnopqrstuvwxyz'
+
+def small_numeral(n, base=10, digits=stddigits):
+    """Format a positive integer as a numeral string in the given
+    base; intended for small n, where repeated division is cheap."""
+    if base == 10:
+        return str(n)
+    out = ""
+    while n:
+        n, rem = divmod(n, base)
+        out = digits[rem] + out
+    return out
+
+def numeral_python(n, base=10, size=0, digits=stddigits):
+    """Represent the integer n as a string of digits in the given base.
+    Recursive division is used to make this function about 3x faster
+    than Python's str() for converting integers to decimal strings.
+
+    The 'size' parameters specifies the number of digits in n; this
+    number is only used to determine splitting points and need not be
+    exact."""
+    if n <= 0:
+        if not n:
+            return "0"
+        return "-" + numeral(-n, base, size, digits)
+    # Fast enough to do directly
+    if size < 250:
+        return small_numeral(n, base, digits)
+    # Divide in half
+    half = (size // 2) + (size & 1)
+    A, B = divmod(n, base**half)
+    ad = numeral(A, base, half, digits)
+    bd = numeral(B, base, half, digits).rjust(half, "0")
+    return ad + bd
+
+def numeral_gmpy(n, base=10, size=0, digits=stddigits):
+    """Represent the integer n as a string of digits in the given base.
+    Recursive division is used to make this function about 3x faster
+    than Python's str() for converting integers to decimal strings.
+
+    The 'size' parameters specifies the number of digits in n; this
+    number is only used to determine splitting points and need not be
+    exact."""
+    if n < 0:
+        return "-" + numeral(-n, base, size, digits)
+    # gmpy.digits() may cause a segmentation fault when trying to convert
+    # extremely large values to a string. The size limit may need to be
+    # adjusted on some platforms, but 1500000 works on Windows and Linux.
+    if size < 1500000:
+        return gmpy.digits(n, base)
+    # Divide in half
+    half = (size // 2) + (size & 1)
+    A, B = divmod(n, MPZ(base)**half)
+    ad = numeral(A, base, half, digits)
+    bd = numeral(B, base, half, digits).rjust(half, "0")
+    return ad + bd
+
+if BACKEND == "gmpy":
+    numeral = numeral_gmpy
+else:
+    numeral = numeral_python
+
+_1_800 = 1<<800
+_1_600 = 1<<600
+_1_400 = 1<<400
+_1_200 = 1<<200
+_1_100 = 1<<100
+_1_50 = 1<<50
+
+def isqrt_small_python(x):
+    """
+    Correctly (floor) rounded integer square root, using
+    division. Fast up to ~200 digits.
+    """
+    if not x:
+        return x
+    if x < _1_800:
+        # Exact with IEEE double precision arithmetic
+        if x < _1_50:
+            return int(x**0.5)
+        # Initial estimate can be any integer >= the true root; round up
+        r = int(x**0.5 * 1.00000000000001) + 1
+    else:
+        bc = bitcount(x)
+        n = bc//2
+        r = int((x>>(2*n-100))**0.5+2)<<(n-50)  # +2 is to round up
+    # The following iteration now precisely computes floor(sqrt(x))
+    # See e.g. Crandall & Pomerance, "Prime Numbers: A Computational
+    # Perspective"
+    while 1:
+        y = (r+x//r)>>1
+        if y >= r:
+            return r
+        r = y
+
+def isqrt_fast_python(x):
+    """
+    Fast approximate integer square root, computed using division-free
+    Newton iteration for large x. For random integers the result is almost
+    always correct (floor(sqrt(x))), but is 1 ulp too small with a roughly
+    0.1% probability. If x is very close to an exact square, the answer is
+    1 ulp wrong with high probability.
+
+    With 0 guard bits, the largest error over a set of 10^5 random
+    inputs of size 1-10^5 bits was 3 ulp. The use of 10 guard bits
+    almost certainly guarantees a max 1 ulp error.
+    """
+    # Use direct division-based iteration if sqrt(x) < 2^400
+    # Assume floating-point square root accurate to within 1 ulp, then:
+    # 0 Newton iterations good to 52 bits
+    # 1 Newton iterations good to 104 bits
+    # 2 Newton iterations good to 208 bits
+    # 3 Newton iterations good to 416 bits
+    if x < _1_800:
+        y = int(x**0.5)
+        if x >= _1_100:
+            y = (y + x//y) >> 1
+            if x >= _1_200:
+                y = (y + x//y) >> 1
+                if x >= _1_400:
+                    y = (y + x//y) >> 1
+        return y
+    bc = bitcount(x)
+    guard_bits = 10
+    x <<= 2*guard_bits
+    bc += 2*guard_bits
+    bc += (bc&1)
+    hbc = bc//2
+    startprec = min(50, hbc)
+    # Newton iteration for 1/sqrt(x), with floating-point starting value
+    r = int(2.0**(2*startprec) * (x >> (bc-2*startprec)) ** -0.5)
+    pp = startprec
+    for p in giant_steps(startprec, hbc):
+        # r**2, scaled from real size 2**(-bc) to 2**p
+        r2 = (r*r) >> (2*pp - p)
+        # x*r**2, scaled from real size ~1.0 to 2**p
+        xr2 = ((x >> (bc-p)) * r2) >> p
+        # New value of r, scaled from real size 2**(-bc/2) to 2**p
+        r = (r * ((3<<p) - xr2)) >> (pp+1)
+        pp = p
+    # (1/sqrt(x))*x = sqrt(x)
+    return (r*(x>>hbc)) >> (p+guard_bits)
+
+def sqrtrem_python(x):
+    """Correctly rounded integer (floor) square root with remainder."""
+    # to check cutoff:
+    # plot(lambda x: timing(isqrt, 2**int(x)), [0,2000])
+    if x < _1_600:
+        y = isqrt_small_python(x)
+        return y, x - y*y
+    y = isqrt_fast_python(x) + 1
+    rem = x - y*y
+    # Invariant: rem == x - y*y; y is the floor root iff 0 <= rem <= 2*y
+    # Estimate too large: y**2 - (y-1)**2 == 2*(y-1) + 1
+    while rem < 0:
+        y -= 1
+        rem += (1+2*y)
+    # Estimate too small: (y+1)**2 - y**2 == 2*y + 1 (subtract, then step)
+    while rem > 2*y:
+        rem -= (1+2*y)
+        y += 1
+    return y, rem
+
+def isqrt_python(x):
+    """Integer square root with correct (floor) rounding."""
+    return sqrtrem_python(x)[0]
+
+def sqrt_fixed(x, prec):
+    return isqrt_fast(x<<prec)
+
+sqrt_fixed2 = sqrt_fixed
+
+if BACKEND == 'gmpy':
+    if gmpy.version() >= '2':
+        isqrt_small = isqrt_fast = isqrt = gmpy.isqrt
+        sqrtrem = gmpy.isqrt_rem
+    else:
+        isqrt_small = isqrt_fast = isqrt = gmpy.sqrt
+        sqrtrem = gmpy.sqrtrem
+elif BACKEND == 'sage':
+    isqrt_small = isqrt_fast = isqrt = \
+        getattr(sage_utils, "isqrt", lambda n: MPZ(n).isqrt())
+    sqrtrem = lambda n: MPZ(n).sqrtrem()
+else:
+    isqrt_small = isqrt_small_python
+    isqrt_fast = isqrt_fast_python
+    isqrt = isqrt_python
+    sqrtrem = sqrtrem_python
+
+
+def ifib(n, _cache={}):
+    """Computes the nth Fibonacci number as an integer, for
+    integer n."""
+    if n < 0:
+        return (-1)**(-n+1) * ifib(-n)
+    if n in _cache:
+        return _cache[n]
+    m = n
+    # Use Dijkstra's logarithmic algorithm
+    # The following implementation is basically equivalent to
+    # http://en.literateprograms.org/Fibonacci_numbers_(Scheme)
+    a, b, p, q = MPZ_ONE, MPZ_ZERO, MPZ_ZERO, MPZ_ONE
+    while n:
+        if n & 1:
+            aq = a*q
+            a, b = b*q+aq+a*p, b*p+aq
+            n -= 1
+        else:
+            qq = q*q
+            p, q = p*p+qq, qq+2*p*q
+            n >>= 1
+    if m < 250:
+        _cache[m] = b
+    return b
+
+MAX_FACTORIAL_CACHE = 1000
+
+def ifac(n, memo={0:1, 1:1}):
+    """Return n factorial (for integers n >= 0 only)."""
+    f = memo.get(n)
+    if f:
+        return f
+    k = len(memo)  # memo keys are contiguous from 0, so this is the first uncached index
+    p = memo[k-1]
+    MAX = MAX_FACTORIAL_CACHE
+    while k <= n:
+        p *= k
+        if k <= MAX:  # stop growing the shared memo past the cap
+            memo[k] = p
+        k += 1
+    return p
+
+def ifac2(n, memo_pair=[{0:1}, {1:1}]):
+    """Return n!! (double factorial), integers n >= 0 only."""
+    memo = memo_pair[n&1]  # separate caches for even and odd n
+    f = memo.get(n)
+    if f:
+        return f
+    k = max(memo)  # largest cached index of this parity
+    p = memo[k]
+    MAX = MAX_FACTORIAL_CACHE
+    while k < n:
+        k += 2
+        p *= k
+        if k <= MAX:  # stop growing the shared memo past the cap
+            memo[k] = p
+    return p
+
+if BACKEND == 'gmpy':
+    ifac = gmpy.fac
+elif BACKEND == 'sage':
+    ifac = lambda n: int(sage.factorial(n))
+    ifib = sage.fibonacci
+
+def list_primes(n):
+    """Return the list of all primes <= n (empty for n < 2), by sieving."""
+    n = max(n, 0) + 1   # clamp: a negative bound would make n**0.5 complex
+    sieve = list(xrange(n))
+    sieve[:2] = [0, 0]
+    for i in xrange(2, int(n**0.5)+1):
+        if sieve[i]:
+            sieve[i*i:n:i] = [0] * len(xrange(i*i, n, i))  # strike multiples of i
+    return [p for p in sieve if p]
+
+if BACKEND == 'sage':
+    # Note: it is *VERY* important for performance that we convert
+    # the list to Python ints.
+    def list_primes(n):
+        return [int(_) for _ in sage.primes(n+1)]
+
+small_odd_primes = (3,5,7,11,13,17,19,23,29,31,37,41,43,47)
+small_odd_primes_set = set(small_odd_primes)
+
+def isprime(n):
+    """
+    Determines whether n is a prime number. A probabilistic test is
+    performed if n is very large. No special trick is used for detecting
+    perfect powers.
+
+        >>> sum(list_primes(100000))
+        454396537
+        >>> sum(n*isprime(n) for n in range(100000))
+        454396537
+
+    """
+    n = int(n)
+    if not n & 1:
+        return n == 2
+    if n < 50:
+        return n in small_odd_primes_set
+    for p in small_odd_primes:
+        if not n % p:
+            return False
+    m = n-1
+    s = trailing(m)
+    d = m >> s
+    def test(a):
+        x = pow(a,d,n)
+        if x == 1 or x == m:
+            return True
+        for r in xrange(1,s):
+            x = x**2 % n
+            if x == m:
+                return True
+        return False
+    # See http://primes.utm.edu/prove/prove2_3.html
+    if n < 1373653:
+        witnesses = [2,3]
+    elif n < 341550071728321:
+        witnesses = [2,3,5,7,11,13,17]
+    else:
+        witnesses = small_odd_primes
+    for a in witnesses:
+        if not test(a):
+            return False
+    return True
+
+def moebius(n):
+    """
+    Evaluates the Moebius function which is `mu(n) = (-1)^k` if `n`
+    is a product of `k` distinct primes and `mu(n) = 0` otherwise.
+    Uses trial division up to sqrt(n); the sign of `n` is ignored.
+    """
+    n = abs(int(n))
+    if n < 2:
+        return n
+    mu, p = 1, 2
+    while p*p <= n:
+        if not (n % p):
+            n //= p
+            if not (n % p):  # p**2 divided the original n
+                return 0
+            mu = -mu
+        p += 1
+    return -mu if n > 1 else mu  # a leftover n > 1 is one more prime factor
+
+def gcd(*args):  # Euclid's algorithm folded over all arguments; gcd() == 0
+    acc = 0
+    for val in args:
+        if not acc:
+            acc = val
+            continue
+        while val:
+            acc, val = val, acc % val
+    return acc
+
+
+#  Comment by Juan Arias de Reyna:
+#
+#  I learn this method to compute EulerE[2n] from van de Lune.
+#
+#  We apply the formula   EulerE[2n] = (-1)^n 2**(-2n) sum_{j=0}^n a(2n,2j+1)
+#
+#  where the numbers a(n,j) vanish for  j > n+1 or j <= -1  and satisfies
+#
+#  a(0,-1) = a(0,0) = 0;  a(0,1)= 1; a(0,2) = a(0,3) = 0
+#
+#  a(n,j) = a(n-1,j)                              when n+j is even
+#  a(n,j) = (j-1) a(n-1,j-1) + (j+1) a(n-1,j+1)   when n+j is odd
+#
+#
+#  But we can use only one array unidimensional a(j) since to compute
+#  a(n,j) we only need to know a(n-1,k) where k and j are of different parity
+#  and we have not to conserve the used values.
+#
+#  We cached up the values of Euler numbers to sufficiently high order.
+#
+#  Important Observation: If we intend to use the numbers
+#     EulerE[1], EulerE[2], ... , EulerE[n]
+#     it is convenient to compute EulerE[n] first, since the algorithm
+#     then computes all
+#     the previous ones along the way, and keeps them in the CACHE
+
+MAX_EULER_CACHE = 500
+
+def eulernum(m, _cache={0:MPZ_ONE}):
+    r"""
+    Computes the Euler numbers `E(n)`, which can be defined as
+    coefficients of the Taylor expansion of `1/cosh x`:
+
+    .. math ::
+
+        \frac{1}{\cosh x} = \sum_{n=0}^\infty \frac{E_n}{n!} x^n
+
+    Example::
+
+        >>> [int(eulernum(n)) for n in range(11)]
+        [1, 0, -1, 0, 5, 0, -61, 0, 1385, 0, -50521]
+        >>> [int(eulernum(n)) for n in range(11)]   # test cache
+        [1, 0, -1, 0, 5, 0, -61, 0, 1385, 0, -50521]
+
+    """
+    # for odd m, the Euler numbers are zero
+    if m & 1:
+        return MPZ_ZERO
+    f = _cache.get(m)
+    if f:
+        return f
+    MAX = MAX_EULER_CACHE
+    # a holds one row a(n, .) of the recurrence, offset by one index
+    a = [MPZ(_) for _ in [0,0,1,0,0,0]]
+    for n in range(1, m+1):
+        for j in range(n+1, -1, -2):
+            a[j+1] = (j-1)*a[j] + (j+1)*a[j+2]
+        a.append(0)
+        suma = 0
+        for k in range(n+1, -1, -2):
+            suma += a[k+1]
+        if n <= MAX:  # store once per row, after the sum is complete
+            _cache[n] = ((-1)**(n//2))*(suma // 2**n)
+        if n == m:
+            return ((-1)**(n//2))*suma // 2**n
+
+def stirling1(n, k):
+    """
+    Stirling number of the first kind.
+    """
+    if n < 0 or k < 0:
+        raise ValueError
+    if k >= n:
+        return MPZ(n == k)
+    if k < 1:
+        return MPZ_ZERO
+    L = [MPZ_ZERO] * (k+1)
+    L[1] = MPZ_ONE
+    for m in xrange(2, n+1):
+        for j in xrange(min(k, m), 0, -1):
+            L[j] = (m-1) * L[j] + L[j-1]
+    return (-1)**(n+k) * L[k]
+
+def stirling2(n, k):
+    """
+    Stirling number of the second kind.
+    """
+    if n < 0 or k < 0:
+        raise ValueError
+    if k >= n:
+        return MPZ(n == k)
+    if k <= 1:
+        return MPZ(k == 1)
+    s = MPZ_ZERO
+    t = MPZ_ONE
+    for j in xrange(k+1):
+        if (k + j) & 1:
+            s -= t * MPZ(j)**n
+        else:
+            s += t * MPZ(j)**n
+        t = t * (k - j) // (j + 1)
+    return s // ifac(k)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libmpc.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libmpc.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc22d0e73674676c8a9249ebc2d48da7f3be8b0d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libmpc.py
@@ -0,0 +1,835 @@
+"""
+Low-level functions for complex arithmetic.
+"""
+
+import sys
+
+from .backend import MPZ, MPZ_ZERO, MPZ_ONE, MPZ_TWO, BACKEND
+
+from .libmpf import (\
+    round_floor, round_ceiling, round_down, round_up,
+    round_nearest, round_fast, bitcount,
+    bctable, normalize, normalize1, reciprocal_rnd, rshift, lshift, giant_steps,
+    negative_rnd,
+    to_str, to_fixed, from_man_exp, from_float, to_float, from_int, to_int,
+    fzero, fone, ftwo, fhalf, finf, fninf, fnan, fnone,
+    mpf_abs, mpf_pos, mpf_neg, mpf_add, mpf_sub, mpf_mul,
+    mpf_div, mpf_mul_int, mpf_shift, mpf_sqrt, mpf_hypot,
+    mpf_rdiv_int, mpf_floor, mpf_ceil, mpf_nint, mpf_frac,
+    mpf_sign, mpf_hash,
+    ComplexResult
+)
+
+from .libelefun import (\
+    mpf_pi, mpf_exp, mpf_log, mpf_cos_sin, mpf_cosh_sinh, mpf_tan, mpf_pow_int,
+    mpf_log_hypot,
+    mpf_cos_sin_pi, mpf_phi,
+    mpf_cos, mpf_sin, mpf_cos_pi, mpf_sin_pi,
+    mpf_atan, mpf_atan2, mpf_cosh, mpf_sinh, mpf_tanh,
+    mpf_asin, mpf_acos, mpf_acosh, mpf_nthroot, mpf_fibonacci
+)
+
+# An mpc value is a (real, imag) tuple; the constants below pair libmpf constants with fzero
+mpc_one = fone, fzero
+mpc_zero = fzero, fzero
+mpc_two = ftwo, fzero
+mpc_half = (fhalf, fzero)
+
+_infs = (finf, fninf)  # membership set used by mpc_is_inf
+_infs_nan = (finf, fninf, fnan)  # membership set used by mpc_is_infnan
+
+def mpc_is_inf(z):
+    """Return True if the real or imaginary part of z is finf or fninf, else False"""
+    re, im = z
+    if re in _infs: return True
+    if im in _infs: return True
+    return False
+
+def mpc_is_infnan(z):
+    """Return True if the real or imaginary part of z is finf, fninf or fnan, else False"""
+    re, im = z
+    if re in _infs_nan: return True
+    if im in _infs_nan: return True
+    return False
+
+def mpc_to_str(z, dps, **kwargs):
+    re, im = z  # both parts are formatted with the same dps and the same to_str keyword options
+    rs = to_str(re, dps, **kwargs)
+    if im[0]:  # sign field set: print "- <negated imaginary part>"
+        return rs + " - " + to_str(mpf_neg(im), dps, **kwargs) + "j"
+    else:
+        return rs + " + " + to_str(im, dps, **kwargs) + "j"
+
+def mpc_to_complex(z, strict=False, rnd=round_fast):
+    re, im = z  # each part is converted with to_float and the pair becomes a Python complex
+    return complex(to_float(re, strict, rnd), to_float(im, strict, rnd))
+
+def mpc_hash(z):
+    if sys.version_info >= (3, 2):
+        re, im = z
+        h = mpf_hash(re) + sys.hash_info.imag * mpf_hash(im)  # hash_info.imag: interpreter's multiplier for the imaginary part
+        # Need to reduce modulo 2**sys.hash_info.width (2^32 or 2^64)
+        h = h % (2**sys.hash_info.width)
+        return int(h)
+    else:
+        try:
+            return hash(mpc_to_complex(z, strict=True))
+        except OverflowError:
+            return hash(z)  # fall back to hashing the raw (real, imag) tuple
+
+def mpc_conjugate(z, prec, rnd=round_fast):
+    re, im = z  # only im goes through mpf_neg with prec/rnd; re is returned as given
+    return re, mpf_neg(im, prec, rnd)
+
+def mpc_is_nonzero(z):
+    return z != mpc_zero  # plain tuple comparison against (fzero, fzero)
+
+def mpc_add(z, w, prec, rnd=round_fast):
+    a, b = z  # z = a + b*i
+    c, d = w  # w = c + d*i
+    return mpf_add(a, c, prec, rnd), mpf_add(b, d, prec, rnd)
+
+def mpc_add_mpf(z, x, prec, rnd=round_fast):
+    a, b = z  # x is added to the real part only; the imaginary part b is returned as given
+    return mpf_add(a, x, prec, rnd), b
+
+def mpc_sub(z, w, prec=0, rnd=round_fast):
+    a, b = z  # z = a + b*i
+    c, d = w  # w = c + d*i; note prec defaults to 0 here and is passed straight to mpf_sub
+    return mpf_sub(a, c, prec, rnd), mpf_sub(b, d, prec, rnd)
+
+def mpc_sub_mpf(z, p, prec=0, rnd=round_fast):
+    a, b = z  # p is subtracted from the real part only; the imaginary part b is returned as given
+    return mpf_sub(a, p, prec, rnd), b
+
+def mpc_pos(z, prec, rnd=round_fast):
+    a, b = z  # mpf_pos is applied to each part with the given prec/rnd
+    return mpf_pos(a, prec, rnd), mpf_pos(b, prec, rnd)
+
+def mpc_neg(z, prec=None, rnd=round_fast):
+    a, b = z  # both parts are negated; prec defaults to None and is forwarded to mpf_neg
+    return mpf_neg(a, prec, rnd), mpf_neg(b, prec, rnd)
+
+def mpc_shift(z, n):
+    a, b = z  # mpf_shift is applied to both parts with the same n
+    return mpf_shift(a, n), mpf_shift(b, n)
+
+def mpc_abs(z, prec, rnd=round_fast):
+    """Absolute value of a complex number, |a+bi|.
+    Returns an mpf value (computed by mpf_hypot), not an mpc tuple."""
+    a, b = z
+    return mpf_hypot(a, b, prec, rnd)
+
+def mpc_arg(z, prec, rnd=round_fast):
+    """Argument of a complex number. Returns an mpf value."""
+    a, b = z
+    return mpf_atan2(b, a, prec, rnd)  # atan2(imaginary part, real part)
+
+def mpc_floor(z, prec, rnd=round_fast):
+    a, b = z  # mpf_floor is applied to the real and imaginary parts independently
+    return mpf_floor(a, prec, rnd), mpf_floor(b, prec, rnd)
+
+def mpc_ceil(z, prec, rnd=round_fast):
+    a, b = z  # mpf_ceil is applied to the real and imaginary parts independently
+    return mpf_ceil(a, prec, rnd), mpf_ceil(b, prec, rnd)
+
+def mpc_nint(z, prec, rnd=round_fast):
+    a, b = z  # mpf_nint is applied to the real and imaginary parts independently
+    return mpf_nint(a, prec, rnd), mpf_nint(b, prec, rnd)
+
+def mpc_frac(z, prec, rnd=round_fast):
+    a, b = z  # mpf_frac is applied to the real and imaginary parts independently
+    return mpf_frac(a, prec, rnd), mpf_frac(b, prec, rnd)
+
+
+def mpc_mul(z, w, prec, rnd=round_fast):
+    """
+    Complex multiplication.
+
+    Returns the real and imaginary part of (a+bi)*(c+di), rounded to
+    the specified precision. The rounding mode applies to the real and
+    imaginary parts separately.
+    """
+    a, b = z
+    c, d = w
+    p = mpf_mul(a, c)  # the four partial products are formed without prec/rnd
+    q = mpf_mul(b, d)
+    r = mpf_mul(a, d)
+    s = mpf_mul(b, c)
+    re = mpf_sub(p, q, prec, rnd)  # re = a*c - b*d; rounding happens only here
+    im = mpf_add(r, s, prec, rnd)  # im = a*d + b*c; rounding happens only here
+    return re, im
+
+def mpc_square(z, prec, rnd=round_fast):
+    # (a+b*I)**2 == a**2 - b**2 + 2*I*a*b
+    a, b = z
+    p = mpf_mul(a,a)  # a*a and b*b are formed without prec/rnd
+    q = mpf_mul(b,b)
+    r = mpf_mul(a,b, prec, rnd)  # a*b is rounded here; the doubling below is a shift by one
+    re = mpf_sub(p, q, prec, rnd)
+    im = mpf_shift(r, 1)
+    return re, im
+
+def mpc_mul_mpf(z, p, prec, rnd=round_fast):
+    a, b = z  # both parts are multiplied by the same real mpf p
+    re = mpf_mul(a, p, prec, rnd)
+    im = mpf_mul(b, p, prec, rnd)
+    return re, im
+
+def mpc_mul_imag_mpf(z, x, prec, rnd=round_fast):
+    """
+    Multiply the mpc value z by I*x where x is an mpf value.
+    """
+    a, b = z
+    re = mpf_neg(mpf_mul(b, x, prec, rnd))  # (a+b*I)*(I*x) has real part -b*x
+    im = mpf_mul(a, x, prec, rnd)  # and imaginary part a*x
+    return re, im
+
+def mpc_mul_int(z, n, prec, rnd=round_fast):
+    a, b = z  # both parts are multiplied by n via mpf_mul_int
+    re = mpf_mul_int(a, n, prec, rnd)
+    im = mpf_mul_int(b, n, prec, rnd)
+    return re, im
+
+def mpc_div(z, w, prec, rnd=round_fast):
+    a, b = z
+    c, d = w
+    wp = prec + 10  # working precision for the intermediate sums below
+    # mag = c*c + d*d
+    mag = mpf_add(mpf_mul(c, c), mpf_mul(d, d), wp)
+    # (a*c+b*d)/mag, (b*c-a*d)/mag
+    t = mpf_add(mpf_mul(a,c), mpf_mul(b,d), wp)
+    u = mpf_sub(mpf_mul(b,c), mpf_mul(a,d), wp)
+    return mpf_div(t,mag,prec,rnd), mpf_div(u,mag,prec,rnd)  # final rounding to prec happens in the two divisions
+
+def mpc_div_mpf(z, p, prec, rnd=round_fast):
+    """Calculate z/p where p is real (an mpf value); both parts are divided by p"""
+    a, b = z
+    re = mpf_div(a, p, prec, rnd)
+    im = mpf_div(b, p, prec, rnd)
+    return re, im
+
+def mpc_reciprocal(z, prec, rnd=round_fast):
+    """Calculate 1/z efficiently"""
+    a, b = z
+    m = mpf_add(mpf_mul(a,a),mpf_mul(b,b),prec+10)  # m = a*a + b*b, summed at prec+10
+    re = mpf_div(a, m, prec, rnd)  # 1/(a+bi) = (a - bi)/m
+    im = mpf_neg(mpf_div(b, m, prec, rnd))
+    return re, im
+
+def mpc_mpf_div(p, z, prec, rnd=round_fast):
+    """Calculate p/z where p is real efficiently"""
+    a, b = z
+    m = mpf_add(mpf_mul(a,a),mpf_mul(b,b), prec+10)  # m = a*a + b*b, summed at prec+10
+    re = mpf_div(mpf_mul(a,p), m, prec, rnd)  # p/(a+bi) = (p*a - p*b*i)/m
+    im = mpf_div(mpf_neg(mpf_mul(b,p)), m, prec, rnd)
+    return re, im
+
+def complex_int_pow(a, b, n):
+    """Complex integer power: computes (a+b*I)**n exactly for
+    nonnegative n (a and b must be Python ints)."""
+    wre = 1  # (wre, wim) accumulates the result, starting from 1 + 0*I
+    wim = 0
+    while n:
+        if n & 1:  # low bit set: multiply the accumulator by the current power (a, b)
+            wre, wim = wre*a - wim*b, wim*a + wre*b
+            n -= 1
+        a, b = a*a - b*b, 2*a*b  # square the base
+        n //= 2
+    return wre, wim
+
+def mpc_pow(z, w, prec, rnd=round_fast):
+    if w[1] == fzero:  # exponent has zero imaginary part: use the real-exponent routine
+        return mpc_pow_mpf(z, w[0], prec, rnd)
+    return mpc_exp(mpc_mul(mpc_log(z, prec+10), w, prec+10), prec, rnd)  # z**w = exp(w*log(z))
+
+def mpc_pow_mpf(z, p, prec, rnd=round_fast):
+    psign, pman, pexp, pbc = p  # the mpf exponent unpacked into sign, mantissa, exponent, bitcount
+    if pexp >= 0:  # p equals the integer (-1)**psign * pman * 2**pexp
+        return mpc_pow_int(z, (-1)**psign * (pman<<pexp), prec, rnd)
+    if pexp == -1:  # p = (+/-)pman/2: take sqrt(z) first, then an integer power
+        sqrtz = mpc_sqrt(z, prec+10)
+        return mpc_pow_int(sqrtz, (-1)**psign * pman, prec, rnd)
+    return mpc_exp(mpc_mul_mpf(mpc_log(z, prec+10), p, prec+10), prec, rnd)  # general case: exp(p*log(z))
+
+def mpc_pow_int(z, n, prec, rnd=round_fast):
+    a, b = z
+    if b == fzero:  # base has zero imaginary part: real integer power
+        return mpf_pow_int(a, n, prec, rnd), fzero
+    if a == fzero:  # base is b*i: (b*i)**n = b**n * i**n
+        v = mpf_pow_int(b, n, prec, rnd)
+        n %= 4  # i**n depends only on n mod 4 (Python's % is non-negative here)
+        if n == 0:
+            return v, fzero
+        elif n == 1:
+            return fzero, v
+        elif n == 2:
+            return mpf_neg(v), fzero
+        elif n == 3:
+            return fzero, mpf_neg(v)
+    if n == 0: return mpc_one
+    if n == 1: return mpc_pos(z, prec, rnd)
+    if n == 2: return mpc_square(z, prec, rnd)
+    if n == -1: return mpc_reciprocal(z, prec, rnd)
+    if n < 0: return mpc_reciprocal(mpc_pow_int(z, -n, prec+4), prec, rnd)  # z**n = 1/z**(-n)
+    asign, aman, aexp, abc = a  # unpack sign, mantissa, exponent, bitcount of each part
+    bsign, bman, bexp, bbc = b
+    if asign: aman = -aman  # fold the sign into the integer mantissa
+    if bsign: bman = -bman
+    de = aexp - bexp
+    abs_de = abs(de)
+    exact_size = n*(abs_de + max(abc, bbc))  # size estimate that gates the exact integer path
+    if exact_size < 10000:
+        if de > 0:  # shift the mantissa with the larger exponent so both share one exponent
+            aman <<= de
+            aexp = bexp
+        else:
+            bman <<= (-de)
+            bexp = aexp
+        re, im = complex_int_pow(aman, bman, n)  # exact integer power of the scaled mantissas
+        re = from_man_exp(re, int(n*aexp), prec, rnd)
+        im = from_man_exp(im, int(n*bexp), prec, rnd)
+        return re, im
+    return mpc_exp(mpc_mul_int(mpc_log(z, prec+10), n, prec+10), prec, rnd)  # otherwise exp(n*log(z))
+
+def mpc_sqrt(z, prec, rnd=round_fast):
+    """Complex square root (principal branch).
+
+    We have sqrt(a+bi) = sqrt((r+a)/2) + b/sqrt(2*(r+a))*i where
+    r = abs(a+bi), when a+bi is not a negative real number."""
+    a, b = z
+    if b == fzero:
+        if a == fzero:
+            return (a, b)  # sqrt(0) = 0: the input parts are returned unchanged
+        # When a+bi is a negative real number, we get a real sqrt times i
+        if a[0]:
+            im = mpf_sqrt(mpf_neg(a), prec, rnd)
+            return (fzero, im)
+        else:
+            re = mpf_sqrt(a, prec, rnd)
+            return (re, fzero)
+    wp = prec+20  # working precision for the intermediate values t, v, w
+    if not a[0]:                               # case a positive
+        t  = mpf_add(mpc_abs((a, b), wp), a, wp)  # t = abs(a+bi) + a
+        u = mpf_shift(t, -1)                      # u = t/2
+        re = mpf_sqrt(u, prec, rnd)               # re = sqrt(u)
+        v = mpf_shift(t, 1)                       # v = 2*t
+        w  = mpf_sqrt(v, wp)                      # w = sqrt(v)
+        im = mpf_div(b, w, prec, rnd)             # im = b / w
+    else:                                      # case a negative
+        t = mpf_sub(mpc_abs((a, b), wp), a, wp)   # t = abs(a+bi) - a
+        u = mpf_shift(t, -1)                      # u = t/2
+        im = mpf_sqrt(u, prec, rnd)               # im = sqrt(u)
+        v = mpf_shift(t, 1)                       # v = 2*t
+        w  = mpf_sqrt(v, wp)                      # w = sqrt(v)
+        re = mpf_div(b, w, prec, rnd)             # re = b/w
+        if b[0]:  # b negative: negate both parts so the real part ends up non-negative
+            re = mpf_neg(re)
+            im = mpf_neg(im)
+    return re, im
+
+def mpc_nthroot_fixed(a, b, n, prec):
+    # a, b signed integers at fixed precision prec
+    start = 50  # precision (in bits) of the starting approximation
+    a1 = int(rshift(a, prec - n*start))
+    b1 = int(rshift(b, prec - n*start))
+    try:
+        r = (a1 + 1j * b1)**(1.0/n)  # starting guess from Python's built-in complex power
+        re = r.real
+        im = r.imag
+        re = MPZ(int(re))
+        im = MPZ(int(im))
+    except OverflowError:  # float path overflowed: get the starting guess from mpc_pow instead
+        a1 = from_int(a1, start)
+        b1 = from_int(b1, start)
+        fn = from_int(n)
+        nth = mpf_rdiv_int(1, fn, start)
+        re, im = mpc_pow((a1, b1), (nth, fzero), start)
+        re = to_int(re)
+        im = to_int(im)
+    extra = 10
+    prevp = start
+    extra1 = n
+    for p in giant_steps(start, prec+extra):  # refine through the precisions yielded by giant_steps
+        # this is slow for large n, unlike int_pow_fixed
+        re2, im2 = complex_int_pow(re, im, n-1)
+        re2 = rshift(re2, (n-1)*prevp - p - extra1)
+        im2 = rshift(im2, (n-1)*prevp - p - extra1)
+        r4 = (re2*re2 + im2*im2) >> (p + extra1)  # squared modulus of (re2, im2), rescaled
+        ap = rshift(a, prec - p)
+        bp = rshift(b, prec - p)
+        rec = (ap * re2 + bp * im2) >> p  # (ap + bp*i) times the conjugate of (re2 + im2*i)
+        imc = (-ap * im2 + bp * re2) >> p
+        reb = (rec << p) // r4
+        imb = (imc << p) // r4
+        re = (reb + (n-1)*lshift(re, p-prevp))//n  # new estimate = (quotient + (n-1)*old estimate) / n
+        im = (imb + (n-1)*lshift(im, p-prevp))//n
+        prevp = p
+    return re, im
+
+def mpc_nthroot(z, n, prec, rnd=round_fast):
+    """
+    Complex n-th root.
+
+    Use Newton method as in the real case when it is faster,
+    otherwise use z**(1/n)
+    """
+    a, b = z
+    if a[0] == 0 and b == fzero:  # zero imaginary part and sign field 0: use the real n-th root
+        re = mpf_nthroot(a, n, prec, rnd)
+        return (re, fzero)
+    if n < 2:
+        if n == 0:
+            return mpc_one
+        if n == 1:
+            return mpc_pos((a, b), prec, rnd)
+        if n == -1:
+            return mpc_div(mpc_one, (a, b), prec, rnd)
+        inverse = mpc_nthroot((a, b), -n, prec+5, reciprocal_rnd[rnd])  # n < -1: root for -n, then invert
+        return mpc_div(mpc_one, inverse, prec, rnd)
+    if n <= 20:
+        prec2 = int(1.2 * (prec + 10))
+        asign, aman, aexp, abc = a
+        bsign, bman, bexp, bbc = b
+        pf = mpc_abs((a,b), prec)
+        if pf[-2] + pf[-1] > -10  and pf[-2] + pf[-1] < prec:  # exp+bc of |z| within range for the fixed-point path
+            af = to_fixed(a, prec2)
+            bf = to_fixed(b, prec2)
+            re, im = mpc_nthroot_fixed(af, bf, n, prec2)
+            extra = 10  # matches the extra bits used inside mpc_nthroot_fixed
+            re = from_man_exp(re, -prec2-extra, prec2, rnd)
+            im = from_man_exp(im, -prec2-extra, prec2, rnd)
+            return re, im
+    fn = from_int(n)
+    prec2 = prec+10 + 10
+    nth = mpf_rdiv_int(1, fn, prec2)  # nth = 1/n
+    re, im = mpc_pow((a, b), (nth, fzero), prec2, rnd)  # z**(1/n) at the higher precision prec2
+    re = normalize(re[0], re[1], re[2], re[3], prec, rnd)  # round back to the requested prec
+    im = normalize(im[0], im[1], im[2], im[3], prec, rnd)
+    return re, im
+
+def mpc_cbrt(z, prec, rnd=round_fast):
+    """
+    Complex cube root; delegates to mpc_nthroot with n = 3.
+    """
+    return mpc_nthroot(z, 3, prec, rnd)
+
+def mpc_exp(z, prec, rnd=round_fast):
+    """
+    Complex exponential function.
+
+    We use the direct formula exp(a+bi) = exp(a) * (cos(b) + sin(b)*i)
+    for the computation. This formula is very nice because it is
+    perfectly stable; since we just do real multiplications, the only
+    numerical errors that can creep in are single-ulp rounding errors.
+
+    The formula is efficient since mpmath's real exp is quite fast and
+    since we can compute cos and sin simultaneously.
+
+    It is no problem if a and b are large; if the implementations of
+    exp/cos/sin are accurate and efficient for all real numbers, then
+    so is this function for all complex numbers.
+    """
+    a, b = z
+    if a == fzero:  # zero real part: the result is just (cos(b), sin(b))
+        return mpf_cos_sin(b, prec, rnd)
+    if b == fzero:  # zero imaginary part: real exponential
+        return mpf_exp(a, prec, rnd), fzero
+    mag = mpf_exp(a, prec+4, rnd)  # exp(a), computed with 4 extra bits
+    c, s = mpf_cos_sin(b, prec+4, rnd)
+    re = mpf_mul(mag, c, prec, rnd)
+    im = mpf_mul(mag, s, prec, rnd)
+    return re, im
+
+def mpc_log(z, prec, rnd=round_fast):
+    re = mpf_log_hypot(z[0], z[1], prec, rnd)  # real part from mpf_log_hypot(real, imag) -- presumably log|z|
+    im = mpc_arg(z, prec, rnd)  # imaginary part is the argument of z
+    return re, im
+
+def mpc_cos(z, prec, rnd=round_fast):
+    """Complex cosine. The formula used is cos(a+bi) = cos(a)*cosh(b) -
+    sin(a)*sinh(b)*i.
+
+    The same comments apply as for the complex exp: only real
+    multiplications are performed, so no cancellation errors are
+    possible. The formula is also efficient since we can compute both
+    pairs (cos, sin) and (cosh, sinh) in single steps."""
+    a, b = z
+    if b == fzero:  # zero imaginary part: real cosine
+        return mpf_cos(a, prec, rnd), fzero
+    if a == fzero:  # zero real part: cos(b*i) = cosh(b)
+        return mpf_cosh(b, prec, rnd), fzero
+    wp = prec + 6  # working precision for the four real function values
+    c, s = mpf_cos_sin(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    re = mpf_mul(c, ch, prec, rnd)
+    im = mpf_mul(s, sh, prec, rnd)
+    return re, mpf_neg(im)
+
+def mpc_sin(z, prec, rnd=round_fast):
+    """Complex sine. We have sin(a+bi) = sin(a)*cosh(b) +
+    cos(a)*sinh(b)*i. See the docstring for mpc_cos for additional
+    comments."""
+    a, b = z
+    if b == fzero:  # zero imaginary part: real sine
+        return mpf_sin(a, prec, rnd), fzero
+    if a == fzero:  # zero real part: sin(b*i) = sinh(b)*i
+        return fzero, mpf_sinh(b, prec, rnd)
+    wp = prec + 6  # working precision for the four real function values
+    c, s = mpf_cos_sin(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    re = mpf_mul(s, ch, prec, rnd)
+    im = mpf_mul(c, sh, prec, rnd)
+    return re, im
+
+def mpc_tan(z, prec, rnd=round_fast):
+    """Complex tangent. Computed as tan(a+bi) = sin(2a)/M + sinh(2b)/M*i
+    where M = cos(2a) + cosh(2b)."""
+    a, b = z
+    asign, aman, aexp, abc = a  # NOTE(review): these unpacked fields are not used below
+    bsign, bman, bexp, bbc = b
+    if b == fzero: return mpf_tan(a, prec, rnd), fzero
+    if a == fzero: return fzero, mpf_tanh(b, prec, rnd)
+    wp = prec + 15
+    a = mpf_shift(a, 1)  # a and b are doubled in place: from here on they hold 2a and 2b
+    b = mpf_shift(b, 1)
+    c, s = mpf_cos_sin(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    # TODO: handle cancellation when c ~=  -1 and ch ~= 1
+    mag = mpf_add(c, ch, wp)  # M = cos(2a) + cosh(2b)
+    re = mpf_div(s, mag, prec, rnd)
+    im = mpf_div(sh, mag, prec, rnd)
+    return re, im
+
+def mpc_cos_pi(z, prec, rnd=round_fast):
+    a, b = z  # computes cos(pi*z) for z = a + b*i
+    if b == fzero:
+        return mpf_cos_pi(a, prec, rnd), fzero
+    b = mpf_mul(b, mpf_pi(prec+5), prec+5)  # b now holds pi*b; a is left unscaled for the *_pi routines
+    if a == fzero:
+        return mpf_cosh(b, prec, rnd), fzero
+    wp = prec + 6
+    c, s = mpf_cos_sin_pi(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    re = mpf_mul(c, ch, prec, rnd)
+    im = mpf_mul(s, sh, prec, rnd)
+    return re, mpf_neg(im)  # same real/imaginary combination as mpc_cos
+
+def mpc_sin_pi(z, prec, rnd=round_fast):
+    a, b = z  # computes sin(pi*z) for z = a + b*i
+    if b == fzero:
+        return mpf_sin_pi(a, prec, rnd), fzero
+    b = mpf_mul(b, mpf_pi(prec+5), prec+5)  # b now holds pi*b; a is left unscaled for the *_pi routines
+    if a == fzero:
+        return fzero, mpf_sinh(b, prec, rnd)
+    wp = prec + 6
+    c, s = mpf_cos_sin_pi(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    re = mpf_mul(s, ch, prec, rnd)
+    im = mpf_mul(c, sh, prec, rnd)
+    return re, im  # same real/imaginary combination as mpc_sin
+
+def mpc_cos_sin(z, prec, rnd=round_fast):
+    a, b = z  # returns the pair (cos(z), sin(z)), each as an mpc tuple
+    if a == fzero:
+        ch, sh = mpf_cosh_sinh(b, prec, rnd)
+        return (ch, fzero), (fzero, sh)
+    if b == fzero:
+        c, s = mpf_cos_sin(a, prec, rnd)
+        return (c, fzero), (s, fzero)
+    wp = prec + 6
+    c, s = mpf_cos_sin(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    cre = mpf_mul(c, ch, prec, rnd)  # cos(z) = c*ch - s*sh*i
+    cim = mpf_mul(s, sh, prec, rnd)
+    sre = mpf_mul(s, ch, prec, rnd)  # sin(z) = s*ch + c*sh*i
+    sim = mpf_mul(c, sh, prec, rnd)
+    return (cre, mpf_neg(cim)), (sre, sim)
+
+def mpc_cos_sin_pi(z, prec, rnd=round_fast):
+    a, b = z  # returns the pair (cos(pi*z), sin(pi*z)), each as an mpc tuple
+    if b == fzero:
+        c, s = mpf_cos_sin_pi(a, prec, rnd)
+        return (c, fzero), (s, fzero)
+    b = mpf_mul(b, mpf_pi(prec+5), prec+5)  # b now holds pi*b
+    if a == fzero:
+        ch, sh = mpf_cosh_sinh(b, prec, rnd)
+        return (ch, fzero), (fzero, sh)
+    wp = prec + 6
+    c, s = mpf_cos_sin_pi(a, wp)
+    ch, sh = mpf_cosh_sinh(b, wp)
+    cre = mpf_mul(c, ch, prec, rnd)
+    cim = mpf_mul(s, sh, prec, rnd)
+    sre = mpf_mul(s, ch, prec, rnd)
+    sim = mpf_mul(c, sh, prec, rnd)
+    return (cre, mpf_neg(cim)), (sre, sim)
+
+def mpc_cosh(z, prec, rnd=round_fast):
+    """Complex hyperbolic cosine. Computed as cosh(z) = cos(z*i)."""
+    a, b = z
+    return mpc_cos((b, mpf_neg(a)), prec, rnd)  # (b, -a) is -i*z; cos is even, so this equals cos(i*z)
+
+def mpc_sinh(z, prec, rnd=round_fast):
+    """Complex hyperbolic sine. Computed as sinh(z) = -i*sin(z*i)."""
+    a, b = z
+    b, a = mpc_sin((b, a), prec, rnd)  # sin of the swapped pair; its parts are swapped back on return
+    return a, b
+
+def mpc_tanh(z, prec, rnd=round_fast):
+    """Complex hyperbolic tangent. Computed as tanh(z) = -i*tan(z*i)."""
+    a, b = z
+    b, a = mpc_tan((b, a), prec, rnd)  # tan of the swapped pair; its parts are swapped back on return
+    return a, b
+
+# TODO: avoid loss of accuracy
+def mpc_atan(z, prec, rnd=round_fast):
+    a, b = z
+    # atan(z) = (I/2)*(log(1-I*z) - log(1+I*z))
+    # x = 1-I*z = 1 + b - I*a
+    # y = 1+I*z = 1 - b + I*a
+    wp = prec + 15  # working precision for x, y and the two logarithms
+    x = mpf_add(fone, b, wp), mpf_neg(a)
+    y = mpf_sub(fone, b, wp), a
+    l1 = mpc_log(x, wp)
+    l2 = mpc_log(y, wp)
+    a, b = mpc_sub(l1, l2, prec, rnd)  # a, b are reused for the parts of log(x) - log(y)
+    # (I/2) * (a+b*I) = (-b/2 + a/2*I)
+    v = mpf_neg(mpf_shift(b,-1)), mpf_shift(a,-1)
+    # Subtraction at infinity gives correct real part but
+    # wrong imaginary part (should be zero)
+    if v[1] == fnan and mpc_is_inf(z):
+        v = (v[0], fzero)
+    return v
+
+beta_crossover = from_float(0.6417)  # threshold on beta: acos_asin uses it to choose the real-part formula
+alpha_crossover = from_float(1.5)  # threshold on alpha: acos_asin uses it to choose the imaginary-part formula
+
+def acos_asin(z, prec, rnd, n):
+    """ complex acos for n = 0, asin for n = 1
+    The algorithm is described in
+    T.E. Hull, T.F. Fairgrieve and P.T.P. Tang
+    'Implementing the Complex Arcsine and Arcosine Functions
+    using Exception Handling',
+    ACM Trans. on Math. Software Vol. 23 (1997), p299
+    The complex acos and asin can be defined as
+    acos(z) = acos(beta) - I*sign(a)* log(alpha + sqrt(alpha**2 -1))
+    asin(z) = asin(beta) + I*sign(a)* log(alpha + sqrt(alpha**2 -1))
+    where z = a + I*b
+    alpha = (1/2)*(r + s); beta = (1/2)*(r - s) = a/alpha
+    r = sqrt((a+1)**2 + b**2); s = sqrt((a-1)**2 + b**2)
+    These expressions are rewritten in different ways in different
+    regions, delimited by two crossovers alpha_crossover and beta_crossover,
+    and by abs(a) <= 1, in order to improve the numerical accuracy.
+    """
+    a, b = z
+    wp = prec + 10
+    # special cases with real argument
+    if b == fzero:
+        am = mpf_sub(fone, mpf_abs(a), wp)
+        # case abs(a) <= 1
+        if not am[0]:
+            if n == 0:
+                return mpf_acos(a, prec, rnd), fzero
+            else:
+                return mpf_asin(a, prec, rnd), fzero
+        # cases abs(a) > 1
+        else:
+            # case a < -1
+            if a[0]:
+                pi = mpf_pi(prec, rnd)
+                c = mpf_acosh(mpf_neg(a), prec, rnd)
+                if n == 0:
+                    return pi, mpf_neg(c)
+                else:
+                    return mpf_neg(mpf_shift(pi, -1)), c
+            # case a > 1
+            else:
+                c = mpf_acosh(a, prec, rnd)
+                if n == 0:
+                    return fzero, c
+                else:
+                    pi = mpf_pi(prec, rnd)
+                    return mpf_shift(pi, -1), mpf_neg(c)
+    asign = bsign = 0  # remember the original signs; a and b are made non-negative below
+    if a[0]:
+        a = mpf_neg(a)
+        asign = 1
+    if b[0]:
+        b = mpf_neg(b)
+        bsign = 1
+    am = mpf_sub(fone, a, wp)  # am = 1 - a
+    ap = mpf_add(fone, a, wp)  # ap = 1 + a
+    r = mpf_hypot(ap, b, wp)
+    s = mpf_hypot(am, b, wp)
+    alpha = mpf_shift(mpf_add(r, s, wp), -1)
+    beta = mpf_div(a, alpha, wp)
+    b2 = mpf_mul(b,b, wp)
+    # case beta <= beta_crossover
+    if not mpf_sub(beta_crossover, beta, wp)[0]:
+        if n == 0:
+            re = mpf_acos(beta, wp)
+        else:
+            re = mpf_asin(beta, wp)
+    else:
+        # to compute the real part in this region use the identity
+        # asin(beta) = atan(beta/sqrt(1-beta**2))
+        # beta/sqrt(1-beta**2) = a/sqrt((alpha + a) * (alpha - a))
+        # alpha + a is numerically accurate; alpha - a can have
+        # cancellations leading to numerical inaccuracies, so rewrite
+        # it in different ways according to the region
+        Ax = mpf_add(alpha, a, wp)
+        # case a <= 1
+        if not am[0]:
+            # c = b*b/(r + (a+1)); d = (s + (1-a))
+            # alpha - a = (1/2)*(c + d)
+            # case n=0: re = atan(sqrt((1/2) * Ax * (c + d))/a)
+            # case n=1: re = atan(a/sqrt((1/2) * Ax * (c + d)))
+            c = mpf_div(b2, mpf_add(r, ap, wp), wp)
+            d = mpf_add(s, am, wp)
+            re = mpf_shift(mpf_mul(Ax, mpf_add(c, d, wp), wp), -1)
+            if n == 0:
+                re = mpf_atan(mpf_div(mpf_sqrt(re, wp), a, wp), wp)
+            else:
+                re = mpf_atan(mpf_div(a, mpf_sqrt(re, wp), wp), wp)
+        else:
+            # c = Ax/(r + (a+1)); d = Ax/(s - (1-a))
+            # alpha - a = (1/2)*(c + d)
+            # case n = 0: re = atan(b*sqrt(c + d)/2/a)
+            # case n = 1: re = atan(a/(b*sqrt(c + d)/2)
+            c = mpf_div(Ax, mpf_add(r, ap, wp), wp)
+            d = mpf_div(Ax, mpf_sub(s, am, wp), wp)
+            re = mpf_shift(mpf_add(c, d, wp), -1)
+            re = mpf_mul(b, mpf_sqrt(re, wp), wp)
+            if n == 0:
+                re = mpf_atan(mpf_div(re, a, wp), wp)
+            else:
+                re = mpf_atan(mpf_div(a, re, wp), wp)
+    # to compute alpha + sqrt(alpha**2 - 1), if alpha <= alpha_crossover
+    # replace it with 1 + Am1 + sqrt(Am1*(alpha+1)))
+    # where Am1 = alpha -1
+    # if alpha <= alpha_crossover:
+    if not mpf_sub(alpha_crossover, alpha, wp)[0]:
+        c1 = mpf_div(b2, mpf_add(r, ap, wp), wp)
+        # case a < 1
+        if mpf_neg(am)[0]:
+            # Am1 = (1/2) * (b*b/(r + (a+1)) + b*b/(s + (1-a))
+            c2 = mpf_add(s, am, wp)
+            c2 = mpf_div(b2, c2, wp)
+            Am1 = mpf_shift(mpf_add(c1, c2, wp), -1)
+        else:
+            # Am1 = (1/2) * (b*b/(r + (a+1)) + (s - (1-a)))
+            c2 = mpf_sub(s, am, wp)
+            Am1 = mpf_shift(mpf_add(c1, c2, wp), -1)
+        # im = log(1 + Am1 + sqrt(Am1*(alpha+1)))
+        im = mpf_mul(Am1, mpf_add(alpha, fone, wp), wp)
+        im = mpf_log(mpf_add(fone, mpf_add(Am1, mpf_sqrt(im, wp), wp), wp), wp)
+    else:
+        # im = log(alpha + sqrt(alpha*alpha - 1))
+        im = mpf_sqrt(mpf_sub(mpf_mul(alpha, alpha, wp), fone, wp), wp)
+        im = mpf_log(mpf_add(alpha, im, wp), wp)
+    if asign:  # undo the earlier sign flip of a
+        if n == 0:
+            re = mpf_sub(mpf_pi(wp), re, wp)
+        else:
+            re = mpf_neg(re)
+    if not bsign and n == 0:  # acos: imaginary part is negated when the original b was non-negative
+        im = mpf_neg(im)
+    if bsign and n == 1:  # asin: imaginary part is negated when the original b was negative
+        im = mpf_neg(im)
+    re = normalize(re[0], re[1], re[2], re[3], prec, rnd)  # round from wp down to the requested prec
+    im = normalize(im[0], im[1], im[2], im[3], prec, rnd)
+    return re, im
+
+def mpc_acos(z, prec, rnd=round_fast):
+    return acos_asin(z, prec, rnd, 0)  # n = 0 selects acos in acos_asin
+
+def mpc_asin(z, prec, rnd=round_fast):
+    return acos_asin(z, prec, rnd, 1)  # n = 1 selects asin in acos_asin
+
+def mpc_asinh(z, prec, rnd=round_fast):
+    # asinh(z) = I * asin(-I z)
+    a, b = z
+    a, b =  mpc_asin((b, mpf_neg(a)), prec, rnd)  # (b, -a) is -I*z
+    return mpf_neg(b), a  # multiplying (a + b*I) by I gives (-b + a*I)
+
+def mpc_acosh(z, prec, rnd=round_fast):
+    # acosh(z) = -I * acos(z)   for Im(acos(z)) <= 0
+    #            +I * acos(z)   otherwise
+    a, b = mpc_acos(z, prec, rnd)
+    if b[0] or b == fzero:  # imaginary part of acos(z) is negative or zero
+        return mpf_neg(b), a
+    else:
+        return b, mpf_neg(a)
+
+def mpc_atanh(z, prec, rnd=round_fast):
+    # atanh(z) = (log(1+z)-log(1-z))/2
+    wp = prec + 15
+    a = mpc_add(z, mpc_one, wp)
+    b = mpc_sub(mpc_one, z, wp)
+    a = mpc_log(a, wp)
+    b = mpc_log(b, wp)
+    v = mpc_shift(mpc_sub(a, b, wp), -1)  # NOTE(review): result stays at wp; rnd is never used in this function
+    # Subtraction at infinity gives correct imaginary part but
+    # wrong real part (should be zero)
+    if v[0] == fnan and mpc_is_inf(z):
+        v = (fzero, v[1])
+    return v
+
+def mpc_fibonacci(z, prec, rnd=round_fast):
+    re, im = z
+    if im == fzero:  # zero imaginary part: use the real routine
+        return (mpf_fibonacci(re, prec, rnd), fzero)
+    size = max(abs(re[2]+re[3]), abs(im[2]+im[3]))  # exp+bc of BOTH parts (the original used re twice)
+    wp = prec + size + 20  # working precision grows with the magnitude of the argument
+    a = mpf_phi(wp)
+    b = mpf_add(mpf_shift(a, 1), fnone, wp)  # b = 2*a + fnone (fnone is presumably -1, giving 2*phi - 1)
+    u = mpc_pow((a, fzero), z, wp)  # u = a**z
+    v = mpc_cos_pi(z, wp)
+    v = mpc_div(v, u, wp)  # v = cos(pi*z) / a**z
+    u = mpc_sub(u, v, wp)
+    u = mpc_div_mpf(u, b, prec, rnd)  # (a**z - cos(pi*z)/a**z) / b, rounded to prec
+    return u
+
+def mpf_expj(x, prec, rnd='f'):
+    raise ComplexResult  # always raises: this real-argument entry point computes nothing itself
+
+def mpc_expj(z, prec, rnd='f'):
+    re, im = z  # computes exp(i*z) = exp(-im) * (cos(re) + i*sin(re))
+    if im == fzero:
+        return mpf_cos_sin(re, prec, rnd)
+    if re == fzero:
+        return mpf_exp(mpf_neg(im), prec, rnd), fzero
+    ey = mpf_exp(mpf_neg(im), prec+10)  # exp(-im) with 10 extra bits
+    c, s = mpf_cos_sin(re, prec+10)
+    re = mpf_mul(ey, c, prec, rnd)
+    im = mpf_mul(ey, s, prec, rnd)
+    return re, im
+
+def mpf_expjpi(x, prec, rnd='f'):
+    raise ComplexResult  # always raises: this real-argument entry point computes nothing itself
+
+def mpc_expjpi(z, prec, rnd='f'):
+    re, im = z  # computes exp(i*pi*z) = exp(-pi*im) * (cos(pi*re) + i*sin(pi*re))
+    if im == fzero:
+        return mpf_cos_sin_pi(re, prec, rnd)
+    sign, man, exp, bc = im
+    wp = prec+10
+    if man:
+        wp += max(0, exp+bc)  # extra working bits when exp+bc of im is positive
+    im = mpf_neg(mpf_mul(mpf_pi(wp), im, wp))  # im now holds -pi*im
+    if re == fzero:
+        return mpf_exp(im, prec, rnd), fzero
+    ey = mpf_exp(im, prec+10)
+    c, s = mpf_cos_sin_pi(re, prec+10)
+    re = mpf_mul(ey, c, prec, rnd)
+    im = mpf_mul(ey, s, prec, rnd)
+    return re, im
+
+
+if BACKEND == 'sage':  # with the Sage backend, try to swap in Sage's implementations
+    try:
+        import sage.libs.mpmath.ext_libmp as _lbmp
+        mpc_exp = _lbmp.mpc_exp  # rebinds the module-level mpc_exp defined above
+        mpc_sqrt = _lbmp.mpc_sqrt  # rebinds the module-level mpc_sqrt defined above
+    except (ImportError, AttributeError):
+        print("Warning: Sage imports in libmpc failed")  # best effort: only a warning is printed on failure
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5387556064b641cdadfcdaf50914ebd39eff3f65
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c96a8c60ccaff3dbe94603afb496582f94300b3dd5c8ec016ff0c7e71f975baf
+size 172649
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea50da7beb2187e77f7606dd70faed0e4b4add
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+
+#if !defined(_CUPTI_OPENACC_H_)
+#define _CUPTI_OPENACC_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OpenACC support
+ *
+ * \param profRegister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library
+ */
+CUptiResult CUPTIAPI
+cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENACC_H_*/
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb98c23e5591a45789d7e72a0a4561dce199905a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h
@@ -0,0 +1,10982 @@
+/*
+ * Copyright 2011-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_H_)
+#define _CUPTI_ACTIVITY_H_
+
+#include <cuda.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+#include <cupti_result.h>
+#if defined(CUPTI_DIRECTIVE_SUPPORT)
+#include <Openacc/cupti_openacc.h>
+#include <Openmp/cupti_openmp.h>
+#endif
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#define ACTIVITY_RECORD_ALIGNMENT 8 // alignment value handed to the compiler through PACKED_ALIGNMENT
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
+#define END_PACKED_ALIGNMENT __pragma(pack(pop))
+#elif defined(__GNUC__) // GCC (any non-Windows compiler that defines __GNUC__): packing is done per type via attributes
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT
+#else // all other compilers: all three macros expand to nothing, so no packing or alignment is requested
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif
+
+#define CUPTI_UNIFIED_MEMORY_CPU_DEVICE_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_CONTEXT_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_STREAM_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_CHANNEL_ID ((uint32_t) 0xFFFFFFFFU)
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_ACTIVITY_API CUPTI Activity API
+ * Functions, types, and enums that implement the CUPTI Activity API.
+ * @{
+ */
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_Activity
+ * \see CUpti_ActivityAPI
+ * \see CUpti_ActivityContext
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityDeviceAttribute
+ * \see CUpti_ActivityEvent
+ * \see CUpti_ActivityEventInstance
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityCdpKernel
+ * \see CUpti_ActivityPreemption
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpy5
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemcpyPtoP4
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemset4
+ * \see CUpti_ActivityMetric
+ * \see CUpti_ActivityMetricInstance
+ * \see CUpti_ActivityName
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityMarker2
+ * \see CUpti_ActivityMarkerData
+ * \see CUpti_ActivitySourceLocator
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityGlobalAccess3
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityBranch2
+ * \see CUpti_ActivityOverhead
+ * \see CUpti_ActivityEnvironment
+ * \see CUpti_ActivityInstructionExecution
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityFunction
+ * \see CUpti_ActivityModule
+ * \see CUpti_ActivitySharedAccess
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityPCSampling3
+ * \see CUpti_ActivityPCSamplingRecordInfo
+ * \see CUpti_ActivityCudaEvent
+ * \see CUpti_ActivityStream
+ * \see CUpti_ActivitySynchronization
+ * \see CUpti_ActivityInstructionCorrelation
+ * \see CUpti_ActivityExternalCorrelation
+ * \see CUpti_ActivityUnifiedMemoryCounter2
+ * \see CUpti_ActivityOpenAccData
+ * \see CUpti_ActivityOpenAccLaunch
+ * \see CUpti_ActivityOpenAccOther
+ * \see CUpti_ActivityOpenMp
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ * \see CUpti_ActivityNvLink4
+ * \see CUpti_ActivityMemory
+ * \see CUpti_ActivityPcie
+ */
+typedef enum {
+  /**
+   * The activity record is invalid.
+   */
+  CUPTI_ACTIVITY_KIND_INVALID  = 0,
+  /**
+   * A host<->host, host<->device, or device<->device memory copy. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityMemcpy5.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY   = 1,
+  /**
+   * A memory set executing on the GPU. The corresponding activity
+   * record structure is \ref CUpti_ActivityMemset4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMSET   = 2,
+  /**
+   * A kernel executing on the GPU. This activity kind may significantly change
+   * the overall performance characteristics of the application because all
+   * kernel executions are serialized on the GPU. The alternative kernel kind,
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, doesn't break kernel concurrency.
+   * The corresponding activity record structure is \ref CUpti_ActivityKernel8.
+   */
+  CUPTI_ACTIVITY_KIND_KERNEL   = 3,
+  /**
+   * A CUDA driver API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_DRIVER   = 4,
+  /**
+   * A CUDA runtime API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_RUNTIME  = 5,
+  /**
+   * An event value. The corresponding activity record structure is
+   * \ref CUpti_ActivityEvent.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT    = 6,
+  /**
+   * A metric value. The corresponding activity record structure is
+   * \ref CUpti_ActivityMetric.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC   = 7,
+  /**
+   * Information about a device. The corresponding activity record
+   * structure is \ref CUpti_ActivityDevice4.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE   = 8,
+  /**
+   * Information about a context. The corresponding activity record
+   * structure is \ref CUpti_ActivityContext.
+   */
+  CUPTI_ACTIVITY_KIND_CONTEXT  = 9,
+  /**
+   * A kernel executing on the GPU. This activity kind doesn't break
+   * kernel concurrency. The corresponding activity record structure
+   * is \ref CUpti_ActivityKernel8.
+   */
+  CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL = 10,
+  /**
+   * Resource naming done via NVTX APIs for thread, device, context, etc.
+   * The corresponding activity record structure is \ref CUpti_ActivityName.
+   */
+  CUPTI_ACTIVITY_KIND_NAME     = 11,
+  /**
+   * Instantaneous, start, or end NVTX marker. The corresponding activity
+   * record structure is \ref CUpti_ActivityMarker2.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER = 12,
+  /**
+   * Extended, optional, data about a marker. The corresponding
+   * activity record structure is \ref CUpti_ActivityMarkerData.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER_DATA = 13,
+  /**
+   * Source information about source level result. The corresponding
+   * activity record structure is \ref CUpti_ActivitySourceLocator.
+   */
+  CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR = 14,
+  /**
+   * Results for source-level global access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS = 15,
+  /**
+   * Results for source-level branch. The corresponding
+   * activity record structure is \ref CUpti_ActivityBranch2.
+   */
+  CUPTI_ACTIVITY_KIND_BRANCH = 16,
+  /**
+   * Overhead activity records. The
+   * corresponding activity record structure is
+   * \ref CUpti_ActivityOverhead.
+   */
+  CUPTI_ACTIVITY_KIND_OVERHEAD = 17,
+  /**
+   * A CDP (CUDA Dynamic Parallel) kernel executing on the GPU. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityCdpKernel.  This activity can not be directly
+   * enabled or disabled. It is enabled and disabled through
+   * concurrent kernel activity i.e. _CONCURRENT_KERNEL.
+   */
+  CUPTI_ACTIVITY_KIND_CDP_KERNEL = 18,
+  /**
+   * Preemption activity record indicating a preemption of a CDP (CUDA
+   * Dynamic Parallel) kernel executing on the GPU. The corresponding
+   * activity record structure is \ref CUpti_ActivityPreemption.
+   */
+  CUPTI_ACTIVITY_KIND_PREEMPTION = 19,
+  /**
+   * Environment activity records indicating power, clock, thermal,
+   * etc. levels of the GPU. The corresponding activity record
+   * structure is \ref CUpti_ActivityEnvironment.
+   */
+  CUPTI_ACTIVITY_KIND_ENVIRONMENT = 20,
+  /**
+   * An event value associated with a specific event domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityEventInstance.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT_INSTANCE = 21,
+  /**
+   * A peer to peer memory copy. The corresponding activity record
+   * structure is \ref CUpti_ActivityMemcpyPtoP4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY2 = 22,
+  /**
+   * A metric value associated with a specific metric domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC_INSTANCE = 23,
+  /**
+   * Results for source-level instruction execution.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionExecution.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION = 24,
+  /**
+   * Unified Memory counter record. The corresponding activity
+   * record structure is \ref CUpti_ActivityUnifiedMemoryCounter2.
+   */
+  CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER = 25,
+  /**
+   * Device global/function record. The corresponding activity
+   * record structure is \ref CUpti_ActivityFunction.
+   */
+  CUPTI_ACTIVITY_KIND_FUNCTION = 26,
+  /**
+   * CUDA Module record. The corresponding activity
+   * record structure is \ref CUpti_ActivityModule.
+   */
+  CUPTI_ACTIVITY_KIND_MODULE = 27,
+  /**
+   * A device attribute value. The corresponding activity record
+   * structure is \ref CUpti_ActivityDeviceAttribute.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE   = 28,
+  /**
+   * Results for source-level shared access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_KIND_SHARED_ACCESS = 29,
+  /**
+   * Enable PC sampling for kernels. This will serialize
+   * kernels. The corresponding activity record structure
+   * is \ref CUpti_ActivityPCSampling3.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING = 30,
+  /**
+   * Summary information about PC sampling records. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityPCSamplingRecordInfo.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO = 31,
+  /**
+   * SASS/Source line-by-line correlation record.
+   * This will generate sass/source correlation for functions that have source
+   * level analysis or pc sampling results. The records will be generated only
+   * when either of source level analysis or pc sampling activity is enabled.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionCorrelation.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION = 32,
+  /**
+   * OpenACC data events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccData.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_DATA = 33,
+  /**
+   * OpenACC launch events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccLaunch.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH = 34,
+  /**
+   * OpenACC other events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccOther.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_OTHER = 35,
+  /**
+   * Information about a CUDA event. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityCudaEvent.
+   */
+  CUPTI_ACTIVITY_KIND_CUDA_EVENT = 36,
+  /**
+   * Information about a CUDA stream. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityStream.
+   */
+  CUPTI_ACTIVITY_KIND_STREAM = 37,
+  /**
+   * Records for synchronization management. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySynchronization.
+   */
+  CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = 38,
+  /**
+   * Records for correlation of different programming APIs. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityExternalCorrelation.
+   */
+  CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION = 39,
+  /**
+   * NVLink information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityNvLink4.
+   */
+  CUPTI_ACTIVITY_KIND_NVLINK = 40,
+  /**
+   * Instantaneous Event information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEvent.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT = 41,
+  /**
+   * Instantaneous Event information for a specific event
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEventInstance.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE = 42,
+  /**
+   * Instantaneous Metric information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetric.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC = 43,
+  /**
+   * Instantaneous Metric information for a specific metric
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetricInstance.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE = 44,
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY = 45,
+  /**
+   * PCI devices information used for PCI topology.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityPcie.
+   */
+  CUPTI_ACTIVITY_KIND_PCIE = 46,
+  /**
+   * OpenMP parallel events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenMp.
+   */
+  CUPTI_ACTIVITY_KIND_OPENMP = 47,
+  /**
+   * A CUDA driver kernel launch occurring outside of any
+   * public API function execution.  Tools can handle these
+   * like records for driver API launch functions, although
+   * the cbid field is not used here.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API = 48,
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory3.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY2 = 49,
+
+  /**
+   * Memory pool activity tracking creation, destruction and
+   * trimming of the memory pool.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemoryPool2.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY_POOL = 50,
+
+  /**
+   * The corresponding activity record structure is \ref CUpti_ActivityGraphTrace.
+   */
+  CUPTI_ACTIVITY_KIND_GRAPH_TRACE = 51,
+
+  /**
+   * JIT operation tracking.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityJit.
+   */
+  CUPTI_ACTIVITY_KIND_JIT = 52,
+
+
+  CUPTI_ACTIVITY_KIND_COUNT, /* implicit value 53: one past the last activity kind listed above */
+
+  CUPTI_ACTIVITY_KIND_FORCE_INT     = 0x7fffffff /* sentinel, not a real kind; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityKind;
+
+/**
+ * \brief The kinds of activity objects.
+ * \see CUpti_ActivityObjectKindId
+ */
+typedef enum {
+  /**
+   * The object kind is not known.
+   */
+  CUPTI_ACTIVITY_OBJECT_UNKNOWN  = 0,
+  /**
+   * A process.
+   */
+  CUPTI_ACTIVITY_OBJECT_PROCESS  = 1,
+  /**
+   * A thread.
+   */
+  CUPTI_ACTIVITY_OBJECT_THREAD   = 2,
+  /**
+   * A device.
+   */
+  CUPTI_ACTIVITY_OBJECT_DEVICE   = 3,
+  /**
+   * A context.
+   */
+  CUPTI_ACTIVITY_OBJECT_CONTEXT  = 4,
+  /**
+   * A stream.
+   */
+  CUPTI_ACTIVITY_OBJECT_STREAM   = 5,
+
+  CUPTI_ACTIVITY_OBJECT_FORCE_INT = 0x7fffffff /* sentinel, not a real object kind; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityObjectKind;
+
+/**
+ * \brief Identifiers for object kinds as specified by
+ * CUpti_ActivityObjectKind.
+ * \see CUpti_ActivityObjectKind
+ */
+typedef union {
+  /**
+   * A process object requires that we identify the process ID. A
+   * thread object requires that we identify both the process and
+   * thread ID.
+   */
+  struct {
+    uint32_t processId;
+    uint32_t threadId;
+  } pt; /* process/thread identifiers */
+  /**
+   * A device object requires that we identify the device ID. A
+   * context object requires that we identify both the device and
+   * context ID. A stream object requires that we identify device,
+   * context, and stream ID.
+   */
+  struct {
+    uint32_t deviceId;
+    uint32_t contextId;
+    uint32_t streamId;
+  } dcs; /* device/context/stream identifiers */
+} CUpti_ActivityObjectKindId;
+
+/**
+ * \brief The kinds of activity overhead.
+ */
+typedef enum {
+  /**
+   * The overhead kind is not known.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_UNKNOWN               = 0,
+  /**
+   * Compiler(JIT) overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER       = 1,
+  /**
+   * Activity buffer flush overhead (value 0x10000).
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH    = 1<<16,
+  /**
+   * CUPTI instrumentation overhead (value 0x20000).
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION = 2<<16,
+  /**
+   * CUPTI resource creation and destruction overhead (value 0x30000).
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE        = 3<<16,
+  CUPTI_ACTIVITY_OVERHEAD_FORCE_INT             = 0x7fffffff /* sentinel, not a real overhead kind; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityOverheadKind;
+
+/**
+ * \brief The kind of a compute API.
+ */
+typedef enum {
+  /**
+   * The compute API is not known.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_UNKNOWN    = 0,
+  /**
+   * The compute APIs are for CUDA.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA       = 1,
+  /**
+   * The compute APIs are for CUDA running
+   * in MPS (Multi-Process Service) environment.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS   = 2,
+
+  CUPTI_ACTIVITY_COMPUTE_API_FORCE_INT  = 0x7fffffff /* sentinel, not a real API kind; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityComputeApiKind;
+
+/**
+ * \brief Flags associated with activity records.
+ *
+ * Activity record flags. Flags can be combined by bitwise OR to
+ * associate multiple flags with an activity record. Each flag is specific
+ * to a certain activity kind, as noted below, so bit values are reused.
+ */
+typedef enum {
+  /**
+   * Indicates the activity record has no flags.
+   */
+  CUPTI_ACTIVITY_FLAG_NONE          = 0,
+
+  /**
+   * Indicates the activity represents a device that supports
+   * concurrent kernel execution. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_CONCURRENT_KERNELS  = 1 << 0,
+
+  /**
+   * Indicates if the activity represents a CUdevice_attribute value
+   * or a CUpti_DeviceAttribute value. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an asynchronous memcpy
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an instantaneous marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a region start marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_START  = 1 << 1,
+
+  /**
+   * Indicates the activity represents a region end marker. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_END  = 1 << 2,
+
+  /**
+   * Indicates the activity represents an attempt to acquire a user
+   * defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE = 1 << 3,
+
+  /**
+   * Indicates the activity represents success in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS = 1 << 4,
+
+  /**
+   * Indicates the activity represents failure in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED = 1 << 5,
+
+  /**
+   * Indicates the activity represents releasing a reservation on
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE = 1 << 6,
+
+  /**
+   * Indicates the activity represents a marker that does not specify
+   * a color. Valid for CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_NONE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a marker that specifies a color
+   * in alpha-red-green-blue format. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_ARGB  = 1 << 1,
+
+  /**
+   * The number of bytes requested by each thread.
+   * Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+  /**
+   * If this bit in flag is set, the access was a load, else it is a
+   * store access. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_LOAD       = 1 << 8,
+  /**
+   * If this bit in flag is set, the load access was cached else it is
+   * uncached. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_CACHED     = 1 << 9,
+  /**
+   * If this bit in flag is set, the metric value overflowed. Valid
+   * for CUpti_ActivityMetric and CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_OVERFLOWED     = 1 << 0,
+  /**
+   * If this bit in flag is set, the metric value couldn't be
+   * calculated. This occurs when a value(s) required to calculate the
+   * metric is missing.  Valid for CUpti_ActivityMetric and
+   * CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_VALUE_INVALID  = 1 << 1,
+  /**
+   * If this bit in flag is set, the source level metric value couldn't be
+   * calculated. This occurs when a value(s) required to calculate the
+   * source level metric cannot be evaluated.
+   * Valid for CUpti_ActivityInstructionExecution.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_VALUE_INVALID  = 1 << 0,
+  /**
+   * The mask for the instruction class, \ref CUpti_ActivityInstructionClass.
+   * Valid for CUpti_ActivityInstructionExecution and
+   * CUpti_ActivityInstructionCorrelation.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_CLASS_MASK    = 0xFF << 1,
+  /**
+   * When calling cuptiActivityFlushAll, this flag
+   * can be set to force CUPTI to flush all records in the buffer, whether
+   * finished or not.
+   */
+  CUPTI_ACTIVITY_FLAG_FLUSH_FORCED = 1 << 0,
+
+  /**
+   * The number of bytes requested by each thread.
+   * Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+  /**
+   * If this bit in flag is set, the access was a load, else it is a
+   * store access.  Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * Indicates the activity represents an asynchronous memset
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents thrashing in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THRASHING_IN_CPU = 1 << 0,
+
+  /**
+   * Indicates the activity represents page throttling in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THROTTLING_IN_CPU = 1 << 0,
+
+  CUPTI_ACTIVITY_FLAG_FORCE_INT = 0x7fffffff /* sentinel, not a real flag; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityFlag;
+
+/**
+ * \brief The stall reason for PC sampling activity.
+ */
+typedef enum {
+  /**
+   * Invalid reason.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID      = 0,
+  /**
+   * No stall, instruction is selected for issue.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE         = 1,
+  /**
+   * Warp is blocked because next instruction is not yet available,
+   * because of instruction cache miss, or because of branching effects.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH   = 2,
+  /**
+   * Instruction is waiting on an arithmetic dependency.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY   = 3,
+  /**
+   * Warp is blocked because it is waiting for a memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY   = 4,
+  /**
+   * Texture sub-system is fully utilized or has too many outstanding requests.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE   = 5,
+  /**
+   * Warp is blocked as it is waiting at __syncthreads() or at memory barrier.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC   = 6,
+  /**
+   * Warp is blocked waiting for __constant__ memory and immediate memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY   = 7,
+  /**
+   * Compute operation cannot be performed due to the required resources not
+   * being available.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY   = 8,
+  /**
+   * Warp is blocked because there are too many pending memory operations.
+   * In Kepler architecture it often indicates high number of memory replays.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE   = 9,
+  /**
+   * Warp was ready to issue, but some other warp issued instead.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED   = 10,
+  /**
+   * Miscellaneous reasons.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER   = 11,
+  /**
+   * Sleeping.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING   = 12,
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_FORCE_INT  = 0x7fffffff /* sentinel, not a real stall reason; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityPCSamplingStallReason;
+
+/**
+ * \brief Sampling period for PC sampling method
+ *
+ * Sampling period can be set using \ref cuptiActivityConfigurePCSampling
+ */
+typedef enum {
+  /**
+   * The PC sampling period is not set.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_INVALID = 0,
+  /**
+   * Minimum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN = 1,
+  /**
+   * Sampling period in lower range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_LOW = 2,
+  /**
+   * Medium sampling period.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MID = 3,
+  /**
+   * Sampling period in higher range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_HIGH = 4,
+  /**
+   * Maximum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MAX = 5,
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_FORCE_INT = 0x7fffffff /* sentinel, not a real period; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityPCSamplingPeriod;
+
+/**
+ * \brief The kind of a memory copy, indicating the source and
+ * destination targets of the copy.
+ *
+ * Each kind represents the source and destination targets of a memory
+ * copy. Targets are host, device, and array.
+ */
+typedef enum {
+  /**
+   * The memory copy kind is not known.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN = 0,
+  /**
+   * A host to device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOD    = 1,
+  /**
+   * A device to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOH    = 2,
+  /**
+   * A host to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOA    = 3,
+  /**
+   * A device array to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOH    = 4,
+  /**
+   * A device array to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOA    = 5,
+  /**
+   * A device array to device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOD    = 6,
+  /**
+   * A device to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOA    = 7,
+  /**
+   * A device to device memory copy on the same device.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOD    = 8,
+  /**
+   * A host to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOH    = 9,
+  /**
+   * A peer to peer memory copy across different devices.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_PTOP    = 10,
+
+  CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT = 0x7fffffff /* sentinel, not a real copy kind; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityMemcpyKind;
+
+/**
+ * \brief The kinds of memory accessed by a memory operation/copy.
+ *
+ * Each kind represents the type of the memory
+ * accessed by a memory operation/copy.
+ */
+typedef enum {
+  /**
+   * The memory kind is unknown.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN            = 0,
+  /**
+   * The memory is pageable.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE           = 1,
+  /**
+   * The memory is pinned.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PINNED             = 2,
+  /**
+   * The memory is on the device.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE             = 3,
+  /**
+   * The memory is an array.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_ARRAY              = 4,
+  /**
+   * The memory is managed.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED            = 5,
+  /**
+   * The memory is device static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC      = 6,
+  /**
+   * The memory is managed static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC     = 7,
+  CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT          = 0x7fffffff /* sentinel, not a real memory kind; presumably forces a 32-bit enum -- confirm */
+} CUpti_ActivityMemoryKind;
+
+/**
+ * \brief The kind of a preemption activity.
+ */
+typedef enum {
+  /**
+   * The preemption kind is not known.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_UNKNOWN    = 0,
+  /**
+   * Preemption to save a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_SAVE       = 1,
+  /**
+   * Preemption to restore a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_RESTORE    = 2,
+  CUPTI_ACTIVITY_PREEMPTION_KIND_FORCE_INT  = 0x7fffffff /* == INT32_MAX; presumably pins the enum to int width. */
+} CUpti_ActivityPreemptionKind;
+
+/**
+ * \brief The kind of environment data. Used to indicate what type of
+ * data is being reported by an environment activity record.
+ */
+typedef enum {
+  /**
+   * Unknown data.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_UNKNOWN = 0,
+  /**
+   * The environment data is related to speed.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_SPEED = 1,
+  /**
+   * The environment data is related to temperature.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE = 2,
+  /**
+   * The environment data is related to power.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_POWER = 3,
+  /**
+   * The environment data is related to cooling.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_COOLING = 4,
+
+  CUPTI_ACTIVITY_ENVIRONMENT_COUNT, /* Implicitly 5: one past the last kind above. */
+  CUPTI_ACTIVITY_ENVIRONMENT_KIND_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityEnvironmentKind;
+
+/**
+ * \brief Reasons for clock throttling.
+ *
+ * The possible reasons that a clock can be throttled. There can be
+ * more than one reason that a clock is being throttled, so these values
+ * can be combined by bitwise OR. These are used in the
+ * clocksThrottleReason field in the Environment Activity Record.
+ */
+typedef enum {
+  /**
+   * Nothing is running on the GPU and the clocks are dropping to idle
+   * state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_GPU_IDLE              = 0x00000001,
+  /**
+   * The GPU clocks are limited by a user specified limit.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS   = 0x00000002,
+  /**
+   * A software power scaling algorithm is reducing the clocks below
+   * requested clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_SW_POWER_CAP          = 0x00000004,
+  /**
+   * Hardware slowdown to reduce the clock by a factor of two or more
+   * is engaged.  This is an indicator of one of the following: 1)
+   * Temperature is too high, 2) External power brake assertion is
+   * being triggered (e.g. by the system power supply), 3) Change in
+   * power state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN           = 0x00000008,
+  /**
+   * Some unspecified factor is reducing the clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNKNOWN               = 0x80000000, /* NOTE(review): bit 31 is above FORCE_INT (0x7fffffff) — confirm enum width/signedness. */
+  /**
+   * Throttle reason is not supported for this GPU.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNSUPPORTED           = 0x40000000,
+  /**
+   * No clock throttling (no reason bits set).
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_NONE                  = 0x00000000,
+
+  CUPTI_CLOCKS_THROTTLE_REASON_FORCE_INT             = 0x7fffffff
+} CUpti_EnvironmentClocksThrottleReason;
+
+/**
+ * \brief Scope of the unified memory counter (deprecated in CUDA 7.0).
+ */
+typedef enum {
+  /**
+   * The unified memory counter scope is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_UNKNOWN = 0,
+  /**
+   * Collect the unified memory counter for a single process on one device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE = 1,
+  /**
+   * Collect the unified memory counter for a single process across all devices.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_ALL_DEVICES = 2,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_COUNT, /* Implicitly 3: one past the last scope above. */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterScope;
+
+/**
+ * \brief Kind of the Unified Memory counter.
+ *
+ * Many activities are associated with the Unified Memory mechanism; among them
+ * are transfers from host to device and device to host, and page faults at the
+ * host side.
+ */
+typedef enum {
+  /**
+   * The unified memory counter kind is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_UNKNOWN = 0,
+  /**
+   * Number of bytes transferred from host to device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD = 1,
+  /**
+   * Number of bytes transferred from device to host.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH = 2,
+  /**
+   * Number of CPU page faults. This is only supported on 64 bit
+   * Linux and Mac platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT = 3,
+  /**
+   * Number of GPU page faults. This is only supported on devices with
+   * compute capability 6.0 and higher and 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT = 4,
+  /**
+   * Thrashing occurs when data is frequently accessed by
+   * multiple processors and has to be constantly migrated around
+   * to achieve data locality. In this case the overhead of migration
+   * may exceed the benefits of locality.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING = 5,
+  /**
+   * Throttling is a prevention technique used by the driver to avoid
+   * further thrashing. Here, the driver doesn't service the fault for
+   * one of the contending processors for a specific period of time,
+   * so that the other processor can run at full-speed.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING = 6,
+  /**
+   * In case throttling does not help, the driver tries to pin the memory
+   * to a processor for a specific period of time. One of the contending
+   * processors will have slow access to the memory, while the other will
+   * have fast access.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP = 7,
+
+  /**
+   * Number of bytes transferred from one device to another device.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD = 8,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT, /* Implicitly 9: one past the last kind above. */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterKind;
+
+/**
+ * \brief Memory access type for unified memory page faults.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+ * and \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+ */
+typedef enum {
+    /**
+     * The unified memory access type is not known.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_UNKNOWN = 0,
+    /**
+     * The page fault was triggered by a read memory instruction.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_READ = 1,
+    /**
+     * The page fault was triggered by a write memory instruction.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_WRITE = 2,
+    /**
+     * The page fault was triggered by an atomic memory instruction.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_ATOMIC = 3,
+    /**
+     * The page fault was triggered by a memory prefetch operation.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_PREFETCH = 4
+} CUpti_ActivityUnifiedMemoryAccessType;
+
+/**
+ * \brief Migration cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+ * \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH.
+ */
+typedef enum {
+    /**
+     * The unified memory migration cause is not known.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_UNKNOWN = 0,
+    /**
+     * The unified memory migrated due to an explicit call from
+     * the user, e.g. cudaMemPrefetchAsync.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_USER = 1,
+    /**
+     * The unified memory migrated to guarantee data coherence,
+     * e.g. CPU/GPU faults on Pascal+ and kernel launch on pre-Pascal GPUs.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_COHERENCE = 2,
+    /**
+     * The unified memory was speculatively migrated by the UVM driver
+     * before being accessed by the destination processor to improve
+     * performance.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_PREFETCH = 3,
+    /**
+     * The unified memory migrated to the CPU because it was evicted to make
+     * room for another block of memory on the GPU.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_EVICTION = 4,
+    /**
+     * The unified memory migrated to another processor because of access counter
+     * notifications. Only frequently accessed pages are migrated between CPU and GPU, or
+     * between peer GPUs.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_ACCESS_COUNTERS = 5,
+} CUpti_ActivityUnifiedMemoryMigrationCause;
+
+/**
+ * \brief Remote memory map cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+ */
+typedef enum {
+    /**
+     * The cause of mapping to remote memory is unknown.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_UNKNOWN = 0,
+    /**
+     * Mapping to remote memory was added to maintain data coherence.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_COHERENCE = 1,
+    /**
+     * Mapping to remote memory was added to prevent further thrashing.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_THRASHING = 2,
+    /**
+     * Mapping to remote memory was added to enforce the hints
+     * specified by the programmer or by performance heuristics of the
+     * UVM driver.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_POLICY = 3,
+    /**
+     * Mapping to remote memory was added because there is no more
+     * memory available on the processor and eviction was not
+     * possible.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_OUT_OF_MEMORY = 4,
+    /**
+     * Mapping to remote memory was added after the memory was
+     * evicted to make room for another block of memory on the GPU.
+     */
+    CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_EVICTION = 5,
+} CUpti_ActivityUnifiedMemoryRemoteMapCause;
+
+/**
+ * \brief SASS instruction classification.
+ *
+ * SASS instructions are broadly divided into different classes. Each enumerator represents one class.
+ */
+typedef enum {
+  /**
+   * The instruction class is not known.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNKNOWN = 0,
+  /**
+   * Represents a 32 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_32 = 1,
+  /**
+   * Represents a 64 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_64 = 2,
+  /**
+   * Represents an integer operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTEGER = 3,
+  /**
+   * Represents a bit conversion operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BIT_CONVERSION = 4,
+  /**
+   * Represents a control flow instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONTROL_FLOW = 5,
+  /**
+   * Represents a global load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL = 6,
+  /**
+   * Represents a shared load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED = 7,
+  /**
+   * Represents a local load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_LOCAL = 8,
+  /**
+   * Represents a generic load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GENERIC = 9,
+  /**
+   * Represents a surface load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE = 10,
+  /**
+   * Represents a constant load instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONSTANT = 11,
+  /**
+   * Represents a texture load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_TEXTURE = 12,
+  /**
+   * Represents a global atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL_ATOMIC = 13,
+  /**
+   * Represents a shared atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED_ATOMIC = 14,
+  /**
+   * Represents a surface atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE_ATOMIC = 15,
+  /**
+   * Represents an inter-thread communication instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTER_THREAD_COMMUNICATION = 16,
+  /**
+   * Represents a barrier instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BARRIER = 17,
+  /**
+   * Represents miscellaneous instructions which do not fit in the above classification.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_MISCELLANEOUS = 18,
+  /**
+   * Represents a 16 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_16 = 19,
+
+  /**
+   * Represents a uniform instruction.
+   */
+
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNIFORM = 20,
+
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_KIND_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityInstructionClass;
+
+/**
+ * \brief Partitioned global caching option.
+ */
+typedef enum {
+  /**
+   * Partitioned global cache config unknown.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN = 0,
+  /**
+   * Partitioned global cache not supported.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED = 1,
+  /**
+   * Partitioned global cache config off.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF = 2,
+  /**
+   * Partitioned global cache config on.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON = 3,
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_FORCE_INT  = 0x7fffffff /* == INT32_MAX; presumably pins the enum to int width. */
+} CUpti_ActivityPartitionedGlobalCacheConfig;
+
+/**
+ * \brief Synchronization type.
+ *
+ * The types of synchronization to be used with CUpti_ActivitySynchronization.
+ */
+
+typedef enum {
+  /**
+   * The synchronization type is unknown.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_UNKNOWN = 0,
+  /**
+   * Event synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE = 1,
+  /**
+   * Stream wait event API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT = 2,
+  /**
+   * Stream synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE = 3,
+  /**
+   * Context synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE = 4,
+
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_FORCE_INT     = 0x7fffffff /* == INT32_MAX; presumably pins the enum to int width. */
+} CUpti_ActivitySynchronizationType;
+
+/**
+ * \brief Stream type.
+ *
+ * The types of stream to be used with CUpti_ActivityStream.
+ */
+
+typedef enum {
+  /**
+   * The stream type is unknown.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_UNKNOWN = 0,
+  /**
+   * Default stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_DEFAULT = 1,
+  /**
+   * Non-blocking stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NON_BLOCKING = 2,
+  /**
+   * Null stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NULL = 3,
+  /**
+   * Stream create mask (the low 16 bits).
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_MASK = 0xFFFF,
+
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_FORCE_INT = 0x7fffffff
+} CUpti_ActivityStreamFlag;
+
+/**
+ * \brief Link flags.
+ *
+ * Describes link properties, to be used with CUpti_ActivityNvLink.
+ */
+
+typedef enum {
+  CUPTI_LINK_FLAG_INVALID = 0, /* No flag bits set. */
+  /**
+   * Peer to peer access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ACCESS = (1 << 1), /* Bit 1; bit 0 is not assigned in this enum. */
+  /**
+   * System memory access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ACCESS = (1 << 2),
+  /**
+   * Peer atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ATOMICS = (1 << 3),
+  /**
+   * System memory atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ATOMICS = (1 << 4),
+
+  CUPTI_LINK_FLAG_FORCE_INT = 0x7fffffff
+} CUpti_LinkFlag;
+
+/**
+ * \brief Memory operation types.
+ *
+ * Describes the type of memory operation, to be used with CUpti_ActivityMemory3.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_INVALID = 0,
+  /**
+   * Memory is allocated.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION = 1,
+  /**
+   * Memory is released.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE = 2,
+
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryOperationType;
+
+/**
+ * \brief Memory pool types.
+ *
+ * Describes the type of memory pool, to be used with CUpti_ActivityMemory3.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_INVALID = 0,
+  /**
+   * Memory pool is local to the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL = 1,
+  /**
+   * Memory pool is imported by the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED = 2,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolType;
+
+/**
+ * \brief Memory pool operation types.
+ *
+ * Describes the type of memory pool operation, to be used with CUpti_ActivityMemoryPool2.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_INVALID = 0,
+  /**
+   * Memory pool is created.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_CREATED = 1,
+  /**
+   * Memory pool is destroyed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_DESTROYED = 2,
+  /**
+   * Memory pool is trimmed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED = 3,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolOperationType;
+
+typedef enum { /* Channel type; this is the type of CUpti_ActivityMemcpy5::channelType. */
+  CUPTI_CHANNEL_TYPE_INVALID = 0,
+  CUPTI_CHANNEL_TYPE_COMPUTE = 1,      /* presumably a compute channel — confirm against CUPTI docs */
+  CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY = 2  /* presumably an asynchronous memcpy channel — confirm */
+} CUpti_ChannelType;
+
+/**
+ * The source-locator ID that indicates an unknown source
+ * location. There is not an actual CUpti_ActivitySourceLocator object
+ * corresponding to this value.
+ */
+#define CUPTI_SOURCE_LOCATOR_ID_UNKNOWN 0
+
+/**
+ * An invalid function index ID.
+ */
+#define CUPTI_FUNCTION_INDEX_ID_INVALID 0
+
+/**
+ * An invalid/unknown correlation ID. A correlation ID of this value
+ * indicates that there is no correlation for the activity record.
+ */
+#define CUPTI_CORRELATION_ID_UNKNOWN 0
+
+/**
+ * An invalid/unknown grid ID.
+ */
+#define CUPTI_GRID_ID_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown timestamp for a start, end, queued, submitted,
+ * or completed time.
+ */
+#define CUPTI_TIMESTAMP_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown value. Expands to a bare (unparenthesized) -1.
+ */
+#define CUPTI_SYNCHRONIZATION_INVALID_VALUE -1
+
+/**
+ * An invalid/unknown process id.
+ */
+#define CUPTI_AUTO_BOOST_INVALID_CLIENT_PID 0
+
+/**
+ * Invalid/unknown NVLink port number. Expands to a bare (unparenthesized) -1.
+ */
+#define CUPTI_NVLINK_INVALID_PORT -1
+
+/**
+ * Maximum number of NVLink ports.
+ */
+#define CUPTI_MAX_NVLINK_PORTS 32
+
+START_PACKED_ALIGNMENT
+/**
+ * \brief Unified Memory counters configuration structure
+ *
+ * This structure controls the enable/disable of the various
+ * Unified Memory counters consisting of scope, kind and other parameters.
+ * See function \ref cuptiActivityConfigureUnifiedMemoryCounter
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Unified Memory counter scope. (deprecated in CUDA 7.0)
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind kind;
+
+  /**
+   * Device id of the target device. This is relevant only
+   * for single device scopes. (deprecated in CUDA 7.0)
+   */
+  uint32_t deviceId;
+
+  /**
+   * Control to enable/disable the counter. To enable the counter,
+   * set it to a non-zero value; zero disables the counter.
+   */
+  uint32_t enable;
+} CUpti_ActivityUnifiedMemoryCounterConfig;
+
+/**
+ * \brief Device auto boost state structure
+ *
+ * This structure defines auto boost state for a device.
+ * See function \ref cuptiGetAutoBoostState
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Returned auto boost state. 1 is returned if auto boost is enabled, 0
+   * otherwise.
+   */
+  uint32_t enabled;
+
+  /**
+   * Id of the process that has set the current boost state. The value will be
+   * CUPTI_AUTO_BOOST_INVALID_CLIENT_PID if the user does not have
+   * permission to query process ids or there is an error in querying the
+   * process id.
+   */
+  uint32_t pid;
+
+} CUpti_ActivityAutoBoostState;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure defines the pc sampling configuration.
+ *
+ * See function \ref cuptiActivityConfigurePCSampling
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Size of configuration structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  uint32_t size;
+  /**
+   * There are 5 levels provided for the sampling period. Each level
+   * internally maps to a period in terms of cycles. The same level can
+   * map to a different number of cycles on different GPUs. The number of
+   * cycles will be chosen to minimize information loss. The period
+   * chosen will be given by samplingPeriodInCycles in
+   * \ref CUpti_ActivityPCSamplingRecordInfo for each kernel instance.
+   */
+  CUpti_ActivityPCSamplingPeriod samplingPeriod;
+
+  /**
+   * This will override the period set by samplingPeriod. A value of 0 in samplingPeriod2
+   * means that samplingPeriod2 is not used and samplingPeriod is used instead.
+   * Valid values for samplingPeriod2 are between 5 and 31, both inclusive.
+   * This will set the sampling period to (2^samplingPeriod2) cycles.
+   */
+  uint32_t samplingPeriod2;
+} CUpti_ActivityPCSamplingConfig;
+
+/**
+ * \brief The base activity record.
+ *
+ * The activity API uses a CUpti_Activity as a generic representation
+ * for any activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_Activity object can
+ * be cast to the specific activity record type appropriate for that kind.
+ *
+ * Note that all activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity; selects the specific record type to cast to.
+   */
+  CUpti_ActivityKind kind;
+} CUpti_Activity;
+
+/**
+ * \brief The activity record for memory copies. (deprecated)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpy;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpy3;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpy4;
+
+/**
+ * \brief The activity record for memory copies.
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t pad2;
+
+} CUpti_ActivityMemcpy5;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
+ * by CUPTI. Peer-to-peer memory copy activities are now reported using the
+ * CUpti_ActivityMemcpyPtoP2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+  * The ID of the device where the memory copy is occurring.
+  */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpyPtoP;
+
+typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+  * The ID of the device where the memory copy is occurring.
+  */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpyPtoP2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+  * The ID of the device where the memory copy is occurring.
+  */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpyPtoP3;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+  * The ID of the device where the memory copy is occurring.
+  */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the HW channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityMemcpyPtoP4;
+
+/**
+ * \brief The activity record for memset. (deprecated)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemset;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemset2;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemset3;
+
+/**
+ * \brief The activity record for memset.
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory set is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the HW channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad2;
+
+} CUpti_ActivityMemset4;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY).
+ * This activity record provides a single record for the memory
+ * allocation and memory release operations.
+ *
+ * Note: It is recommended to move to the new activity record \ref CUpti_ActivityMemory3
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY2.
+ * \ref CUpti_ActivityMemory3 provides separate records for memory
+ * allocation and memory release operations. This allows correlating the
+ * corresponding driver and runtime API activity record with the memory operation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory kind requested by the user. \see CUpti_ActivityMemoryKind
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, i.e.
+   * the time when memory was allocated, in ns.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory operation, i.e.
+   * the time when memory was freed, in ns.
+   * This will be 0 if memory is not freed in the application.
+   */
+  uint64_t end;
+
+  /**
+   * The program counter of the allocation of memory.
+   */
+  uint64_t allocPC;
+
+  /**
+   * The program counter of the freeing of memory. This will
+   * be 0 if memory is not freed in the application.
+   */
+  uint64_t freePC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory allocation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+} CUpti_ActivityMemory;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use.
+     */
+    uint32_t pad2;
+#endif
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    union {
+      /**
+       * The size of the memory pool in bytes.
+       * \p size is valid if \p memoryPoolType is
+       * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+       */
+      uint64_t size;
+      /**
+       * The processId of the memory pool.
+       * \p processId is valid if \p memoryPoolType is
+       * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+       */
+      uint64_t processId;
+    } pool;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory2;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct PACKED_ALIGNMENT {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use.
+     */
+    uint32_t pad2;
+#endif
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    union {
+      /**
+       * The size of the memory pool in bytes.
+       * \p size is valid if \p memoryPoolType is
+       * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+       */
+      uint64_t size;
+      /**
+       * The processId of the memory pool.
+       * \p processId is valid if \p memoryPoolType is
+       * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+       */
+      uint64_t processId;
+    } pool;
+
+    /**
+     * The utilized size of the memory pool. \p utilizedSize is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t utilizedSize;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory3;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory pool operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory pool operation, in ns.
+   */
+  uint64_t timestamp;
+} CUpti_ActivityMemoryPool;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ * Compared to CUpti_ActivityMemoryPool, this record appends \p utilizedSize.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory pool operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory pool operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The utilized size of the memory pool. \p utilizedSize is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t utilizedSize;
+} CUpti_ActivityMemoryPool2;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel8 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
+   * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration requested by the kernel. The value is one
+   * of the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigRequested;
+
+  /**
+   * The cache configuration used for the kernel. The value is one of
+   * the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigExecuted;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the kernel. Each kernel execution
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t runtimeCorrelationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel8 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both; /**< Raw byte overlaying the \p requested and \p executed bit-fields. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel2;
+
+/**
+ * \brief The activity record for a kernel (CUDA 6.5 (with sm_52 support) onwards).
+ * (deprecated in CUDA 9.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel8 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both; /**< Raw byte overlaying the \p requested and \p executed bit-fields. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel3;
+
+/**
+ * \brief The type of the CUDA kernel launch.
+ */
+typedef enum {
+  /**
+   * The kernel was launched via a regular kernel call.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR = 0,
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernel() or
+   * \ref cuLaunchCooperativeKernel().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE = 1,
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernelMultiDevice() or
+   * \ref cuLaunchCooperativeKernelMultiDevice().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE = 2
+} CUpti_ActivityLaunchType;
+
+/**
+ * \brief The activity record for a kernel (CUDA 9.0 (with sm_70 support) onwards).
+ * (deprecated in CUDA 11.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel8 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both; /**< Raw byte overlaying the \p requested and \p executed bit-fields. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+} CUpti_ActivityKernel4;
+
+/**
+ * \brief The shared memory limit per block config for a kernel.
+ * This should be used to set the 'cudaOccFuncShmemConfig' field in the occupancy calculator API.
+ */
+typedef enum  {
+    /* The shared memory limit config is default. */
+    CUPTI_FUNC_SHMEM_LIMIT_DEFAULT              = 0x00,
+    /* User has opted for a higher dynamic shared memory limit using function attribute
+       'cudaFuncAttributeMaxDynamicSharedMemorySize' for runtime API or
+       CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES for driver API. */
+    CUPTI_FUNC_SHMEM_LIMIT_OPTIN                = 0x01,
+    CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT            = 0x7fffffff /* NOTE(review): presumably a sentinel forcing int-sized storage -- confirm */
+} CUpti_FuncShmemLimitConfig;
+
+/**
+ * \brief The activity record for a kernel (CUDA 11.0 (with sm_80 support) onwards).
+ * (deprecated in CUDA 11.2)
+ *
+ * This activity record represents a kernel execution (CUPTI_ACTIVITY_KIND_KERNEL
+ * and CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel8 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both; /**< Raw byte overlaying the \p requested and \p executed bit-fields. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+} CUpti_ActivityKernel5;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel8 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+} CUpti_ActivityKernel6;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.8)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel8 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+} CUpti_ActivityKernel7;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2 instead.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint64_t localMemoryTotal_v2;
+} CUpti_ActivityKernel8;
+
+/**
+ * \brief The activity record for CDP (CUDA Dynamic Parallelism)
+ * kernel.
+ *
+ * This activity record represents a CDP kernel execution.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CDP_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel execution
+   * is assigned a unique grid ID.
+   */
+  int64_t gridId;
+
+  /**
+   * The grid ID of the parent kernel.
+   */
+  int64_t parentGridId;
+
+  /**
+   * The timestamp when the kernel is queued up, in ns. A value of
+   * CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time is
+   * unknown.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the kernel is submitted to the GPU, in ns. A
+   * value of CUPTI_TIMESTAMP_UNKNOWN indicates that the submission
+   * time is unknown.
+   */
+  uint64_t submitted;
+
+  /**
+   * The timestamp when the kernel is marked as completed, in ns. A
+   * value of CUPTI_TIMESTAMP_UNKNOWN indicates that the completion
+   * time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The X-dimension of the parent block.
+   */
+  uint32_t parentBlockX;
+
+  /**
+   * The Y-dimension of the parent block.
+   */
+  uint32_t parentBlockY;
+
+  /**
+   * The Z-dimension of the parent block.
+   */
+  uint32_t parentBlockZ;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityCdpKernel;
+
+/**
+ * \brief The activity record for a preemption of a CDP kernel.
+ *
+ * This activity record represents a preemption of a CDP kernel.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PREEMPTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the preemption.
+   */
+  CUpti_ActivityPreemptionKind preemptionKind;
+
+  /**
+   * The timestamp of the preemption, in ns. A value of 0 indicates
+   * that timestamp information could not be collected for the
+   * preemption.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The grid ID of the block that is preempted.
+   */
+  int64_t gridId;
+
+  /**
+   * The X-dimension of the block that is preempted.
+   */
+  uint32_t blockX;
+
+  /**
+   * The Y-dimension of the block that is preempted.
+   */
+  uint32_t blockY;
+
+  /**
+   * The Z-dimension of the block that is preempted.
+   */
+  uint32_t blockZ;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityPreemption;
+
+/**
+ * \brief The activity record for a driver or runtime API invocation.
+ *
+ * This activity record represents an invocation of a driver or
+ * runtime API (CUPTI_ACTIVITY_KIND_DRIVER, CUPTI_ACTIVITY_KIND_RUNTIME,
+ * or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DRIVER,
+   * CUPTI_ACTIVITY_KIND_RUNTIME, or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the driver or runtime function.
+   */
+  CUpti_CallbackId cbid;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the driver or runtime CUDA function
+   * is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the driver or runtime CUDA function is
+   * executing.
+   */
+  uint32_t threadId;
+
+  /**
+   * The correlation ID of the driver or runtime CUDA function. Each
+   * function invocation is assigned a unique correlation ID that is
+   * identical to the correlation ID in the memcpy, memset, or kernel
+   * activity record that is associated with this function.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The return value for the function. For a CUDA driver function
+   * this will be a CUresult value, and for a CUDA runtime function
+   * this will be a cudaError_t value.
+   */
+  uint32_t returnValue;
+} CUpti_ActivityAPI;
+
+/**
+ * \brief The activity record for a CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_EVENT). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profiling frameworks built on top of CUPTI that collect
+ * event data may choose to use this type to store the collected event
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The event domain ID.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityEvent;
+
+/**
+ * \brief The activity record for a CUPTI event with instance
+ * information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE). This activity record kind is
+ * not produced by the activity API but is included for completeness
+ * and ease-of-use. Profiling frameworks built on top of CUPTI that
+ * collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event domain ID.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The event domain instance.
+   */
+  uint32_t instance;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityEventInstance;
+
+/**
+ * \brief The activity record for a CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_METRIC). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profiling frameworks built on top of CUPTI that collect
+ * metric data may choose to use this type to store the collected metric
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityMetric;
+
+/**
+ * \brief The activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE). This activity record kind
+ * is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profiling frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The metric domain instance.
+   */
+  uint32_t instance;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityMetricInstance;
+
+/**
+ * \brief The activity record for a source locator.
+ *
+ * This activity record represents a source locator
+ * (CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for the source path, will be used in all the source level
+   * results.
+   */
+  uint32_t id;
+
+  /**
+   * The line number in the source.
+   */
+  uint32_t lineNumber;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The path for the file.
+   */
+  const char *fileName;
+} CUpti_ActivitySourceLocator;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Incremented each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to the L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+} CUpti_ActivityGlobalAccess;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated in CUDA 9.0)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to the L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityGlobalAccess2;
+
+/**
+ * \brief The activity record for source-level global
+ * access.
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint64_t pcOffset;
+
+  /**
+   * Incremented each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code
+   * evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to the L2 cache generated by this
+   * access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+} CUpti_ActivityGlobalAccess3;
+
+/**
+ * \brief The activity record for source level result
+ * branch. (deprecated)
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ * Branch activities are now reported using the
+ * CUpti_ActivityBranch2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Incremented each time this instruction is executed, by the number
+   * of threads that executed it.
+   */
+  uint64_t threadsExecuted;
+} CUpti_ActivityBranch;
+
+/**
+ * \brief The activity record for source level result
+ * branch.
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Incremented each time this instruction is executed, by the number
+   * of threads that executed it.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityBranch2;
+
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice2;
+
+/**
+ * \brief The activity record for a device. (CUDA 7.0 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set
+   * device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  uint8_t reserved[7];
+} CUpti_ActivityDevice3;
+
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set
+   * device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU Instance ID for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t  gpuInstanceId;
+
+  /**
+   * Compute Instance ID for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t  computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid    migUuid;
+
+} CUpti_ActivityDevice4;
+
+
+/**
+ * \brief The activity record for a device attribute.
+ *
+ * This activity record represents information about a GPU device:
+ * either a CUpti_DeviceAttribute or CUdevice_attribute value
+ * (CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID of the device that this attribute applies to.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The attribute, either a CUpti_DeviceAttribute or
+   * CUdevice_attribute. Flag
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is used to indicate
+   * what kind of attribute this is. If
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is 1 then the
+   * CUdevice_attribute field 'cu' is valid, otherwise the
+   * CUpti_DeviceAttribute field 'cupti' is valid.
+   */
+  union {
+    CUdevice_attribute cu;
+    CUpti_DeviceAttribute cupti;
+  } attribute;
+
+  /**
+   * The value for the attribute. See CUpti_DeviceAttribute and
+   * CUdevice_attribute for the type of the value for a given
+   * attribute.
+   */
+  union {
+    double vDouble;
+    uint32_t vUint32;
+    uint64_t vUint64;
+    int32_t vInt32;
+    int64_t vInt64;
+  } value;
+} CUpti_ActivityDeviceAttribute;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+
+} CUpti_ActivityContext;
+
+/**
+ * \brief The activity record providing a name.
+ *
+ * This activity record provides a name for a device, context, thread,
+ * etc., and for other resource naming done via NVTX APIs
+ * (CUPTI_ACTIVITY_KIND_NAME).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NAME.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of activity object being named.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name.
+   */
+  const char *name;
+
+} CUpti_ActivityName;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time. (deprecated in CUDA 8.0)
+ *
+ * The marker is specified with a descriptive name and unique id
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ * Marker activity is now reported using the
+ * CUpti_ActivityMarker2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+} CUpti_ActivityMarker;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time.
+ *
+ * The marker is specified with a descriptive name and unique ID
+ * (CUPTI_ACTIVITY_KIND_MARKER). Supersedes CUpti_ActivityMarker.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * Undefined. Reserved for internal use. Unlike CUpti_ActivityMarker, always present.
+   */
+  uint32_t pad;
+
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+  /**
+   * The name of the domain to which this marker belongs.
+   * This will be NULL for the default domain.
+   */
+  const char *domain;
+
+} CUpti_ActivityMarker2;
+
+/**
+ * \brief The activity record providing detailed information for a marker.
+ *
+ * The marker data contains color, payload, and category
+ * (CUPTI_ACTIVITY_KIND_MARKER_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * Defines the payload format for the value stored in 'payload'.
+   */
+  CUpti_MetricValueKind payloadKind;
+
+  /**
+   * The payload value; 'payloadKind' defines its format.
+   */
+  CUpti_MetricValue payload;
+
+  /**
+   * The color for the marker.
+   */
+  uint32_t color;
+
+  /**
+   * The category for the marker.
+   */
+  uint32_t category;
+
+} CUpti_ActivityMarkerData;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+} CUpti_ActivityOverhead;
+
+/**
+ * \brief The activity record for CUPTI environmental data.
+ *
+ * This activity record provides CUPTI environmental data, including
+ * power, clocks, and thermals.  This information is sampled at
+ * various rates and returned in this activity record.  The consumer
+ * of the record needs to check the environmentKind field to determine
+ * which member of the 'data' union is valid for this record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_ENVIRONMENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected for
+   * this sample.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The kind of data reported in this record.
+   */
+  CUpti_ActivityEnvironmentKind environmentKind;
+
+  union {
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_SPEED environment
+     * kind.
+     */
+    struct {
+      /**
+       * The SM frequency in MHz.
+       */
+      uint32_t smClock;
+
+      /**
+       * The memory frequency in MHz.
+       */
+      uint32_t memoryClock;
+
+      /**
+       * The PCIe link generation.
+       */
+      uint32_t pcieLinkGen;
+
+      /**
+       * The PCIe link width.
+       */
+      uint32_t pcieLinkWidth;
+
+      /**
+       * The clocks throttle reasons.
+       */
+      CUpti_EnvironmentClocksThrottleReason clocksThrottleReasons;
+    } speed;
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE
+     * environment kind.
+     */
+    struct {
+      /**
+       * The GPU temperature in degrees C.
+       */
+      uint32_t gpuTemperature;
+    } temperature;
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_POWER environment
+     * kind.
+     */
+    struct {
+      /**
+       * The power in milliwatts consumed by the GPU and associated
+       * circuitry.
+       */
+      uint32_t power;
+
+      /**
+       * The power in milliwatts that will trigger the power management
+       * algorithm.
+       */
+      uint32_t powerLimit;
+    } power;
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_COOLING
+     * environment kind.
+     */
+    struct {
+      /**
+       * The fan speed as a percentage of maximum.
+       */
+      uint32_t fanSpeed;
+    } cooling;
+  } data;
+} CUpti_ActivityEnvironment;
+
+/**
+ * \brief The activity record for source-level instruction execution.
+ *
+ * This activity records results for source-level instruction execution
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction execution.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed it, regardless of predicate or condition code.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t notPredOffThreadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionExecution;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
+ *
+ * This activity records information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Holds one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+} CUpti_ActivityPCSampling;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
+ *
+ * This activity records information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Holds one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  uint32_t pad; /* Padding member; it carries no documentation in the original header. */
+} CUpti_ActivityPCSampling2;
+
+/**
+ * \brief The activity record for PC sampling.
+ *
+ * This activity records information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Holds one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+    /**
+   * The pc offset for the instruction (a 64-bit value in this record).
+   */
+  uint64_t pcOffset;
+} CUpti_ActivityPCSampling3;
+
+/**
+ * \brief The activity record for record status for PC sampling.
+ *
+ * This activity records per-kernel sample totals obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Number of times the PC was sampled for this kernel instance, including all
+   * dropped samples.
+   */
+  uint64_t totalSamples;
+
+  /**
+   * Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * Sampling period in terms of number of cycles.
+   */
+  uint64_t samplingPeriodInCycles;
+} CUpti_ActivityPCSamplingRecordInfo;
+
+/**
+ * \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * The ID of the device involved in the memory transfer operation.
+   * It is not relevant if the scope of the counter is global (all devices).
+   */
+  uint32_t deviceId;
+
+  /**
+   * Value of the counter.
+   *
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The ID of the process to which this record belongs. In case of
+   * global scope, processId is undefined.
+   */
+  uint32_t processId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter;
+
+/**
+ * \brief The activity record for Unified Memory counters (CUDA 7.0 and beyond)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Value of the counter.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
+   * memory region in bytes.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
+   * is the number of page fault groups for the same page.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
+   * it is the program counter for the instruction that caused the fault.
+   */
+  uint64_t value;
+
+  /**
+   * The start timestamp of the counter, in ns.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity starts on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
+   * captured when CUDA driver started processing the fault.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
+   * is captured when CUDA driver detected thrashing of memory region.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
+   * timestamp is captured when throttling operation was started by CUDA driver.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
+   * timestamp is captured when CUDA driver has pushed all required operations
+   * to the processor specified by dstId.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp of the counter, in ns.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity finishes on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
+   * captured when CUDA driver queues the replay of faulting memory accesses on the GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
+   * is captured when throttling operation was finished by CUDA driver.
+   */
+  uint64_t end;
+
+  /**
+   * This is the virtual base address of the page/s being transferred. For CPU and
+   * GPU faults, the virtual address for the page that faulted.
+   */
+  uint64_t address;
+
+  /**
+   * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
+   * throttling or remote map operation. For counterKind
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+   */
+  uint32_t srcId;
+
+  /**
+   * The ID of the destination CPU/device involved in the memory transfer or remote map
+   * operation. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t dstId;
+
+  /**
+   * The ID of the stream causing the transfer.
+   * The value of this field is invalid.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+   * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
+   * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter2;
+
+/**
+ * \brief The activity record for global/device functions.
+ *
+ * This activity records the function name and corresponding module
+ * information
+ * (CUPTI_ACTIVITY_KIND_FUNCTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_FUNCTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * ID to uniquely identify the record.
+   */
+  uint32_t id;
+
+  /**
+   * The ID of the context where the function is launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID in which this global/device function is present.
+   */
+  uint32_t moduleId;
+
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the function. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityFunction;
+
+/**
+ * \brief The activity record for a CUDA module.
+ *
+ * This activity record represents a CUDA module
+ * (CUPTI_ACTIVITY_KIND_MODULE). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease of use. Profiling frameworks built on top of CUPTI that collect
+ * module data from the module callback may choose to use this type to
+ * store the collected module data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MODULE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the context where the module is loaded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID.
+   */
+  uint32_t id;
+
+  /**
+   * The cubin size.
+   */
+  uint32_t cubinSize;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only if CUPTILP64 is NOT defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The pointer to the cubin.
+   */
+  const void *cubin;
+} CUpti_ActivityModule;
+
+/**
+ * \brief The activity record for source-level shared
+ * access.
+ *
+ * This activity records the locations of the shared
+ * accesses in the source
+ * (CUPTI_ACTIVITY_KIND_SHARED_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SHARED_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this shared access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of shared memory transactions generated by this access.
+   */
+  uint64_t sharedTransactions;
+
+  /**
+   * The minimum number of shared memory transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalSharedTransactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivitySharedAccess;
+
+/**
+ * \brief The activity record for a CUDA event.
+ *
+ * This activity is used to track recorded events
+ * (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context where the event was recorded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the compute stream where the event was recorded.
+   */
+  uint32_t streamId;
+
+  /**
+   * A unique event ID to identify the event record.
+   */
+  uint32_t eventId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityCudaEvent;
+
+/**
+ * \brief The activity record for a CUDA stream.
+ *
+ * This activity is used to track created streams
+ * (CUPTI_ACTIVITY_KIND_STREAM).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_STREAM.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * The ID of the context where the stream was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * A unique stream ID to identify the stream.
+   */
+  uint32_t streamId;
+
+  /**
+   * The clamped priority for the stream.
+   */
+  uint32_t priority;
+
+  /**
+   * Flags associated with the stream. \see CUpti_ActivityStreamFlag
+   */
+  CUpti_ActivityStreamFlag flag;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityStream;
+
+/**
+ * \brief The activity record for synchronization management.
+ *
+ * This activity is used to track various CUDA synchronization APIs
+ * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The type of record. \see CUpti_ActivitySynchronizationType
+   */
+  CUpti_ActivitySynchronizationType type;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context for which the synchronization API is called.
+   * In case of a context synchronization API, it is the ID of the context for which the API is called.
+   * In case of stream/event synchronization, it is the ID of the context where the stream/event was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuEventSynchronize.
+   */
+  uint32_t streamId;
+
+  /**
+   * The event ID for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuStreamSynchronize.
+   */
+  uint32_t cudaEventId;
+} CUpti_ActivitySynchronization;
+
+
+/**
+ * \brief The activity record for source-level sass/source
+ * line-by-line correlation.
+ *
+ * This activity records source-level sass/source correlation
+ * information
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionCorrelation;
+
+/**
+ * \brief The OpenACC event kind for OpenACC activity records.
+ *
+ * \see CUpti_ActivityKindOpenAcc
+ */
+typedef enum {
+    CUPTI_OPENACC_EVENT_KIND_INVALID              = 0,
+    CUPTI_OPENACC_EVENT_KIND_DEVICE_INIT          = 1,
+    CUPTI_OPENACC_EVENT_KIND_DEVICE_SHUTDOWN      = 2,
+    CUPTI_OPENACC_EVENT_KIND_RUNTIME_SHUTDOWN     = 3,
+    CUPTI_OPENACC_EVENT_KIND_ENQUEUE_LAUNCH       = 4,
+    CUPTI_OPENACC_EVENT_KIND_ENQUEUE_UPLOAD       = 5,
+    CUPTI_OPENACC_EVENT_KIND_ENQUEUE_DOWNLOAD     = 6,
+    CUPTI_OPENACC_EVENT_KIND_WAIT                 = 7,
+    CUPTI_OPENACC_EVENT_KIND_IMPLICIT_WAIT        = 8,
+    CUPTI_OPENACC_EVENT_KIND_COMPUTE_CONSTRUCT    = 9,
+    CUPTI_OPENACC_EVENT_KIND_UPDATE               = 10,
+    CUPTI_OPENACC_EVENT_KIND_ENTER_DATA           = 11,
+    CUPTI_OPENACC_EVENT_KIND_EXIT_DATA            = 12,
+    CUPTI_OPENACC_EVENT_KIND_CREATE               = 13,
+    CUPTI_OPENACC_EVENT_KIND_DELETE               = 14,
+    CUPTI_OPENACC_EVENT_KIND_ALLOC                = 15,
+    CUPTI_OPENACC_EVENT_KIND_FREE                 = 16,
+    CUPTI_OPENACC_EVENT_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_OpenAccEventKind;
+
+/**
+ * \brief The OpenACC parent construct kind for OpenACC activity records.
+ */
+typedef enum {
+    CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN          = 0,
+    CUPTI_OPENACC_CONSTRUCT_KIND_PARALLEL         = 1,
+    CUPTI_OPENACC_CONSTRUCT_KIND_KERNELS          = 2,
+    CUPTI_OPENACC_CONSTRUCT_KIND_LOOP             = 3,
+    CUPTI_OPENACC_CONSTRUCT_KIND_DATA             = 4,
+    CUPTI_OPENACC_CONSTRUCT_KIND_ENTER_DATA       = 5,
+    CUPTI_OPENACC_CONSTRUCT_KIND_EXIT_DATA        = 6,
+    CUPTI_OPENACC_CONSTRUCT_KIND_HOST_DATA        = 7,
+    CUPTI_OPENACC_CONSTRUCT_KIND_ATOMIC           = 8,
+    CUPTI_OPENACC_CONSTRUCT_KIND_DECLARE          = 9,
+    CUPTI_OPENACC_CONSTRUCT_KIND_INIT             = 10,
+    CUPTI_OPENACC_CONSTRUCT_KIND_SHUTDOWN         = 11,
+    CUPTI_OPENACC_CONSTRUCT_KIND_SET              = 12,
+    CUPTI_OPENACC_CONSTRUCT_KIND_UPDATE           = 13,
+    CUPTI_OPENACC_CONSTRUCT_KIND_ROUTINE          = 14,
+    CUPTI_OPENACC_CONSTRUCT_KIND_WAIT             = 15,
+    CUPTI_OPENACC_CONSTRUCT_KIND_RUNTIME_API      = 16,
+    CUPTI_OPENACC_CONSTRUCT_KIND_FORCE_INT        = 0x7fffffff
+
+} CUpti_OpenAccConstructKind;
+
+typedef enum { /* The OpenMP event kinds (CUPTI_OPENMP_EVENT_KIND_*). */
+    CUPTI_OPENMP_EVENT_KIND_INVALID               = 0,
+    CUPTI_OPENMP_EVENT_KIND_PARALLEL              = 1,
+    CUPTI_OPENMP_EVENT_KIND_TASK                  = 2,
+    CUPTI_OPENMP_EVENT_KIND_THREAD                = 3,
+    CUPTI_OPENMP_EVENT_KIND_IDLE                  = 4,
+    CUPTI_OPENMP_EVENT_KIND_WAIT_BARRIER          = 5,
+    CUPTI_OPENMP_EVENT_KIND_WAIT_TASKWAIT         = 6,
+    CUPTI_OPENMP_EVENT_KIND_FORCE_INT             = 0x7fffffff
+} CUpti_OpenMpEventKind;
+
+/**
+ * \brief The base activity record for OpenACC records.
+ *
+ * The OpenACC activity API part uses a CUpti_ActivityOpenAcc as a generic
+ * representation for any OpenACC activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_ActivityOpenAcc object can
+ * be cast to the specific OpenACC activity record type appropriate for that kind.
+ *
+ * Note that all OpenACC activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct,
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+} CUpti_ActivityOpenAcc;
+
+/**
+ * \brief The activity record for OpenACC data.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct,
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * Number of bytes
+   */
+  uint64_t bytes;
+
+  /**
+   * Host pointer if available
+   */
+  uint64_t hostPtr;
+
+  /**
+   * Device pointer if available
+   */
+  uint64_t devicePtr;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the variable
+   * for which this event is triggered, if known, or a null pointer if not.
+   */
+  const char *varName;
+
+} CUpti_ActivityOpenAccData;
+
+/**
+ * \brief The activity record for OpenACC launch.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct,
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * The number of gangs created for this kernel launch
+   */
+  uint64_t numGangs;
+
+  /**
+   * The number of workers created for this kernel launch
+   */
+  uint64_t numWorkers;
+
+  /**
+   * The number of vector lanes created for this kernel launch
+   */
+  uint64_t vectorLength;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * kernel being launched, if known, or a null pointer if not.
+   */
+  const char *kernelName;
+
+} CUpti_ActivityOpenAccLaunch;
+
+/**
+ * \brief The activity record for OpenACC other.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_OTHER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_OTHER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct,
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+} CUpti_ActivityOpenAccOther;
+
+
+/**
+ * \brief The base activity record for OpenMP records.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenMP event kind (\see CUpti_OpenMpEventKind)
+   */
+  CUpti_OpenMpEventKind eventKind;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the OpenMP activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenMP activity is executing.
+   */
+  uint32_t cuThreadId;
+
+} CUpti_ActivityOpenMp;
+
+/**
+ * \brief The kind of external APIs supported for correlation.
+ *
+ * Custom correlation kinds are reserved for use by external tools.
+ *
+ * \see CUpti_ActivityExternalCorrelation
+ */
+typedef enum {
+    CUPTI_EXTERNAL_CORRELATION_KIND_INVALID              = 0,
+
+    /**
+     * The external API is unknown to CUPTI
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN              = 1,
+
+    /**
+     * The external API is OpenACC
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC              = 2,
+
+    /**
+     * The external API is custom0
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0              = 3,
+
+    /**
+     * The external API is custom1
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1              = 4,
+
+    /**
+     * The external API is custom2
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM2              = 5,
+
+    /**
+     * Add new kinds before this line
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_SIZE, /* no initializer: one more than the preceding enumerator (currently 6) */
+
+    CUPTI_EXTERNAL_CORRELATION_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_ExternalCorrelationKind;
+
+/**
+ * \brief The activity record for correlation with external records
+ *
+ * This activity record correlates native CUDA records (e.g. CUDA Driver API,
+ * kernels, memcpys, ...) with records from external APIs such as OpenACC.
+ * (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION).
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of external API this record is correlated to.
+   */
+  CUpti_ExternalCorrelationKind externalKind;
+
+  /**
+   * The correlation ID of the associated non-CUDA API record.
+   * The exact field in the associated external record depends
+   * on that record's activity kind (\see externalKind).
+   */
+  uint64_t externalId;
+
+  /**
+   * The correlation ID of the associated CUDA driver or runtime API record.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityExternalCorrelation;
+
+/**
+* \brief The type of a device connected to NVLink.
+*/
+typedef enum {
+    CUPTI_DEV_TYPE_INVALID = 0,
+    /**
+    * The device type is GPU.
+    */
+    CUPTI_DEV_TYPE_GPU = 1,
+    /**
+    * The device type is an NVLink processing unit (NPU) in the CPU.
+    */
+    CUPTI_DEV_TYPE_NPU = 2,
+    CUPTI_DEV_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_DevType;
+
+/**
+* \brief NVLink information. (deprecated in CUDA 9.0)
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink2 activity record.
+*/
+
+typedef struct PACKED_ALIGNMENT {
+    /**
+    * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+    */
+    CUpti_ActivityKind kind;
+    /**
+    * NVLink version.
+    */
+    uint32_t  nvlinkVersion;
+    /**
+    * Type of device 0 \ref CUpti_DevType
+    */
+    CUpti_DevType typeDev0;
+    /**
+    * Type of device 1 \ref CUpti_DevType
+    */
+    CUpti_DevType typeDev1;
+    /**
+    * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
+    * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+    */
+    union {
+        CUuuid    uuidDev;
+        struct {
+            /**
+            * Index of the NPU. First index will always be zero.
+            */
+            uint32_t  index;
+            /**
+            * Domain ID of NPU. On Linux, this can be queried using lspci.
+            */
+            uint32_t  domainId;
+        } npu;
+    } idDev0;
+    /**
+    * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
+    * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+    */
+    union {
+        CUuuid    uuidDev;
+        struct {
+            /**
+            * Index of the NPU. First index will always be zero.
+            */
+            uint32_t  index;
+            /**
+            * Domain ID of NPU. On Linux, this can be queried using lspci.
+            */
+            uint32_t  domainId;
+        } npu;
+    } idDev1;
+    /**
+    * Flag gives capabilities of the link \see CUpti_LinkFlag
+    */
+    uint32_t flag;
+    /**
+    * Number of physical NVLinks present between two devices.
+    */
+    uint32_t  physicalNvLinkCount;
+    /**
+    * Port numbers for maximum 4 NVLinks connected to device 0.
+    * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+    * In case of invalid/unknown port number, this field will be set
+    * to value CUPTI_NVLINK_INVALID_PORT.
+    * This is used to correlate the metric values to individual
+    * physical links and attribute traffic to the logical NVLink in
+    * the topology.
+    */
+    int8_t  portDev0[4];
+    /**
+    * Port numbers for maximum 4 NVLinks connected to device 1.
+    * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+    * In case of invalid/unknown port number, this field will be set
+    * to value CUPTI_NVLINK_INVALID_PORT.
+    * This is used to correlate the metric values to individual
+    * physical links and attribute traffic to the logical NVLink in
+    * the topology.
+    */
+    int8_t  portDev1[4];
+    /**
+    * Bandwidth of NVLink in kbytes/sec
+    */
+    uint64_t  bandwidth;
+} CUpti_ActivityNvLink;
+
+/**
+* \brief NVLink information. (deprecated in CUDA 10.0)
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink4 activity record.
+*/
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NvLink version.
+   */
+  uint32_t  nvlinkVersion;
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid    uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t  index;
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t  domainId;
+    } npu;
+  } idDev0;
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid    uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t  index;
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t  domainId;
+    } npu;
+  } idDev1;
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t  physicalNvLinkCount;
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0 (the array
+   * has CUPTI_MAX_NVLINK_PORTS entries -- TODO confirm that matches 16).
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This is used to correlate the metric values to individual physical
+   * links and attribute traffic to the logical NVLink in the topology.
+   */
+  int8_t  portDev0[CUPTI_MAX_NVLINK_PORTS];
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1 (the array
+   * has CUPTI_MAX_NVLINK_PORTS entries -- TODO confirm that matches 16).
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This is used to correlate the metric values to individual physical
+   * links and attribute traffic to the logical NVLink in the topology.
+   */
+  int8_t  portDev1[CUPTI_MAX_NVLINK_PORTS];
+  /**
+   * Bandwidth of NVLink in kbytes/sec
+   */
+  uint64_t  bandwidth;
+} CUpti_ActivityNvLink2;
+
+/**
+* \brief NVLink information.
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink4 activity record.
+*/
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NvLink version.
+   */
+  uint32_t  nvlinkVersion;
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid    uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t  index;
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t  domainId;
+    } npu;
+  } idDev0;
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid    uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t  index;
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t  domainId;
+    } npu;
+  } idDev1;
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t  physicalNvLinkCount;
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0 (the array
+   * has CUPTI_MAX_NVLINK_PORTS entries -- TODO confirm that matches 16).
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This is used to correlate the metric values to individual physical
+   * links and attribute traffic to the logical NVLink in the topology.
+   */
+  int8_t  portDev0[CUPTI_MAX_NVLINK_PORTS];
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1 (the array
+   * has CUPTI_MAX_NVLINK_PORTS entries -- TODO confirm that matches 16).
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This is used to correlate the metric values to individual physical
+   * links and attribute traffic to the logical NVLink in the topology.
+   */
+  int8_t  portDev1[CUPTI_MAX_NVLINK_PORTS];
+   /**
+   * Bandwidth of NVLink in kbytes/sec
+   */
+  uint64_t  bandwidth;
+   /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+   /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink3;
+
+/**
+* \brief NVLink information.
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU, which can be used to understand the topology.
+*/
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NvLink version.
+   */
+  uint32_t  nvlinkVersion;
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid    uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t  index;
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t  domainId;
+    } npu;
+  } idDev0;
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid    uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t  index;
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t  domainId;
+    } npu;
+  } idDev1;
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t  physicalNvLinkCount;
+  /**
+   * Port numbers for maximum 32 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This is used to correlate the metric values to individual
+   * physical links and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev0[CUPTI_MAX_NVLINK_PORTS];
+  /**
+   * Port numbers for maximum 32 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This is used to correlate the metric values to individual
+   * physical links and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev1[CUPTI_MAX_NVLINK_PORTS];
+   /**
+   * Bandwidth of NVLink in kbytes/sec
+   */
+  uint64_t  bandwidth;
+   /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink4;
+
+#define CUPTI_MAX_GPUS 32 /* length of the peerDev[] array in CUpti_ActivityPcie */
+/**
+ * \brief Distinguishes whether a PCIE activity record
+ * describes a GPU or a PCI bridge.
+ */
+typedef enum {
+    /**
+     * PCIE GPU record
+     */
+    CUPTI_PCIE_DEVICE_TYPE_GPU       = 0,
+
+    /**
+     * PCIE Bridge record
+     */
+    CUPTI_PCIE_DEVICE_TYPE_BRIDGE    = 1,
+
+    CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_PcieDeviceType;
+
+/**
+ * \brief PCI devices information required to construct topology
+ *
+ * This structure gives capabilities of a GPU or PCI bridge connected to the PCIE bus,
+ * which can be used to understand the topology.
+ */
+typedef struct PACKED_ALIGNMENT {
+    /**
+     * The activity record kind, must be CUPTI_ACTIVITY_KIND_PCIE.
+     */
+    CUpti_ActivityKind kind;
+    /**
+     * Type of device in topology, \ref CUpti_PcieDeviceType. If type is
+     * CUPTI_PCIE_DEVICE_TYPE_GPU use id.devId and attr.gpuAttr; if type is
+     * CUPTI_PCIE_DEVICE_TYPE_BRIDGE use id.bridgeId and attr.bridgeAttr.
+     */
+    CUpti_PcieDeviceType type;
+    /**
+     * A unique identifier for GPU or Bridge in Topology
+     */
+    union {
+      /**
+       * GPU device ID
+       */
+      CUdevice devId;
+      /**
+       * A unique identifier for Bridge in the Topology
+       */
+      uint32_t bridgeId;
+    } id;
+
+    /**
+     * Domain for the GPU or Bridge, required to identify which PCIE bus it belongs to in
+     * multiple NUMA systems.
+     */
+    uint32_t domain;
+    /**
+     * PCIE Generation of GPU or Bridge (\see CUpti_PcieGen).
+     */
+    uint16_t pcieGeneration;
+    /**
+     * Link rate of the GPU or bridge in gigatransfers per second (GT/s)
+     */
+    uint16_t linkRate;
+    /**
+     * Link width of the GPU or bridge
+     */
+    uint16_t linkWidth;
+
+    /**
+     * Upstream bus ID for the GPU or PCI bridge. Required to identify which bus it is
+     * connected to in the topology.
+     */
+    uint16_t upstreamBus;
+
+    /**
+     * Attributes for more information about GPU (gpuAttr) or PCI Bridge (bridgeAttr)
+     */
+    union {
+      struct {
+        /**
+         * UUID for the device. \ref CUpti_ActivityDevice4.
+         */
+        CUuuid    uuidDev;
+        /**
+         * CUdevices with which this device has P2P capability.
+         * This can also be obtained by querying cuDeviceCanAccessPeer or
+         * cudaDeviceCanAccessPeer APIs
+         */
+        CUdevice peerDev[CUPTI_MAX_GPUS];
+      } gpuAttr;
+
+      struct {
+        /**
+         * The downstream bus number, used to search downstream devices/bridges connected
+         * to this bridge.
+         */
+        uint16_t secondaryBus;
+        /**
+         * Device ID of the bridge
+         */
+        uint16_t deviceId;
+        /**
+         * Vendor ID of the bridge
+         */
+        uint16_t vendorId;
+        /**
+         * Padding for alignment
+         */
+        uint16_t pad0;
+      } bridgeAttr;
+    } attr;
+} CUpti_ActivityPcie;
+
+/**
+ * \brief PCIE Generation.
+ *
+ * Enumeration of PCIE generations, used for the
+ * pcieGeneration field of the PCIE activity record.
+ */
+typedef enum {
+  /**
+  * PCIE Generation 1
+  */
+  CUPTI_PCIE_GEN_GEN1       = 1,
+  /**
+  * PCIE Generation 2
+  */
+  CUPTI_PCIE_GEN_GEN2       = 2,
+  /**
+  * PCIE Generation 3
+  */
+  CUPTI_PCIE_GEN_GEN3       = 3,
+  /**
+  * PCIE Generation 4
+  */
+  CUPTI_PCIE_GEN_GEN4       = 4,
+  /**
+  * PCIE Generation 5
+  */
+  CUPTI_PCIE_GEN_GEN5       = 5,
+
+  CUPTI_PCIE_GEN_FORCE_INT  = 0x7fffffff
+} CUpti_PcieGen;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data at a particular time may choose to
+ * use this type to store the collected event data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityInstantaneousEvent;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event
+ * with event domain instance information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+  /**
+   * The event domain instance.
+   */
+  uint8_t instance;
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousEventInstance;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC) at a particular instant.
+ * This activity record kind is not produced by the activity API but
+ * is included for completeness and ease-of-use. Profiler frameworks built
+ * on top of CUPTI that collect metric data may choose to use this type to
+ * store the collected metric data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousMetric;
+
+/**
+ * \brief The instantaneous activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE) sampled at a particular time. This
+ * activity record kind is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profiler frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The metric domain instance.
+   */
+  uint8_t instance;
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[2];
+} CUpti_ActivityInstantaneousMetricInstance;
+
+/**
+ * \brief The types of JIT entry.
+ *
+ * Used by the jitEntryType field of CUpti_ActivityJit.
+ */
+typedef enum {
+  CUPTI_ACTIVITY_JIT_ENTRY_INVALID= 0,
+  /**
+  * PTX to CUBIN.
+  */
+  CUPTI_ACTIVITY_JIT_ENTRY_PTX_TO_CUBIN = 1,
+  /**
+  * NVVM-IR to PTX.
+  */
+  CUPTI_ACTIVITY_JIT_ENTRY_NVVM_IR_TO_PTX = 2,
+
+  CUPTI_ACTIVITY_JIT_ENTRY_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitEntryType;
+
+/**
+ * \brief The types of JIT compilation operations.
+ *
+ * Used by the jitOperationType field of CUpti_ActivityJit.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_JIT_OPERATION_INVALID = 0,
+  /**
+  * Loaded from the compute cache.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_LOAD = 1,
+  /**
+  * Stored in the compute cache.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_STORE = 2,
+  /**
+  * JIT compilation.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_COMPILE = 3,
+
+  CUPTI_ACTIVITY_JIT_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitOperationType;
+
+/**
+ * \brief The activity record for JIT operations.
+ *
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache. It gives the exact hashed path of where the cached module is
+ * loaded from, or where the module will be stored after Just-In-Time (JIT) compilation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+   /**
+    * The JIT entry type.
+    */
+  CUpti_ActivityJitEntryType jitEntryType;
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+  /**
+   * The correlation ID of the JIT operation to which
+   * this record belongs. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+   /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+} CUpti_ActivityJit;
+
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the graph execution is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the graph is being launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+} CUpti_ActivityGraphTrace;
+
+END_PACKED_ALIGNMENT
+
+/**
+ * \brief Activity attributes.
+ *
+ * These attributes are used to control the behavior of the activity
+ * API.
+ */
+typedef enum {
+    /**
+     * The device memory size (in bytes) reserved for storing profiling data for concurrent
+     * kernels (activity kind \ref CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), memcopies and memsets
+     * for each buffer on a context. The value is a size_t.
+     *
+     * There is a limit on how many device buffers can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the buffers; it pre-allocates only as many
+     * buffers as set by the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     * When all of the data in a buffer is consumed, it is added to the reuse pool, and
+     * CUPTI picks a buffer from this pool when a new buffer is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels, memcopies and memsets might cause CUPTI to allocate more device buffers.
+     * CUPTI allocates another buffer only when it runs out of the buffers in the
+     * reuse pool.
+     *
+     * Since buffer allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 buffers of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger buffer size leaves less device memory for the application.
+     * Having smaller buffer size increases the risk of dropping timestamps for
+     * records if too many kernels or memcopies or memsets are launched at one time.
+     *
+     * This value only applies to new buffer allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 3200000 (~3MB) which can accommodate profiling data
+     * up to 100,000 kernels, memcopies and memsets combined.
+     *
+     * Note: Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the
+     * pinned host memory by default as this might help in improving the performance of the
+     * tracing run. Refer to the description of the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED for more details.
+     * Size of the memory and maximum number of pools are still controlled by the attributes
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE and \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Note: The actual amount of device memory per buffer reserved by CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE                      = 0,
+    /**
+     * The device memory size (in bytes) reserved for storing profiling
+     * data for CDP operations for each buffer on a context. The
+     * value is a size_t.
+     *
+     * Having larger buffer size means fewer flush operations but
+     * consumes more device memory. This value only applies to new
+     * allocations.
+     *
+     * Set this value before initializing CUDA or before creating a
+     * context to ensure it is considered for the following allocations.
+     *
+     * The default value is 8388608 (8MB).
+     *
+     * Note: The actual amount of device memory per context reserved by
+     * CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP          = 1,
+    /**
+     * The maximum number of device memory buffers per context. The value is a size_t.
+     *
+     * For an application with high rate of kernel launches, memcopies and memsets having a bigger pool
+     * limit helps in timestamp collection for all these activities at the expense of a larger memory footprint.
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for more details.
+     *
+     * Setting this value will not modify the number of memory buffers
+     * currently stored.
+     *
+     * Set this value before initializing CUDA to ensure the limit is
+     * not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT                = 2,
+
+    /**
+     * The profiling semaphore pool size reserved for storing profiling data for
+     * serialized kernels tracing (activity kind \ref CUPTI_ACTIVITY_KIND_KERNEL)
+     * for each context. The value is a size_t.
+     *
+     * There is a limit on how many semaphore pools can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the semaphore pools; it pre-allocates only as many
+     * semaphore pools as set by the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     * When all of the data in a semaphore pool is consumed, it is added to the reuse pool, and
+     * CUPTI picks a semaphore pool from the reuse pool when a new semaphore pool is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels might cause CUPTI to allocate more semaphore pools.
+     * CUPTI allocates another semaphore pool only when it runs out of the semaphore pools in the
+     * reuse pool.
+     *
+     * Since semaphore pool allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 semaphore pools of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger semaphore pool size leaves less device memory for the application.
+     * Having smaller semaphore pool size increases the risk of dropping timestamps for
+     * kernel records if too many kernels are issued/launched at one time.
+     *
+     * This value only applies to new semaphore pool allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 25000 which can accommodate profiling data for up to 25,000 kernels.
+     *
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE           = 3,
+    /**
+     * The maximum number of profiling semaphore pools per context. The value is a size_t.
+     *
+     * For an application with high rate of kernel launches, having a bigger
+     * pool limit helps in timestamp collection for all the kernels, at the
+     * expense of a larger device memory footprint.
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for more details.
+     *
+     * Set this value before initializing CUDA to ensure the limit is not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT          = 4,
+
+    /**
+     * The flag to indicate whether the user should provide a zero-filled activity buffer.
+     * The value is a uint8_t.
+     *
+     * If the value of this attribute is non-zero, user should provide
+     * a zero value buffer in the \ref CUpti_BuffersCallbackRequestFunc.
+     * If the user does not provide a zero value buffer after setting this to non-zero,
+     * the activity buffer may contain some uninitialized values when CUPTI returns it in
+     * \ref CUpti_BuffersCallbackCompleteFunc.
+     *
+     * If the value of this attribute is zero, CUPTI will initialize the user buffer
+     * received in the \ref CUpti_BuffersCallbackRequestFunc to zero before filling it.
+     * If the user sets this to zero, a few stalls may appear in critical path because CUPTI
+     * will zero out the buffer in the main thread.
+     * Set this value before returning from \ref CUpti_BuffersCallbackRequestFunc to
+     * ensure it is considered for all the subsequent user buffers.
+     *
+     * The default value is 0.
+     */
+    CUPTI_ACTIVITY_ATTR_ZEROED_OUT_ACTIVITY_BUFFER              = 5,
+
+    /**
+     * Number of device buffers to pre-allocate for a context during the initialization phase.
+     * The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of device buffers set using
+     * the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these buffers (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE        = 6,
+
+    /**
+     * Number of profiling semaphore pools to pre-allocate for a context during the
+     * initialization phase. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of profiling semaphore pools set
+     * using the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these pools (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE  = 7,
+
+    /**
+     * Allocate page-locked (pinned) host memory for storing profiling data for concurrent
+     * kernels, memcopies and memsets for each buffer on a context. The value is a uint8_t.
+     *
+     * Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the pinned host
+     * memory by default as this might help in improving the performance of the tracing run.
+     * Allocating excessive amounts of pinned memory may degrade system performance, since it
+     * reduces the amount of memory available to the system for paging. For this reason user
+     * might want to change the location from pinned host memory to device memory by setting
+     * value of this attribute to 0.
+     *
+     * The default value is 1.
+     */
+    CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED         = 8,
+
+
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT                 = 0x7fffffff
+} CUpti_ActivityAttribute;
+
+/**
+ * \brief Thread-Id types.
+ *
+ * CUPTI uses different methods to obtain the thread-id depending on the
+ * support and the underlying platform. This enum documents these methods
+ * for each type. APIs \ref cuptiSetThreadIdType and \ref cuptiGetThreadIdType
+ * can be used to set and get the thread-id type.
+ */
+typedef enum {
+    /**
+     * Default type.
+     * Windows uses API GetCurrentThreadId().
+     * Linux/Mac/Android/QNX use POSIX pthread API pthread_self().
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_DEFAULT       = 0,
+
+    /**
+     * This type is based on the system API available on the underlying platform
+     * and the thread-id obtained is supposed to be unique for the process lifetime.
+     * Windows uses API GetCurrentThreadId().
+     * Linux uses syscall SYS_gettid.
+     * Mac uses syscall SYS_thread_selfid.
+     * Android/QNX use gettid().
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SYSTEM        = 1,
+
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityThreadIdType;
+
+/**
+ * \brief Get the CUPTI timestamp.
+ *
+ * Returns a timestamp normalized to correspond with the start and end
+ * timestamps reported in the CUPTI activity records. The timestamp is
+ * reported in nanoseconds.
+ *
+ * \param timestamp Returns the CUPTI timestamp, in nanoseconds
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp);
+
+/**
+ * \brief Get the ID of a context.
+ *
+ * Returns the process-unique ID of \p context in \p contextId.
+ *
+ * \param context The context
+ * \param contextId Returns a process-unique ID for the context
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT The context is NULL or not valid.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p contextId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId);
+
+/**
+ * \brief Get the ID of a stream.
+ *
+ * Get the ID of a stream. The stream ID is unique within a context
+ * (i.e. all streams within a context will have unique stream
+ * IDs).
+ *
+ * \param context If non-NULL then the stream is checked to ensure
+ * that it belongs to this context. Typically this parameter should be
+ * NULL.
+ * \param stream The stream
+ * \param streamId Returns a context-unique ID for the stream
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+ * if \p context is non-NULL and \p stream does not belong to the
+ * context
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+ *
+ * **DEPRECATED** This method is deprecated as of CUDA 8.0.
+ * Use \ref cuptiGetStreamIdEx instead.
+ */
+CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
+
+/**
+ * \brief Get the ID of a stream.
+ *
+ * Get the ID of a stream. The stream ID is unique within a context
+ * (i.e. all streams within a context will have unique stream
+ * IDs).
+ *
+ * \param context If non-NULL then the stream is checked to ensure
+ * that it belongs to this context. Typically this parameter should be
+ * NULL.
+ * \param stream The stream
+ * \param perThreadStream Flag to indicate if program is compiled for per-thread streams
+ * \param streamId Returns a context-unique ID for the stream
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+ * if \p context is non-NULL and \p stream does not belong to the
+ * context
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId);
+
+/**
+ * \brief Get the ID of a device.
+ *
+ * If \p context is NULL, returns the ID of the device that contains
+ * the currently active context. If \p context is non-NULL, returns
+ * the ID of the device which contains that context. Operates in a
+ * similar manner to cudaGetDevice() or cuCtxGetDevice() but may be
+ * called from within callback functions.
+ *
+ * \param context The context, or NULL to indicate the current context.
+ * \param deviceId Returns the ID of the device that contains \p context,
+ * or the currently active context if \p context is NULL.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE if unable to get device ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p deviceId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
+
+/**
+ * \brief Get the unique ID of a graph node.
+ *
+ * Returns the unique ID of the CUDA graph node.
+ *
+ * \param node The graph node.
+ * \param nodeId Returns the unique ID of the node.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p node is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId);
+
+/**
+ * \brief Get the unique ID of a graph.
+ *
+ * Returns the unique ID of the CUDA graph.
+ *
+ * \param graph The graph.
+ * \param pId Returns the unique ID of the graph.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graph is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId);
+
+/**
+ * \brief Enable collection of a specific kind of activity record.
+ *
+ * Enable collection of a specific kind of activity record. Multiple
+ * kinds can be enabled by calling this function once per kind. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record. For certain activity kinds
+ * it dumps existing records.
+ *
+ * In general, the behavior of this API is similar to the API \ref cuptiActivityEnable i.e. it
+ * enables the collection of a specific kind of activity record.
+ * Additionally, this API can help in dumping the records for activities which happened in
+ * the past before enabling the corresponding activity kind.
+ * The API allows getting records for the current resource allocations done in CUDA.
+ * For CUPTI_ACTIVITY_KIND_DEVICE, existing device records are dumped.
+ * For CUPTI_ACTIVITY_KIND_CONTEXT, existing context records are dumped.
+ * For CUPTI_ACTIVITY_KIND_STREAM, existing stream records are dumped.
+ * For CUPTI_ACTIVITY_KIND_NVLINK, existing NVLINK records are dumped.
+ * For CUPTI_ACTIVITY_KIND_PCIE, existing PCIE records are dumped.
+ * For other activities, the behavior is similar to the API \ref cuptiActivityEnable.
+ *
+ * Device records are emitted in CUPTI on CUDA driver initialization. Those records
+ * can only be retrieved by the user if CUPTI is attached before CUDA initialization.
+ * Context and stream records are emitted on context and stream creation.
+ * The use case of the API is to provide the records for CUDA resources
+ * (contexts/streams/devices) that are currently active if the user late attaches CUPTI.
+ *
+ * Before calling this function, the user must register buffer callbacks
+ * to get the activity records by calling \ref cuptiActivityRegisterCallbacks.
+ * If the user does not register the buffers and calls API \ref cuptiActivityEnableAndDump,
+ * then CUPTI will enable the activity kind but not provide any records for that
+ * activity kind.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_UNKNOWN if buffer is not initialized.
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record.
+ *
+ * Disable collection of a specific kind of activity record. Multiple
+ * kinds can be disabled by calling this function once per kind. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Enable collection of a specific kind of activity record for a
+ * context.  The setting done by this API supersedes the global
+ * settings for activity records enabled by \ref cuptiActivityEnable.
+ * Multiple kinds can be enabled by calling this function multiple
+ * times.
+ *
+ * \param context The context for which activity is to be enabled
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Disable collection of a specific kind of activity record for a context.
+ * The setting done by this API supersedes the global settings
+ * for activity records.
+ * Multiple kinds can be disabled by calling this function multiple times.
+ *
+ * \param context The context for which activity is to be disabled
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Get the number of activity records that were dropped of
+ * insufficient buffer space.
+ *
+ * Get the number of records that were dropped because of insufficient
+ * buffer space.  The dropped count includes records that could not be
+ * recorded because CUPTI did not have activity buffer space available
+ * for the record (because the CUpti_BuffersCallbackRequestFunc
+ * callback did not return an empty buffer of sufficient size) and
+ * also CDP records that could not be recorded because the device-size
+ * buffer was full (size is controlled by the
+ * CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP attribute). The dropped
+ * count maintained for the queue is reset to zero when this function
+ * is called.
+ *
+ * \param context The context, or NULL to get dropped count from global queue
+ * \param streamId The stream ID
+ * \param dropped The number of records that were dropped since the last call
+ * to this function.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p dropped is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId,
+                                                       size_t *dropped);
+
+/**
+ * \brief Iterate over the activity records in a buffer.
+ *
+ * This is a helper function to iterate over the activity records in a
+ * buffer. A buffer of activity records is typically obtained by
+ * receiving a CUpti_BuffersCallbackCompleteFunc callback.
+ *
+ * An example of typical usage:
+ * \code
+ * CUpti_Activity *record = NULL;
+ * CUptiResult status = CUPTI_SUCCESS;
+ *   do {
+ *      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+ *      if(status == CUPTI_SUCCESS) {
+ *           // Use record here...
+ *      }
+ *      else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
+ *          break;
+ *      else {
+ *          goto Error;
+ *      }
+ *    } while (1);
+ * \endcode
+ *
+ * \param buffer The buffer containing activity records
+ * \param record Inputs the previous record returned by
+ * cuptiActivityGetNextRecord and returns the next activity record
+ * from the buffer. If input value is NULL, returns the first activity
+ * record in the buffer. Records of kind CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
+ * may contain invalid (0) timestamps, indicating that no timing information could
+ * be collected for lack of device memory.
+ * \param validBufferSizeBytes The number of valid bytes in the buffer.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if no more records in the buffer
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p buffer is NULL.
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes,
+                                                CUpti_Activity **record);
+
+/**
+ * \brief Function type for callback used by CUPTI to request an empty
+ * buffer for storing activity records.
+ *
+ * This callback function signals the CUPTI client that an activity
+ * buffer is needed by CUPTI. The activity buffer is used by CUPTI to
+ * store activity records. The callback function can decline the
+ * request by setting \p *buffer to NULL. In this case CUPTI may drop
+ * activity records.
+ *
+ * \param buffer Returns the new buffer. If set to NULL then no buffer
+ * is returned.
+ * \param size Returns the size of the returned buffer.
+ * \param maxNumRecords Returns the maximum number of records that
+ * should be placed in the buffer. If 0 then the buffer is filled with
+ * as many records as possible. If > 0 the buffer is filled with at
+ * most that many records before it is returned.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackRequestFunc)(
+    uint8_t **buffer,
+    size_t *size,
+    size_t *maxNumRecords);
+
+/**
+ * \brief Function type for callback used by CUPTI to return a buffer
+ * of activity records.
+ *
+ * This callback function returns to the CUPTI client a buffer
+ * containing activity records.  The buffer contains \p validSize
+ * bytes of activity records which should be read using
+ * cuptiActivityGetNextRecord. The number of dropped records can be
+ * read using cuptiActivityGetNumDroppedRecords. After this call CUPTI
+ * relinquishes ownership of the buffer and will not use it
+ * anymore. The client may return the buffer to CUPTI using the
+ * CUpti_BuffersCallbackRequestFunc callback.
+ * Note: CUDA 6.0 onwards, all buffers returned by this callback are
+ * global buffers i.e. there is no context/stream specific buffer.
+ * User needs to parse the global buffer to extract the context/stream
+ * specific activity records.
+ *
+ * \param context The context this buffer is associated with. If NULL, the
+ * buffer is associated with the global activities. This field is deprecated
+ * as of CUDA 6.0 and will always be NULL.
+ * \param streamId The stream id this buffer is associated with.
+ * This field is deprecated as of CUDA 6.0 and will always be NULL.
+ * \param buffer The activity record buffer.
+ * \param size The total size of the buffer in bytes as set in
+ * CUpti_BuffersCallbackRequestFunc.
+ * \param validSize The number of valid bytes in the buffer.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackCompleteFunc)(
+    CUcontext context,
+    uint32_t streamId,
+    uint8_t *buffer,
+    size_t size,
+    size_t validSize);
+
+/**
+ * \brief Registers callback functions with CUPTI for activity buffer
+ * handling.
+ *
+ * This function registers two callback functions to be used in asynchronous
+ * buffer handling. If registered, activity record buffers are handled using
+ * asynchronous requested/completed callbacks from CUPTI.
+ *
+ * Registering these callbacks prevents the client from using CUPTI's
+ * blocking enqueue/dequeue functions.
+ *
+ * \param funcBufferRequested callback which is invoked when an empty
+ * buffer is requested by CUPTI
+ * \param funcBufferCompleted callback which is invoked when a buffer
+ * containing activity records is available from CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either \p
+ * funcBufferRequested or \p funcBufferCompleted is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+        CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
+
+/**
+ * \brief Wait for all activity records to be delivered via the
+ * completion callback.
+ *
+ * This function does not return until all activity records associated
+ * with the specified context/stream are returned to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks. To
+ * ensure that all activity records are complete, the requested
+ * stream(s), if any, are synchronized.
+ *
+ * If \p context is NULL, the global activity records (i.e. those not
+ * associated with a particular stream) are flushed (in this case no
+ * streams are synchronized).  If \p context is a valid CUcontext and
+ * \p streamId is 0, the buffers of all streams of this context are
+ * flushed.  Otherwise, the buffers of the specified stream in this
+ * context are flushed.
+ *
+ * Before calling this function, the buffer handling callback api
+ * must be activated by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param context A valid CUcontext or NULL.
+ * \param streamId The stream ID.
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded
+ * by a successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * **DEPRECATED** This method is deprecated
+ * CONTEXT and STREAMID will be ignored. Use cuptiActivityFlushAll
+ * to flush all data.
+ */
+CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag);
+
+/**
+ * \brief Request to deliver activity records via the buffer completion callback.
+ *
+ * This function returns the activity records associated with all contexts/streams
+ * (and the global buffers not associated with any stream) to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks.
+ *
+ * This is a blocking call but it doesn't issue any CUDA synchronization calls
+ * implicitly thus it's not guaranteed that all activities are completed on the
+ * underlying devices. Activity record is considered as completed if it has all
+ * the information filled up including the timestamps if any. It is the client's
+ * responsibility to issue necessary CUDA synchronization calls before calling
+ * this function if all activity records with complete information are expected
+ * to be delivered.
+ *
+ * Behavior of the function based on the input flag:
+ * - ::For default flush i.e. when flag is set as 0, it returns all the
+ * activity buffers which have all the activity records completed, buffers need not
+ * to be full though. It doesn't return buffers which have one or more incomplete
+ * records. Default flush can be done at a regular interval in a separate thread.
+ * - ::For forced flush i.e. when flag CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is passed
+ * to the function, it returns all the activity buffers including the ones which have
+ * one or more incomplete activity records. It's suggested for clients to do the
+ * force flush before the termination of the profiling session to allow remaining
+ * buffers to be delivered. In general, it can be done in the at-exit handler.
+ *
+ * Before calling this function, the buffer handling callback api must be activated
+ * by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded by a
+ * successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * \see cuptiActivityFlushPeriod
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag);
+
+/**
+ * \brief Read an activity API attribute.
+ *
+ * Read an activity API attribute and return it in \p *value.
+ *
+ * \param attr The attribute to read
+ * \param valueSize Size of buffer pointed by the value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+/**
+ * \brief Write an activity API attribute.
+ *
+ * Write an activity API attribute.
+ *
+ * \param attr The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+
+/**
+ * \brief Set Unified Memory Counter configuration.
+ *
+ * \param config A pointer to \ref CUpti_ActivityUnifiedMemoryCounterConfig structures
+ * containing Unified Memory counter configuration.
+ * \param count Number of Unified Memory counter configuration structures
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED One potential reason is that
+ * platform (OS/arch) does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE Indicates that the device
+ * does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES Indicates that
+ * multi-GPU configuration without P2P support between any pair of devices
+ * does not support the unified memory counters
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count);
+
+/**
+ * \brief Get auto boost state
+ *
+ * The profiling results can be inconsistent in case auto boost is enabled.
+ * CUPTI tries to disable auto boost while profiling. It can fail to disable in
+ * cases where user does not have the permissions or CUDA_AUTO_BOOST env
+ * variable is set. The function can be used to query whether auto boost is
+ * enabled.
+ *
+ * \param context A valid CUcontext.
+ * \param state A pointer to \ref CUpti_ActivityAutoBoostState structure which
+ * contains the current state and the id of the process that has requested the
+ * current state
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p context or \p state is NULL
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the device does not support auto boost
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ */
+CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state);
+
+/**
+ * \brief Set PC sampling configuration.
+ *
+ * For Pascal and older GPU architectures this API must be called before enabling
+ * activity kind CUPTI_ACTIVITY_KIND_PC_SAMPLING. There is no such requirement
+ * for Volta and newer GPU architectures.
+ *
+ * For Volta and newer GPU architectures if this API is called in the middle of
+ * execution, PC sampling configuration will be updated for subsequent kernel launches.
+ *
+ * \param ctx The context
+ * \param config A pointer to \ref CUpti_ActivityPCSamplingConfig structure
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this api is called while
+ * some valid event collection method is set.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the system/device
+ * does not support PC sampling
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config);
+
+/**
+ * \brief Returns the last error from a cupti call or callback
+ *
+ * Returns the last error that has been produced by any of the cupti api calls
+ * or the callback in the same host thread and resets it to CUPTI_SUCCESS.
+ */
+CUptiResult CUPTIAPI cuptiGetLastError(void);
+
+/**
+ * \brief Set the thread-id type
+ *
+ * CUPTI uses the method corresponding to set type to generate the thread-id.
+ * See enum \ref CUpti_ActivityThreadIdType for the list of methods.
+ * Activity records having thread-id field contain the same value.
+ * Thread id type must not be changed during the profiling session to
+ * avoid thread-id value mismatch across activity records.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if \p type is not supported on the platform
+ */
+CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type);
+
+/**
+ * \brief Get the thread-id type
+ *
+ * Returns the thread-id type used in CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p type is NULL
+  */
+CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type);
+
+/**
+* \brief Check support for a compute capability
+*
+* This function is used to check the support for a device based on
+* its compute capability. It sets the \p support when the compute
+* capability is supported by the current version of CUPTI, and clears
+* it otherwise. This version of CUPTI might not support all GPUs sharing
+* the same compute capability. It is suggested to use API \ref
+* cuptiDeviceSupported which provides correct information.
+*
+* \param major The major revision number of the compute capability
+* \param minor The minor revision number of the compute capability
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+*
+* \sa ::cuptiDeviceSupported
+*/
+CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support);
+
+/**
+* \brief Check support for a compute device
+*
+* This function is used to check the support for a compute device.
+* It sets the \p support when the device is supported by the current
+* version of CUPTI, and clears it otherwise.
+*
+* \param dev The device handle returned by CUDA Driver API cuDeviceGet
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+*
+* \sa ::cuptiComputeCapabilitySupported
+*/
+CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support);
+
+/**
+ * \brief Indicates the virtualization mode in which the CUDA device is running.
+ */
+typedef enum {
+  /**
+   * No virtualization mode is associated with the device
+   * i.e. it's a bare-metal GPU
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_NONE = 0,
+  /**
+   * The device is associated with the pass-through GPU.
+   * In this mode, an entire physical GPU is directly assigned
+   * to one virtual machine (VM).
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_PASS_THROUGH = 1,
+  /**
+   * The device is associated with the virtual GPU (vGPU).
+   * In this mode multiple virtual machines (VMs) have simultaneous,
+   * direct access to a single physical GPU.
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_VIRTUAL_GPU = 2,
+  /* NOTE(review): presumably a sentinel that forces a 32-bit enum, not a real mode - confirm */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_FORCE_INT = 0x7fffffff
+} CUpti_DeviceVirtualizationMode;
+
+/**
+ * \brief Query the virtualization mode of the device
+ *
+ * This function is used to query the virtualization mode of the CUDA device.
+ *
+ * \param dev The device handle returned by CUDA Driver API cuDeviceGet
+ * \param mode Pointer to a CUpti_DeviceVirtualizationMode to return the virtualization mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p mode is NULL
+ *
+ */
+CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode);
+
+/**
+ * \brief Detach CUPTI from the running process
+ *
+ * This API detaches the CUPTI from the running process. It destroys and cleans up all the
+ * resources associated with CUPTI in the current process. After CUPTI detaches from the process,
+ * the process will keep on running with no CUPTI attached to it.
+ * For safe operation of the API, it is recommended this API is invoked from the exit callsite
+ * of any of the CUDA Driver or Runtime API. Otherwise CUPTI client needs to make sure that
+ * required CUDA synchronization and CUPTI activity buffer flush is done before calling the API.
+ * Sample code showing the usage of the API in the cupti callback handler code:
+ * \code
+    void CUPTIAPI
+    cuptiCallbackHandler(void *userdata, CUpti_CallbackDomain domain,
+        CUpti_CallbackId cbid, void *cbdata)
+    {
+        const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;
+
+        // Take this code path when CUPTI detach is requested
+        if (detachCupti) {
+            switch(domain)
+            {
+            case CUPTI_CB_DOMAIN_RUNTIME_API:
+            case CUPTI_CB_DOMAIN_DRIVER_API:
+                if (cbInfo->callbackSite == CUPTI_API_EXIT) {
+                    // call the CUPTI detach API
+                    cuptiFinalize();
+                }
+                break;
+            default:
+                break;
+            }
+        }
+    }
+ \endcode
+ */
+CUptiResult CUPTIAPI cuptiFinalize(void);
+
+/**
+ * \brief Push an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is entering an external API region.
+ * When a CUPTI activity API record is created while within an external API region and
+ * CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION is enabled, the activity API record will
+ * be preceded by a CUpti_ActivityExternalCorrelation record for each \ref CUpti_ExternalCorrelationKind.
+ *
+ * \param kind The kind of external API activities should be correlated with.
+ * \param id External correlation id.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid
+ */
+CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id);
+
+/**
+ * \brief Pop an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is leaving an external API region.
+ *
+ * \param kind The kind of external API activities should be correlated with.
+ * \param lastId If the function returns successfully, contains the last external correlation id for this \p kind, can be NULL.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid.
+ * \retval CUPTI_ERROR_QUEUE_EMPTY No external id is currently associated with \p kind.
+ */
+CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId);
+
+/**
+ * \brief Controls the collection of queued and submitted timestamps for kernels.
+ *
+ * This API is used to control the collection of queued and submitted timestamps
+ * for kernels whose records are provided through the struct \ref CUpti_ActivityKernel8.
+ * Default value is 0, i.e. these timestamps are not collected. This API needs
+ * to be called before initialization of CUDA and this setting should not be
+ * changed during the profiling session.
+ *
+ * \param enable is a boolean, denoting whether these timestamps should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable);
+
+/**
+ * \brief Sets the flush period for the worker thread
+ *
+ * CUPTI creates a worker thread to minimize the perturbance for the application created
+ * threads. CUPTI offloads certain operations from the application threads to the worker
+ * thread, this includes synchronization of profiling resources between host and device,
+ * delivery of the activity buffers to the client using the callback registered in
+ * cuptiActivityRegisterCallbacks. For performance reasons, CUPTI wakes up the worker
+ * thread based on certain heuristics.
+ *
+ * This API is used to control the flush period of the worker thread. This setting will
+ * override the CUPTI heuristics. Setting time to zero disables the periodic flush and
+ * restores the default behavior.
+ *
+ * Periodic flush can return only those activity buffers which are full and have all the
+ * activity records completed.
+ *
+ * It's allowed to use the API \ref cuptiActivityFlushAll to flush the data on-demand, even
+ * when client sets the periodic flush.
+ *
+ * \param time flush period in msec
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ *
+ * \see cuptiActivityFlushAll
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time);
+
+/**
+ * \brief Controls the collection of launch attributes for kernels.
+ *
+ * This API is used to control the collection of launch attributes for kernels whose
+ * records are provided through the struct \ref CUpti_ActivityKernel8.
+ * Default value is 0, i.e. these attributes are not collected.
+ *
+ * \param enable is a boolean denoting whether these launch attributes should be collected
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable);
+
+/**
+ * \brief Function type for callback used by CUPTI to request a timestamp
+ * to be used in activity records.
+ *
+ * This callback function signals the CUPTI client that a timestamp needs
+ * to be returned. This timestamp would be treated as normalized timestamp
+ * to be used for various purposes in CUPTI. For example to store start and
+ * end timestamps reported in the CUPTI activity records.
+ * The returned timestamp must be in nanoseconds.
+ *
+ * \sa ::cuptiActivityRegisterTimestampCallback
+ */
+typedef uint64_t (CUPTIAPI *CUpti_TimestampCallbackFunc)(void);
+
+/**
+ * \brief Registers callback function with CUPTI for providing timestamp.
+ *
+ * This function registers a callback function to obtain timestamp of user's
+ * choice instead of using CUPTI provided timestamp.
+ * By default CUPTI uses different methods, based on the underlying platform,
+ * to retrieve the timestamp
+ * Linux and Android use clock_gettime(CLOCK_REALTIME, ..)
+ * Windows uses QueryPerformanceCounter()
+ * Mac uses mach_absolute_time()
+ * QNX uses ClockCycles()
+ * Timestamps retrieved using these methods are converted to nanoseconds if needed
+ * before usage.
+ *
+ * The registration of timestamp callback should be done before any of the CUPTI
+ * activity kinds are enabled to make sure that all the records report the timestamp using
+ * the callback function registered through cuptiActivityRegisterTimestampCallback API.
+ *
+ * Changing the timestamp callback function in CUPTI through
+ * cuptiActivityRegisterTimestampCallback API in the middle of the profiling
+ * session can cause records generated prior to the change to report
+ * timestamps through previous timestamp method.
+ *
+ * \param funcTimestamp callback which is invoked when a timestamp is
+ * needed by CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcTimestamp is NULL
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp);
+
+/** @} */ /* END CUPTI_ACTIVITY_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_ACTIVITY_H_*/
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h
new file mode 100644
index 0000000000000000000000000000000000000000..d76394e8bc4c9dbbff8422eaa50651340639a546
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h
@@ -0,0 +1,1371 @@
+/*
+ * Copyright 2010-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_EVENTS_H_)
+#define _CUPTI_EVENTS_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_EVENT_API CUPTI Event API
+ * Functions, types, and enums that implement the CUPTI Event API.
+ *
+ * \note The CUPTI event APIs in the header cupti_events.h are not supported on devices
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+ * These APIs will be deprecated in a future CUDA release. They are replaced by the
+ * Profiling API in the header cupti_profiler_target.h and the Perfworks metrics API
+ * in the headers nvperf_host.h and nvperf_target.h which are supported on
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU
+ * architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for an event.
+ *
+ * An event represents a countable activity, action, or occurrence on
+ * the device.
+ */
+typedef uint32_t CUpti_EventID;
+
+/**
+ * \brief ID for an event domain.
+ *
+ * ID for an event domain. An event domain represents a group of
+ * related events. A device may have multiple instances of a domain,
+ * indicating that the device can simultaneously record multiple
+ * instances of each event within that domain.
+ */
+typedef uint32_t CUpti_EventDomainID;
+
+/**
+ * \brief A group of events.
+ *
+ * An event group is a collection of events that are managed
+ * together. All events in an event group must belong to the same
+ * domain.
+ */
+typedef void *CUpti_EventGroup;
+
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes for device attribute
+ * CUPTI_DEVICE_ATTR_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA              = 0,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_QUADRO             = 1,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_GEFORCE            = 2,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TEGRA              = 3,
+} CUpti_DeviceAttributeDeviceClass;
+
+/**
+ * \brief Device attributes.
+ *
+ * CUPTI device attributes. These attributes can be read using \ref
+ * cuptiDeviceGetAttribute.
+ */
+typedef enum {
+  /**
+   * Number of event IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_ID                            = 1,
+  /**
+   * Number of event domain IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_DOMAIN_ID                     = 2,
+  /**
+   * Get global memory bandwidth in Kbytes/sec. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH                 = 3,
+  /**
+   * Get theoretical maximum number of instructions per cycle. Value
+   * is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_PER_CYCLE                   = 4,
+  /**
+   * Get theoretical maximum number of single precision instructions
+   * that can be executed per second. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_THROUGHPUT_SINGLE_PRECISION = 5,
+  /**
+   * Get number of frame buffers for device.  Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS                       = 6,
+  /**
+   * Get PCIE link rate in Mega bits/sec for device. Return 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_RATE                          = 7,
+  /**
+   * Get PCIE link width for device. Return 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH                         = 8,
+  /**
+   * Get PCIE generation for device. Return 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_GEN                                = 9,
+  /**
+   * Get the class for the device. Value is a
+   * CUpti_DeviceAttributeDeviceClass.
+   */
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS                            = 10,
+  /**
+   * Get the peak single precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE                       = 11,
+  /**
+   * Get the peak double precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE                       = 12,
+  /**
+   * Get number of L2 units. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_L2_UNITS                           = 13,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_SHARED
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_SHARED = 14,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_L1
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_L1 = 15,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_EQUAL
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_EQUAL = 16,
+  /**
+   * Get the peak half precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE                       = 17,
+  /**
+   * Check if Nvlink is connected to device. Returns 1, if at least one
+   * Nvlink is connected to the device, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVLINK_PRESENT                          = 18,
+  /**
+   * Check if Nvlink is present between GPU and CPU. Returns the bandwidth,
+   * in bytes/sec, if Nvlink is present, returns 0 otherwise.
+   * Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW                       = 19,
+  /**
+   * Check if NVSwitch is present in the underlying topology.
+   * Returns 1, if present, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVSWITCH_PRESENT                        = 20,
+  CUPTI_DEVICE_ATTR_FORCE_INT                               = 0x7fffffff,
+} CUpti_DeviceAttribute;
+
+/**
+ * \brief Event domain attributes.
+ *
+ * Event domain attributes. Except where noted, all the attributes can
+ * be read using either \ref cuptiDeviceGetEventDomainAttribute or
+ * \ref cuptiEventDomainGetAttribute.
+ */
+typedef enum {
+  /**
+   * Event domain name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_NAME                 = 0,
+  /**
+   * Number of instances of the domain for which event counts will be
+   * collected.  The domain may have additional instances that cannot
+   * be profiled (see CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT).
+   * Can be read only with \ref
+   * cuptiDeviceGetEventDomainAttribute. Value is a uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT       = 1,
+  /**
+   * Total number of instances of the domain, including instances that
+   * cannot be profiled.  Use CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT
+   * to get the number of instances that can be profiled. Can be read
+   * only with \ref cuptiDeviceGetEventDomainAttribute. Value is a
+   * uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT = 3,
+  /**
+   * Collection method used for events contained in the event domain.
+   * Value is a \ref CUpti_EventCollectionMethod.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_COLLECTION_METHOD    = 4,
+
+  CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT      = 0x7fffffff,
+} CUpti_EventDomainAttribute;
+
+/**
+ * \brief The collection method used for an event.
+ *
+ * The collection method indicates how an event is collected.
+ */
+typedef enum {
+  /**
+   * Event is collected using a hardware global performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_PM                  = 0,
+  /**
+   * Event is collected using a hardware SM performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_SM                  = 1,
+  /**
+   * Event is collected using software instrumentation.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_INSTRUMENTED        = 2,
+  /**
+   * Event is collected using NvLink throughput counter method.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_NVLINK_TC           = 3,
+  CUPTI_EVENT_COLLECTION_METHOD_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMethod;
+
+/**
+ * \brief Event group attributes.
+ *
+ * Event group attributes. These attributes can be read using \ref
+ * cuptiEventGroupGetAttribute. Attributes marked [rw] can also be
+ * written using \ref cuptiEventGroupSetAttribute.
+ */
+typedef enum {
+  /**
+   * The domain to which the event group is bound. This attribute is
+   * set when the first event is added to the group.  Value is a
+   * CUpti_EventDomainID.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID              = 0,
+  /**
+   * [rw] Profile all the instances of the domain for this
+   * eventgroup. This feature can be used to get load balancing
+   * across all instances of a domain. Value is an integer.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES = 1,
+  /**
+   * [rw] Reserved for user data.
+   */
+  CUPTI_EVENT_GROUP_ATTR_USER_DATA                    = 2,
+  /**
+   * Number of events in the group. Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS                   = 3,
+  /**
+   * Enumerates events in the group. Value is a pointer to buffer of
+   * size sizeof(CUpti_EventID) * num_of_events in the eventgroup.
+   * num_of_events can be queried using
+   * CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENTS                       = 4,
+  /**
+   * Number of instances of the domain bound to this event group that
+   * will be counted.  Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT               = 5,
+  /**
+   * Event group scope can be set to CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT for an eventGroup, before
+   * adding any event.
+   * Sets the scope of eventgroup as CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT when the scope of the events
+   * that will be added is CUPTI_EVENT_PROFILING_SCOPE_BOTH.
+   * If profiling scope of event is either
+   * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or CUPTI_EVENT_PROFILING_SCOPE_CONTEXT
+   * then setting this attribute will not affect the default scope.
+   * It is not allowed to add events of different scope to same eventgroup.
+   * Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILING_SCOPE               = 6,
+  CUPTI_EVENT_GROUP_ATTR_FORCE_INT                     = 0x7fffffff,
+} CUpti_EventGroupAttribute;
+
+/**
+* \brief Profiling scope for event.
+*
+* Profiling scope of event indicates if the event can be collected at context
+* scope or device scope or both i.e. it can be collected at any of context or
+* device scope.
+*/
+typedef enum {
+  /**
+   * Event is collected at context scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_CONTEXT                 = 0,
+  /**
+   * Event is collected at device scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_DEVICE                  = 1,
+  /**
+   * Event can be collected at device or context scope.
+   * The scope can be set using \ref cuptiEventGroupSetAttribute API.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_BOTH                    = 2,
+  CUPTI_EVENT_PROFILING_SCOPE_FORCE_INT               = 0x7fffffff
+} CUpti_EventProfilingScope;
+
+/**
+ * \brief Event attributes.
+ *
+ * Event attributes. These attributes can be read using \ref
+ * cuptiEventGetAttribute.
+ */
+typedef enum {
+  /**
+   * Event name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_ATTR_NAME              = 0,
+  /**
+   * Short description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of event. Value is CUpti_EventCategory.
+   */
+  CUPTI_EVENT_ATTR_CATEGORY          = 3,
+  /**
+   * Profiling scope of the events. It can be either device or context or both.
+   * Value is a \ref CUpti_EventProfilingScope.
+   */
+  CUPTI_EVENT_ATTR_PROFILING_SCOPE   = 5,
+
+  CUPTI_EVENT_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_EventAttribute;
+
+/**
+ * \brief Event collection modes.
+ *
+ * The event collection mode determines the period over which the
+ * events within the enabled event groups will be collected.
+ */
+typedef enum {
+  /**
+   * Events are collected for the entire duration between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls.
+   * Event values are reset when the events are read.
+   * For CUDA toolkit v6.0 and older this was the default mode.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS          = 0,
+  /**
+   * Events are collected only for the durations of kernel executions
+   * that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls. Event collection begins when a
+   * kernel execution begins, and stops when kernel execution
+   * completes. Event values are reset to zero when each kernel
+   * execution begins. If multiple kernel executions occur between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls then the
+   * event values must be read after each kernel launch if those
+   * events need to be associated with the specific kernel launch.
+   * Note that collection in this mode may significantly change the
+   * overall performance characteristics of the application because
+   * kernel executions that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls are serialized on the GPU.
+   * This is the default mode from CUDA toolkit v6.5
+   */
+  CUPTI_EVENT_COLLECTION_MODE_KERNEL              = 1,
+  CUPTI_EVENT_COLLECTION_MODE_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMode;
+
+/**
+ * \brief An event category.
+ *
+ * Each event is assigned to a category that represents the general
+ * type of the event. An event's category is accessed using \ref
+ * cuptiEventGetAttribute and the CUPTI_EVENT_ATTR_CATEGORY attribute.
+ */
+typedef enum {
+  /**
+   * An instruction related event.
+   */
+  CUPTI_EVENT_CATEGORY_INSTRUCTION     = 0,
+  /**
+   * A memory related event.
+   */
+  CUPTI_EVENT_CATEGORY_MEMORY          = 1,
+  /**
+   * A cache related event.
+   */
+  CUPTI_EVENT_CATEGORY_CACHE           = 2,
+  /**
+   * A profile-trigger event.
+   */
+  CUPTI_EVENT_CATEGORY_PROFILE_TRIGGER = 3,
+  /**
+   * A system event.
+   */
+  CUPTI_EVENT_CATEGORY_SYSTEM  = 4,
+  CUPTI_EVENT_CATEGORY_FORCE_INT       = 0x7fffffff
+} CUpti_EventCategory;
+
+/**
+ * \brief The overflow value for a CUPTI event.
+ *
+ * The CUPTI event value that indicates an overflow.
+ */
+#define CUPTI_EVENT_OVERFLOW ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
+
+/**
+ * \brief The value that indicates the event value is invalid
+ */
+#define CUPTI_EVENT_INVALID ((uint64_t)0xFFFFFFFFFFFFFFFEULL)
+
+/**
+ * \brief Flags for cuptiEventGroupReadEvent and
+ * cuptiEventGroupReadAllEvents.
+ *
+ * Flags for \ref cuptiEventGroupReadEvent and \ref
+ * cuptiEventGroupReadAllEvents.
+ */
+typedef enum {
+  /**
+   * No flags.
+   */
+  CUPTI_EVENT_READ_FLAG_NONE          = 0,
+  CUPTI_EVENT_READ_FLAG_FORCE_INT     = 0x7fffffff,
+} CUpti_ReadEventFlags;
+
+
+/**
+ * \brief A set of event groups.
+ *
+ * A set of event groups. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a set indicates the event groups that can be enabled at the same
+ * time (i.e. all the events in the set can be collected
+ * simultaneously).
+ */
+typedef struct {
+  /**
+   * The number of event groups in the set.
+   */
+  uint32_t numEventGroups;
+  /**
+   * An array of \p numEventGroups event groups.
+   */
+  CUpti_EventGroup *eventGroups;
+} CUpti_EventGroupSet;
+
+/**
+ * \brief A set of event group sets.
+ *
+ * A set of event group sets. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a CUpti_EventGroupSets indicates the number of passes required to
+ * collect all the events, and the event groups that should be
+ * collected during each pass.
+ */
+typedef struct {
+  /**
+   * Number of event group sets.
+   */
+  uint32_t numSets;
+  /**
+   * An array of \p numSets event group sets.
+   */
+  CUpti_EventGroupSet *sets;
+} CUpti_EventGroupSets;
+
+/**
+ * \brief Set the event collection mode.
+ *
+ * Set the event collection mode for a \p context.  The \p mode
+ * controls the event collection behavior of all events in event
+ * groups created in the \p context. This API is invalid in kernel
+ * replay mode.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \param mode The event collection mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_OPERATION if called when replay mode is enabled
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if mode is not supported on the device
+ */
+
+CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context,
+                                                 CUpti_EventCollectionMode mode);
+
+/**
+ * \brief Read a device attribute.
+ *
+ * Read a device attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param attrib The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a device attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
+                                             CUpti_DeviceAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Read a device timestamp.
+ *
+ * Returns the device timestamp in \p *timestamp. The timestamp is
+ * reported in nanoseconds and indicates the time since the device was
+ * last reset.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context A context on the device from which to get the timestamp
+ * \param timestamp Returns the device timestamp
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL
+ *
+ * **DEPRECATED** This API is deprecated as of CUDA 11.3
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context,
+                                             uint64_t *timestamp);
+
+/**
+ * \brief Get the number of domains for a device.
+ *
+ * Returns the number of domains in \p numDomains for a device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
+                                                   uint32_t *numDomains);
+
+/**
+ * \brief Get the event domains for a device.
+ *
+ * Returns the event domains IDs in \p domainArray for a device.  The
+ * size of the \p domainArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p domainArray buffer must be at
+ * least \p numDomains * sizeof(CUpti_EventDomainID), otherwise not all
+ * domains will be returned. The value returned in \p
+ * *arraySizeBytes contains the number of bytes returned in \p
+ * domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns the IDs of the event domains for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device,
+                                                 size_t *arraySizeBytes,
+                                                 CUpti_EventDomainID *domainArray);
+
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device,
+                                                        CUpti_EventDomainID eventDomain,
+                                                        CUpti_EventDomainAttribute attrib,
+                                                        size_t *valueSize,
+                                                        void *value);
+
+/**
+ * \brief Get the number of event domains available on any device.
+ *
+ * Returns the total number of event domains available on any
+ * CUDA-capable device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains);
+
+/**
+ * \brief Get the event domains available on any device.
+ *
+ * Returns all the event domains available on any CUDA-capable device.
+ * Event domain IDs are returned in \p domainArray. The size of the \p
+ * domainArray buffer is given by \p *arraySizeBytes. The size of the
+ * \p domainArray buffer must be at least \p numDomains *
+ * sizeof(CUpti_EventDomainID), otherwise not all domains will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns all the event domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
+                                           CUpti_EventDomainID *domainArray);
+
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain,
+                                                  CUpti_EventDomainAttribute attrib,
+                                                  size_t *valueSize,
+                                                  void *value);
+
+/**
+ * \brief Get number of events in a domain.
+ *
+ * Returns the number of events in \p numEvents for a domain.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param numEvents Returns the number of events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain,
+                                                  uint32_t *numEvents);
+
+/**
+ * \brief Get the events in a domain.
+ *
+ * Returns the event IDs in \p eventArray for a domain.  The size of
+ * the \p eventArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p eventArray buffer must be at least \p numEvents *
+ * sizeof(CUpti_EventID), otherwise not all events will be returned. The
+ * value returned in \p *arraySizeBytes contains the number of bytes
+ * returned in \p eventArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param arraySizeBytes The size of \p eventArray in bytes, and
+ * returns the number of bytes written to \p eventArray
+ * \param eventArray Returns the IDs of the events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or \p
+ * eventArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
+                                                size_t *arraySizeBytes,
+                                                CUpti_EventID *eventArray);
+
+/**
+ * \brief Get an event attribute.
+ *
+ * Returns an event attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param event ID of the event
+ * \param attrib The event attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
+                                            CUpti_EventAttribute attrib,
+                                            size_t *valueSize,
+                                            void *value);
+
+/**
+ * \brief Find an event by name.
+ *
+ * Find an event by name and return the event ID in \p *event.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventName The name of the event to find
+ * \param event Returns the ID of the found event or undefined if
+ * unable to find the event
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_NAME if unable to find an event
+ * with name \p eventName. In this case \p *event is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventName or \p event are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
+                                             const char *eventName,
+                                             CUpti_EventID *event);
+
+/**
+ * \brief Create a new event group for a context.
+ *
+ * Creates a new event group for \p context and returns the new group
+ * in \p *eventGroup.
+ * \note \p flags are reserved for future use and should be set to zero.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context for the event group
+ * \param eventGroup Returns the new event group
+ * \param flags Reserved - must be zero
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
+                                           CUpti_EventGroup *eventGroup,
+                                           uint32_t flags);
+
+/**
+ * \brief Destroy an event group.
+ *
+ * Destroy an \p eventGroup and free its resources. An event group
+ * cannot be destroyed if it is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if the event group is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Read an event group attribute.
+ *
+ * Read an event group attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event group attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t *valueSize,
+                                                 void *value);
+
+/**
+ * \brief Write an event group attribute.
+ *
+ * Write an event group attribute.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event group attribute, or if
+ * \p attrib is not a writable attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t valueSize,
+                                                 void *value);
+
+/**
+ * \brief Add an event to an event group.
+ *
+ * Add an event to an event group. The event add can fail for a number of reasons:
+ * \li The event group is enabled
+ * \li The event does not belong to the same event domain as the
+ * events that are already in the event group
+ * \li Device limitations on the events that can belong to the same group
+ * \li The event group is full
+ *
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to add to the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p event belongs to a
+ * different event domain than the events already in \p eventGroup, or
+ * if a device limitation prevents \p event from being collected at
+ * the same time as the events already in \p eventGroup
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if \p eventGroup is full
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
+                                             CUpti_EventID event);
+
+/**
+ * \brief Remove an event from an event group.
+ *
+ * Remove \p event from an event group. The event cannot be
+ * removed if the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to remove from the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
+                                                CUpti_EventID event);
+
+/**
+ * \brief Remove all events from an event group.
+ *
+ * Remove all events from an event group. Events cannot be removed if
+ * the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Zero all the event counts in an event group.
+ *
+ * Zero all the event counts in an event group.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Enable an event group.
+ *
+ * Enable an event group. Enabling an event group zeros the value of
+ * all the events in the group and then starts collection of those
+ * events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling
+ * and hardware is busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Disable an event group.
+ *
+ * Disable an event group. Disabling an event group stops collection
+ * of events contained in the group.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Read the value for an event in an event group.
+ *
+ * Read the value for an event in an event group. The event value is
+ * returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of the \p
+ * eventValueBuffer buffer. The buffer must be at least sizeof(uint64_t)
+ * if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set
+ * on the group containing the event.  The buffer must be at least
+ * (sizeof(uint64_t) * number of domain instances) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is set on the
+ * group.
+ *
+ * If any instance of an event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading an event from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param event The event to read
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer
+ * in bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event value(s)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes or \p eventValueBuffer is NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
+                                              CUpti_ReadEventFlags flags,
+                                              CUpti_EventID event,
+                                              size_t *eventValueBufferSizeBytes,
+                                              uint64_t *eventValueBuffer);
+
+/**
+ * \brief Read the values for all the events in an event group.
+ *
+ * Read the values for all the events in an event group. The event
+ * values are returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of \p
+ * eventValueBuffer.  The buffer must be at least (sizeof(uint64_t) *
+ * number of events in group) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set on
+ * the group containing the events.  The buffer must be at least
+ * (sizeof(uint64_t) * number of domain instances * number of events in
+ * group) if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is
+ * set on the group.
+ *
+ * The data format returned in \p eventValueBuffer is:
+ *    - domain instance 0: event0 event1 ... eventN
+ *    - domain instance 1: event0 event1 ... eventN
+ *    - ...
+ *    - domain instance M: event0 event1 ... eventN
+ *
+ * The event order in \p eventValueBuffer is returned in \p
+ * eventIdArray. The size of \p eventIdArray is specified in \p
+ * eventIdArraySizeBytes. The size should be at least
+ * (sizeof(CUpti_EventID) * number of events in group).
+ *
+ * If any instance of any event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading events from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer in
+ * bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event values
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events in the same order
+ * as the values returned in \p eventValueBuffer.
+ * \param numEventIdsRead Returns the number of event IDs returned
+ * in \p eventIdArray
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes, \p eventValueBuffer, \p
+ * eventIdArraySizeBytes, \p eventIdArray or \p numEventIdsRead is
+ * NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * or \p eventIdArray is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup       eventGroup,
+                                                  CUpti_ReadEventFlags   flags,
+                                                  size_t                 *eventValueBufferSizeBytes,
+                                                  uint64_t               *eventValueBuffer,
+                                                  size_t                 *eventIdArraySizeBytes,
+                                                  CUpti_EventID          *eventIdArray,
+                                                  size_t                 *numEventIdsRead);
+
+/**
+ * \brief For a set of events, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events.
+ *
+ * The number of events that can be collected simultaneously varies by
+ * device and by the type of the events. When events can be collected
+ * simultaneously, they may need to be grouped into multiple event
+ * groups because they are from different event domains. This function
+ * takes a set of events and determines how many passes are required
+ * to collect all those events, and which events can be collected
+ * simultaneously in each pass.
+ *
+ * The CUpti_EventGroupSets returned in \p eventGroupPasses indicates
+ * how many passes are required to collect the events with the \p
+ * numSets field. Within each event group set, the \p sets array
+ * indicates the event groups that should be collected on each pass.
+ * \note \b Thread-safety: this function is thread safe, but client
+ * must guard against another thread simultaneously destroying \p
+ * context.
+ *
+ * \param context The context for event collection
+ * \param eventIdArraySizeBytes Size of \p eventIdArray in bytes
+ * \param eventIdArray Array of event IDs that need to be grouped
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context,
+                                               size_t eventIdArraySizeBytes,
+                                               CUpti_EventID *eventIdArray,
+                                               CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Destroy an event group sets object.
+ *
+ * Destroy a CUpti_EventGroupSets object.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSets The object to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if any of the event groups
+ * contained in the sets is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSets is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets);
+
+
+/**
+ * \brief Enable an event group set.
+ *
+ * Enable a set of event groups. Enabling a set of event groups zeros the value of
+ * all the events in all the groups and then starts collection of those events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if \p eventGroupSet does not contain any events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroupSet cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling and hardware is
+ * busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet);
+
+/**
+ * \brief Disable an event group set.
+ *
+ * Disable a set of event groups. Disabling a set of event groups
+ * stops collection of events contained in the groups.
+ * \note \b Thread-safety: this function is thread safe.
+ * \note If this call fails, some of the event groups in the set may be disabled
+ * and other event groups may remain enabled.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet);
+
+/**
+ * \brief Enable kernel replay mode.
+ *
+ * Set profiling mode for the context to replay mode. In this mode,
+ * any number of events can be collected in one run of the kernel. The
+ * event collection mode will automatically switch to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  In this mode, \ref
+ * cuptiSetEventCollectionMode will return
+ * CUPTI_ERROR_INVALID_OPERATION.
+ * \note Kernels might take longer to run if many events are enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context);
+
+/**
+ * \brief Disable kernel replay mode.
+ *
+ * Set profiling mode for the context to non-replay (default)
+ * mode. Event collection mode will be set to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  All previously enabled
+ * event groups and event group sets will be disabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context);
+
+/**
+ * \brief Function type for getting updates on kernel replay.
+ *
+ * \param kernelName The mangled kernel name
+ * \param numReplaysDone Number of replays done so far
+ * \param customData Pointer to any custom data passed in when subscribing
+ */
+typedef void (CUPTIAPI *CUpti_KernelReplayUpdateFunc)(
+    const char *kernelName,
+    int numReplaysDone,
+    void *customData);
+
+/**
+ * \brief Subscribe to kernel replay updates.
+ *
+ * When subscribed, the function pointer passed in will be called each time a
+ * kernel run is finished during kernel replay. Any previously subscribed function
+ * pointer will be replaced. Passing in NULL as the function pointer unsubscribes
+ * from the updates.
+ *
+ * \param updateFunc The update function pointer
+ * \param customData Pointer to any custom data
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData);
+
+/** @} */ /* END CUPTI_EVENT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_EVENTS_H_*/
+
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cb1ac2132b3d53bd67f39f1e4ebd85d3ea61465
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h
@@ -0,0 +1,419 @@
+#if !defined(_CUPTI_PCSAMPLING_UTIL_H_)
+#define _CUPTI_PCSAMPLING_UTIL_H_
+
+#include <cupti_pcsampling.h>
+#include <fstream>
+
+#ifndef CUPTIUTILAPI
+#ifdef _WIN32
+#define CUPTIUTILAPI __stdcall
+#else
+#define CUPTIUTILAPI
+#endif
+#endif
+
+#define ACTIVITY_RECORD_ALIGNMENT 8
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
+#define END_PACKED_ALIGNMENT __pragma(pack(pop))
+#elif defined(__GNUC__) // GCC
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT
+#else // all other compilers
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif
+
+#ifndef CUPTI_UTIL_STRUCT_SIZE
+#define CUPTI_UTIL_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+#ifndef CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS /* true iff 'member' starts within the first 'structSize' bytes of 'type' */
+#define CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS(type, member, structSize)    \
+    (offsetof(type, member) < (structSize)) /* argument parenthesized so low-precedence expressions (e.g. ?:) compare as a whole */
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+    #pragma GCC visibility push(default)
+#endif
+
+namespace CUPTI { namespace PcSamplingUtil {
+
+/**
+ * \defgroup CUPTI_PCSAMPLING_UTILITY CUPTI PC Sampling Utility API
+ * Functions, types, and enums that implement the CUPTI PC Sampling Utility API.
+ * @{
+ */
+
+/**
+ * \brief Header info will be stored in file.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Version of file format.
+   */
+  uint32_t version;
+  /**
+   * Total number of buffers present in the file.
+   */
+  uint32_t totalBuffers;
+} Header;
+
+/**
+ * \brief BufferInfo will be stored in the file for every buffer,
+ *  i.e. for every call of the \ref CuptiUtilPutPcSampData API.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Total number of PC records.
+   */
+  uint64_t recordCount;
+  /**
+   * Count of all stall reasons supported on the GPU.
+   */
+  size_t numStallReasons;
+  /**
+   * Total number of stall reasons in a single record.
+   */
+  uint64_t numSelectedStallReasons;
+  /**
+   * Buffer size in bytes.
+   */
+  uint64_t bufferByteSize;
+} BufferInfo;
+
+/**
+ * \brief Holds the names and respective indexes of all available
+ * stall reasons.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Number of all available stall reasons.
+   */
+  size_t numStallReasons;
+  /**
+   * Names of all available stall reasons.
+   */
+  char **stallReasons;
+  /**
+   * Indexes of all available stall reasons.
+   */
+  uint32_t *stallReasonIndex;
+} PcSamplingStallReasons;
+
+typedef enum {
+  /**
+   * Invalid buffer type.
+   */
+  PC_SAMPLING_BUFFER_INVALID             = 0,
+  /**
+   * Refers to CUpti_PCSamplingData buffer.
+   */
+  PC_SAMPLING_BUFFER_PC_TO_COUNTER_DATA  = 1
+} PcSamplingBufferType;
+
+/**
+ * \brief CUPTI PC sampling utility API result codes.
+ *
+ * Error and result codes returned by CUPTI PC sampling utility API.
+ */
+typedef enum {
+  /**
+   * No error
+   */
+  CUPTI_UTIL_SUCCESS                                       = 0,
+  /**
+   * One or more of the parameters are invalid.
+   */
+  CUPTI_UTIL_ERROR_INVALID_PARAMETER                       = 1,
+  /**
+   * Unable to create a new file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE                   = 2,
+  /**
+   * Unable to open a file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE                     = 3,
+  /**
+   * Read or write operation failed
+   */
+  CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED             = 4,
+  /**
+   * Provided file handle is corrupted.
+   */
+  CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED                   = 5,
+  /**
+   * seek operation failed.
+   */
+  CUPTI_UTIL_ERROR_SEEK_OPERATION_FAILED                   = 6,
+  /**
+   * Unable to allocate enough memory to perform the requested
+   * operation.
+   */
+  CUPTI_UTIL_ERROR_OUT_OF_MEMORY                           = 7,
+  /**
+   * An unknown internal error has occurred.
+   */
+  CUPTI_UTIL_ERROR_UNKNOWN                                 = 999,
+  CUPTI_UTIL_ERROR_FORCE_INT                               = 0x7fffffff
+} CUptiUtilResult;
+
+/**
+ * \brief Params for \ref CuptiUtilPutPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_PutPcSampDataParamsSize.
+   * The CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * Type of buffer to store in file.
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * PC sampling buffer.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configured attributes.
+   */
+  size_t numAttributes;
+  /**
+   * Refer to \ref CUpti_PCSamplingConfigurationInfo.
+   * It is expected to provide configuration details of at least the
+   * CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON attribute.
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer to \ref PcSamplingStallReasons.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+  /**
+   * Name of the file in which to store the buffer.
+   */
+  const char* fileName;
+} CUptiUtil_PutPcSampDataParams;
+#define CUptiUtil_PutPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_PutPcSampDataParams, fileName)
+
+/**
+ * \brief Dump PC sampling data into the file.
+ *
+ * This API can be called multiple times.
+ * It will append the buffer to the file.
+ * For every buffer it will store BufferInfo
+ * so that before retrieving data it will help to allocate a buffer
+ * to store the retrieved data.
+ * This API creates the file if it does not already exist.
+ * If the stallReasonIndex or stallReasons pointer of \ref PcSamplingStallReasons is NULL
+ * then stall reasons data will not be stored in the file.
+ * It is expected to store all available stall reason data at least once to refer to it during
+ * offline correlation.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pSamplingData, pParams pointer is NULL or stall reason configuration details not provided
+ * or filename is empty.
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilPutPcSampData(CUptiUtil_PutPcSampDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetHeaderData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_GetHeaderDataParamsSize.
+   * The CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Header Info.
+   */
+  Header headerInfo;
+
+} CUptiUtil_GetHeaderDataParams;
+#define CUptiUtil_GetHeaderDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetHeaderDataParams, headerInfo)
+
+/**
+ * \brief Get header data of file.
+ *
+ * This API must be called once initially while retrieving data from file.
+ * The \ref Header structure gives info about the total number
+ * of buffers present in the file.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either pParams or fileHandler is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED  failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetHeaderData(CUptiUtil_GetHeaderDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetBufferInfo
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_GetBufferInfoParamsSize.
+   * The CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Buffer Info.
+   */
+  BufferInfo bufferInfoData;
+} CUptiUtil_GetBufferInfoParams;
+#define CUptiUtil_GetBufferInfoParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetBufferInfoParams, bufferInfoData)
+
+/**
+ * \brief Get buffer info data of file.
+ *
+ * This API must be called every time before calling the CuptiUtilGetPcSampData API.
+ * The \ref BufferInfo structure gives info about recordCount and the number of stall reasons
+ * (numSelectedStallReasons) of every record in the buffer. This will help to allocate the exact buffer to retrieve data into.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either pParams or fileHandler is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetBufferInfo(CUptiUtil_GetBufferInfoParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUptiUtil_GetPcSampDataParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Type of buffer to retrieve from file.
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * Pointer to collected buffer info using \ref CuptiUtilGetBufferInfo
+   */
+  BufferInfo *pBufferInfoData;
+  /**
+   * Pointer to allocated memory to store retrieved data from file.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configuration attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   * For stallReasons field of \ref PcSamplingStallReasons it is expected to
+   * allocate memory for each string element of array.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+} CUptiUtil_GetPcSampDataParams;
+#define CUptiUtil_GetPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetPcSampDataParams, pPcSamplingStallReasons)
+
+/**
+ * \brief Retrieve PC sampling data from file into allocated buffer.
+ *
+ * This API must be called after CuptiUtilGetBufferInfo API.
+ * It will retrieve data from file into allocated buffer.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid,
+ * if either of pSamplingData or pParams is NULL, or if filename is empty.
+ * If pPcSamplingStallReasons is not NULL then error out if any of stallReasonIndex,
+ * stallReasons or a stallReasons array element pointer is NULL.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetPcSampData(CUptiUtil_GetPcSampDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilMergePcSampData
+ */
+typedef struct
+{
+  /**
+   * Size of the data structure i.e. CUptiUtil_MergePcSampDataParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * Number of buffers to merge.
+   */
+  size_t numberOfBuffers;
+  /**
+   * Pointer to array of buffers to merge
+   */
+  CUpti_PCSamplingData *PcSampDataBuffer;
+  /**
+   * Pointer to array of merged buffers as per the range id.
+   */
+  CUpti_PCSamplingData **MergedPcSampDataBuffers;
+  /**
+   * Number of merged buffers.
+   */
+  size_t *numMergedBuffer;
+} CUptiUtil_MergePcSampDataParams;
+#define CUptiUtil_MergePcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_MergePcSampDataParams, numMergedBuffer)
+
+/**
+ * \brief Merge PC sampling data range id wise.
+ *
+ * This API merges PC sampling data range id wise.
+ * It allocates memory for the merged data, fills the data into it,
+ * and provides the buffer pointer in the MergedPcSampDataBuffers field.
+ * The user is expected to free the merged data buffers after use.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if param struct size is invalid
+ * or count of buffers to merge is invalid i.e. less than 1
+ * or either of PcSampDataBuffer, MergedPcSampDataBuffers, numMergedBuffer is NULL
+ * \retval CUPTI_UTIL_ERROR_OUT_OF_MEMORY Unable to allocate memory for merged buffer.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilMergePcSampData(CUptiUtil_MergePcSampDataParams *pParams);
+
+/** @} */ /* END CUPTI_PCSAMPLING_UTILITY */
+
+} }
+
+#if defined(__GNUC__)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2896451245f9ad325175330c6715b80bf639832
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2010-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_RESULT_H_)
+#define _CUPTI_RESULT_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_RESULT_API CUPTI Result Codes
+ * Error and result codes returned by CUPTI functions.
+ * @{
+ */
+
+/**
+ * \brief CUPTI result codes.
+ *
+ * Error and result codes returned by CUPTI functions.
+ */
+typedef enum {
+    /**
+     * No error.
+     */
+    CUPTI_SUCCESS                                       = 0,
+    /**
+     * One or more of the parameters is invalid.
+     */
+    CUPTI_ERROR_INVALID_PARAMETER                       = 1,
+    /**
+     * The device does not correspond to a valid CUDA device.
+     */
+    CUPTI_ERROR_INVALID_DEVICE                          = 2,
+    /**
+     * The context is NULL or not valid.
+     */
+    CUPTI_ERROR_INVALID_CONTEXT                         = 3,
+    /**
+     * The event domain id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID                 = 4,
+    /**
+     * The event id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_ID                        = 5,
+    /**
+     * The event name is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_NAME                      = 6,
+    /**
+     * The current operation cannot be performed due to dependency on
+     * other factors.
+     */
+    CUPTI_ERROR_INVALID_OPERATION                       = 7,
+    /**
+     * Unable to allocate enough memory to perform the requested
+     * operation.
+     */
+    CUPTI_ERROR_OUT_OF_MEMORY                           = 8,
+    /**
+     * An error occurred on the performance monitoring hardware.
+     */
+    CUPTI_ERROR_HARDWARE                                = 9,
+    /**
+     * The output buffer size is not sufficient to return all
+     * requested data.
+     */
+    CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT           = 10,
+    /**
+     * API is not implemented.
+     */
+    CUPTI_ERROR_API_NOT_IMPLEMENTED                     = 11,
+    /**
+     * The maximum limit is reached.
+     */
+    CUPTI_ERROR_MAX_LIMIT_REACHED                       = 12,
+    /**
+     * The object is not yet ready to perform the requested operation.
+     */
+    CUPTI_ERROR_NOT_READY                               = 13,
+    /**
+     * The current operation is not compatible with the current state
+     * of the object.
+     */
+    CUPTI_ERROR_NOT_COMPATIBLE                          = 14,
+    /**
+     * CUPTI is unable to initialize its connection to the CUDA
+     * driver.
+     */
+    CUPTI_ERROR_NOT_INITIALIZED                         = 15,
+    /**
+     * The metric id is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_ID                        = 16,
+    /**
+     * The metric name is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_NAME                      = 17,
+    /**
+     * The queue is empty.
+     */
+    CUPTI_ERROR_QUEUE_EMPTY                              = 18,
+    /**
+     * Invalid handle (internal?).
+     */
+    CUPTI_ERROR_INVALID_HANDLE                           = 19,
+    /**
+     * Invalid stream.
+     */
+    CUPTI_ERROR_INVALID_STREAM                           = 20,
+    /**
+     * Invalid kind.
+     */
+    CUPTI_ERROR_INVALID_KIND                             = 21,
+    /**
+     * Invalid event value.
+     */
+    CUPTI_ERROR_INVALID_EVENT_VALUE                      = 22,
+    /**
+     * CUPTI is disabled due to conflicts with other enabled profilers.
+     */
+    CUPTI_ERROR_DISABLED                                 = 23,
+    /**
+     * Invalid module.
+     */
+    CUPTI_ERROR_INVALID_MODULE                           = 24,
+    /**
+     * Invalid metric value.
+     */
+    CUPTI_ERROR_INVALID_METRIC_VALUE                     = 25,
+    /**
+     * The performance monitoring hardware is in use by other client.
+     */
+    CUPTI_ERROR_HARDWARE_BUSY                            = 26,
+    /**
+     * The attempted operation is not supported on the current
+     * system or device.
+     */
+    CUPTI_ERROR_NOT_SUPPORTED                            = 27,
+    /**
+     * Unified memory profiling is not supported on the system.
+     * Potential reason could be unsupported OS or architecture.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED               = 28,
+    /**
+     * Unified memory profiling is not supported on the device
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE     = 29,
+    /**
+     * Unified memory profiling is not supported on a multi-GPU
+     * configuration without P2P support between any pair of devices
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES = 30,
+    /**
+     * Unified memory profiling is not supported under the
+     * Multi-Process Service (MPS) environment. CUDA 7.5 removes this
+     * restriction.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_WITH_MPS      = 31,
+    /**
+     * In CUDA 9.0, devices with compute capability 7.0 don't
+     * support CDP tracing
+     */
+    CUPTI_ERROR_CDP_TRACING_NOT_SUPPORTED                = 32,
+    /**
+     * Profiling on virtualized GPU is not supported.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED         = 33,
+    /**
+     * Profiling results might be incorrect for CUDA applications
+     * compiled with nvcc version older than 9.0 for devices with
+     * compute capability 6.0 and 6.1.
+     * Profiling session will continue and CUPTI will notify it using this error code.
+     * User is advised to recompile the application code with nvcc version 9.0 or later.
+     * Ignore this warning if code is already compiled with the recommended nvcc version.
+     */
+    CUPTI_ERROR_CUDA_COMPILER_NOT_COMPATIBLE             = 34,
+    /**
+     * User doesn't have sufficient privileges which are required to
+     * start the profiling session.
+     * One possible reason for this may be that the NVIDIA driver or your system
+     * administrator may have restricted access to the NVIDIA GPU performance counters.
+     * To learn how to resolve this issue and find more information, please visit
+     * https://developer.nvidia.com/CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
+     */
+    CUPTI_ERROR_INSUFFICIENT_PRIVILEGES                  = 35,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not compatible with the
+     * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
+     * in the headers nvperf_host.h and nvperf_target.h.
+     */
+    CUPTI_ERROR_OLD_PROFILER_API_INITIALIZED             = 36,
+    /**
+     * Missing definition of the OpenACC API routine in the linked OpenACC library.
+     *
+     * One possible reason is that OpenACC library is linked statically in the
+     * user application, which might not have the definition of all the OpenACC
+     * API routines needed for the OpenACC profiling, as compiler might ignore
+     * definitions for the functions not used in the application. This issue
+     * can be mitigated by linking the OpenACC library dynamically.
+     */
+    CUPTI_ERROR_OPENACC_UNDEFINED_ROUTINE                = 37,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not supported on devices with
+     * compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+     * These API will be deprecated in a future CUDA release. These are replaced by
+     * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
+     * in the headers nvperf_host.h and nvperf_target.h.
+     */
+    CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED            = 38,
+    /**
+     * CUPTI doesn't allow multiple callback subscribers. Only a single subscriber
+     * can be registered at a time.
+     * Same error code is used when application is launched using NVIDIA tools
+     * like nvprof, Visual Profiler, Nsight Systems, Nsight Compute, cuda-gdb and
+     * cuda-memcheck.
+     */
+    CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED       = 39,
+    /**
+     * Profiling on virtualized GPU is not allowed by hypervisor.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_INSUFFICIENT_PRIVILEGES = 40,
+    /**
+     * Profiling and tracing are not allowed when confidential computing mode
+     * is enabled.
+     */
+    CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED = 41,
+    /**
+     * CUPTI does not support NVIDIA Crypto Mining Processors (CMP).
+     * For more information, please visit https://developer.nvidia.com/ERR_NVCMPGPU
+     */
+    CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED = 42,
+    /**
+     * An unknown internal error has occurred.
+     */
+    CUPTI_ERROR_UNKNOWN                                  = 999,
+    CUPTI_ERROR_FORCE_INT                                = 0x7fffffff
+} CUptiResult;
+
+/**
+ * \brief Get the descriptive string for a CUptiResult.
+ *
+ * Return the descriptive string for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param result The result to get the string for
+ * \param str Returns the string
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ */
+CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result, const char **str);
+
+/** @} */ /* END CUPTI_RESULT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_RESULT_H_*/
+
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..dac73f5d586e4704bfaaccd7d75f6c857e91fff3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h
@@ -0,0 +1,447 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across the entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+typedef enum CUpti_runtime_api_trace_cbid_enum {
+    CUPTI_RUNTIME_TRACE_CBID_INVALID                                                       = 0,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020                                    = 1,
+    CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020                                   = 2,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020                                      = 3,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020                                 = 4,
+    CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020                                        = 5,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020                                      = 6,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020                                   = 7,
+    CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020                                       = 8,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020                                       = 9,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020                                        = 10,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020                                     = 11,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020                                      = 12,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020                                              = 13,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020                                  = 14,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020                                   = 15,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020                                           = 16,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020                                           = 17,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020                                     = 18,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020                                      = 19,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020                                              = 20,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020                                         = 21,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020                                                = 22,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020                                         = 23,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020                                           = 24,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020                                          = 25,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020                                            = 26,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020                                           = 27,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020                                = 28,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020                                        = 29,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020                                          = 30,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020                                              = 31,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020                                            = 32,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020                                       = 33,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020                                     = 34,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020                                     = 35,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020                                   = 36,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020                                  = 37,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020                                = 38,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020                                      = 39,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020                                    = 40,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020                                         = 41,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020                                  = 42,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020                                = 43,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020                                       = 44,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020                                = 45,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020                              = 46,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020                                 = 47,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020                               = 48,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020                                              = 49,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020                                            = 50,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020                                         = 51,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020                                       = 52,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020                                    = 53,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020                                       = 54,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020                                         = 55,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020                                       = 56,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020                                  = 57,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020                                       = 58,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020                           = 59,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020                                 = 60,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020                                  = 61,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020                                 = 62,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020                                       = 63,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020                              = 64,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020                                   = 65,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020                                 = 66,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020                            = 67,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020                           = 68,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020                              = 69,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020                            = 70,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020                                        = 71,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020                             = 72,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020                            = 73,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020                          = 74,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020                         = 75,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020                                = 76,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020                              = 77,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020                    = 78,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020                   = 79,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020                                      = 80,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020                                 = 81,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020                   = 82,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020                  = 83,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020                                      = 84,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020                                     = 85,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020                              = 86,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020                       = 87,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020                                      = 88,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020                                     = 89,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020                              = 90,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020                       = 91,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020                               = 92,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020                             = 93,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020                                   = 94,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020                                 = 95,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020                            = 96,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020                   = 97,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020                         = 98,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020                       = 99,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020                          = 100,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020                         = 101,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020                                       = 102,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020                                      = 103,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020                               = 104,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020                               = 105,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020                        = 106,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020                                = 107,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020                              = 108,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020                                    = 109,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020                                  = 110,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020                             = 111,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020                    = 112,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020                          = 113,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020                        = 114,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020                           = 115,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020                          = 116,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020                                           = 117,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020                                             = 118,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020                            = 119,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020                          = 120,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020                                 = 121,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020                               = 122,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020                                          = 123,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020                                  = 124,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020                                    = 125,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020                                   = 126,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020                                      = 127,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020                                      = 128,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020                                        = 129,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020                                       = 130,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020                                   = 131,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020                                         = 132,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020                                         = 133,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020                                = 134,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020                                         = 135,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020                                        = 136,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020                                    = 137,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020                                          = 138,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020                                    = 139,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020                                            = 140,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020                                       = 141,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020                                            = 142,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020                                       = 143,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020                                            = 144,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020                                       = 145,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020                                = 146,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020                                     = 147,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020                              = 148,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020                              = 149,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020                                = 150,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000                                = 151,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000                                        = 152,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000                                      = 153,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000                                 = 154,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000                              = 155,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000                             = 156,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000                                        = 157,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000                                      = 158,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000                                = 159,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000                                          = 160,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000                                     = 161,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000                                        = 162,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000                                   = 163,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020                                         = 164,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020                                   = 165,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020                                      = 166,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020                                      = 167,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020                                = 168,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020                                = 169,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000                                  = 170,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000                                       = 171,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000                                        = 172,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010                                 = 173,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010                                   = 174,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010                                        = 175,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010                                   = 176,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010                                  = 177,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010                                     = 178,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010                                    = 179,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010                                   = 180,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010                                        = 181,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020                              = 182,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020                            = 183,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020                            = 184,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000                                 = 185,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000                                = 186,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000                        = 187,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000                         = 188,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000                                 = 189,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000                                = 190,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000                        = 191,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000                                = 192,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000                              = 193,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000                                  = 194,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000                         = 195,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000             = 196,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000                                   = 197,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000                               = 198,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000                    = 199,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000                                  = 200,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050                                       = 201,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050                            = 202,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050                                   = 203,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050                                      = 204,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050                        = 205,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000                                       = 206,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000           = 207,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000                                = 208,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050                                        = 209,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050           = 210,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000                                        = 211,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000                                      = 212,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000                                         = 213,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000                                   = 214,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000                                         = 215,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000                                       = 216,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000                                  = 217,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000                                = 218,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000                                = 219,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000                              = 220,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000                             = 221,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000                           = 222,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000                                 = 223,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000                               = 224,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000                                    = 225,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000                             = 226,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000                           = 227,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000                                  = 228,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000                           = 229,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000                         = 230,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000                            = 231,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000                          = 232,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000                                         = 233,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000                                       = 234,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000                                    = 235,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000                                  = 236,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000                              = 237,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000                                 = 238,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000                              = 239,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000                                    = 240,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000                           = 241,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000                                    = 242,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000                                       = 243,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000                                  = 244,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000                                       = 245,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000                                  = 246,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000                                = 247,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000                              = 248,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000                                   = 249,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000                              = 250,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000  = 251,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000                                    = 252,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000                               = 253,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000                                           = 254,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000                               = 255,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000                            = 256,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000                            = 257,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000                         = 258,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000                       = 259,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000                       = 260,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000                            = 261,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000                         = 262,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000                       = 263,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000                        = 264,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000                   = 265,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000                                = 266,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000                               = 267,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000                   = 268,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000                             = 269,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000                        = 270,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000                              = 271,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000                  = 272,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000                                    = 273,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000                               = 274,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000                      = 275,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000              = 276,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000                              = 277,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000                            = 278,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000                      = 279,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000                 = 280,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000                        = 281,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000                   = 282,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000                           = 283,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000                                     = 284,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000                                = 285,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000                                        = 286,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000                           = 287,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000                           = 288,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000                                 = 289,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000                                 = 290,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000                           = 291,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000                           = 292,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000                                 = 293,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000                           = 294,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000                           = 295,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000                                   = 296,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000                             = 297,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000                             = 298,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000                        = 299,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000                                  = 300,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000                                         = 301,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000                               = 302,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000                                   = 303,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000                                  = 304,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000                           = 305,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000                         = 306,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000                               = 307,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000                            = 308,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000                                   = 309,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000                                   = 310,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000                                        = 311,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000                                   = 312,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000                                   = 313,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000                                       = 314,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000                                 = 315,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000                            = 316,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000                                  = 317,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000                             = 318,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000                                   = 319,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000                              = 320,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000                             = 321,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000                                      = 322,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000                                      = 323,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010                               = 324,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010                          = 325,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010                       = 326,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010                    = 327,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020                       = 328,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200              = 329,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200                                     = 330,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200                                = 331,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020                       = 332,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020                       = 333,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020                         = 334,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020                                    = 335,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000                                    = 336,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000                          = 337,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000                      = 338,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000                        = 339,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000                        = 340,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000                               = 341,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000                          = 342,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000                                 = 343,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000                            = 344,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000                                 = 345,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000                            = 346,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010                   = 347,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000                                        = 348,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000                                   = 349,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010                         = 350,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010                       = 351,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010                               = 352,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010                   = 353,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010                 = 354,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010                         = 355,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010               = 356,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010             = 357,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010                     = 358,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010                           = 359,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010                  = 360,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010                   = 361,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010                            = 362,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010                       = 363,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010                       = 364,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010                              = 365,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010                         = 366,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010                         = 367,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010                   = 368,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010                     = 369,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010                               = 370,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010                          = 371,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020                            = 372,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020                                        = 373,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020                                   = 374,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020                                          = 375,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020                                     = 376,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020                                      = 377,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020                                = 378,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020                                = 379,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020                                   = 380,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020                                      = 381,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020                                   = 382,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020                                      = 383,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020                                     = 384,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020                                   = 385,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020                                   = 386,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020                     = 387,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020                   = 388,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020                               = 389,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020                               = 390,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020                                = 391,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020                           = 392,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020                   = 393,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020              = 394,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020                     = 395,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020                = 396,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020               = 397,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020         = 398,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020         = 399,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020                 = 400,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020           = 401,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020           = 402,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020     = 403,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020       = 404,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030                     = 405,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030                                = 406,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030                           = 407,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030                                 = 408,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030                            = 409,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030                       = 410,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030                    = 411,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030               = 412,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030                                   = 413,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030                                   = 414,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030                                  = 415,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030                              = 416,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030                             = 417,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040                          = 418,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040                               = 419,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040                         = 420,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040                                = 421,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040                          = 422,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040                                 = 423,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040                         = 424,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040                         = 425,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060                                = 426,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060                                = 427,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060                         = 428,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060                = 429,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060                                    = 430,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060                               = 431,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070                   = 432,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070                         = 433,
+    CUPTI_RUNTIME_TRACE_CBID_SIZE                                                          = 434,
+    CUPTI_RUNTIME_TRACE_CBID_FORCE_INT                                                     = 0x7fffffff
+} CUpti_runtime_api_trace_cbid;
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4b625d45c65288fa2ea7dc05819ee4dfc4cbdd3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h
@@ -0,0 +1,43 @@
+#if !defined(_CUPTI_TARGET_H_)
+#define _CUPTI_TARGET_H_
+
+/*
+CUPTI profiler target APIs
+This file contains the CUPTI profiling APIs.
+*/
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* keep any definition that was made earlier */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE: offset of lastfield_ plus its size, i.e. bytes of type_ up to and including that field */
+
+typedef struct CUpti_Device_GetChipName_Params /* parameter block taken by cuptiDeviceGetChipName() */
+{
+    size_t structSize;                                      //!< [in] presumably CUpti_Device_GetChipName_Params_STRUCT_SIZE -- confirm against CUPTI docs
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t deviceIndex;                                     //!< [in]
+    const char* pChipName;                                  //!< [out] ownership/lifetime of the string is not shown here -- confirm before freeing
+} CUpti_Device_GetChipName_Params;
+
+#define CUpti_Device_GetChipName_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Device_GetChipName_Params, pChipName) /* size through the last field, pChipName */
+CUptiResult CUPTIAPI cuptiDeviceGetChipName(CUpti_Device_GetChipName_Params *pParams); /* takes the parameter block by pointer; result reported as a CUptiResult */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+#endif /* _CUPTI_TARGET_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..0833be1c1894f53e4e9ded4585f485c4e96a3560
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2010-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_VERSION_H_)
+#define _CUPTI_VERSION_H_
+
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI /* keep any definition that was made earlier */
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else /* not _WIN32: CUPTIAPI expands to nothing */
+#define CUPTIAPI
+#endif /* _WIN32 */
+#endif /* CUPTIAPI */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_VERSION_API CUPTI Version
+ * Function and macro to determine the CUPTI version.
+ * @{
+ */
+
+/**
+ * \brief The API version for this implementation of CUPTI.
+ *
+ * The API version for this implementation of CUPTI. This define, along
+ * with \ref cuptiGetVersion, can be used to detect at run time whether
+ * the version of CUPTI compiled against matches the version of the
+ * loaded CUPTI library.
+ *
+ * v1 : CUDAToolsSDK 4.0
+ * v2 : CUDAToolsSDK 4.1
+ * v3 : CUDA Toolkit 5.0
+ * v4 : CUDA Toolkit 5.5
+ * v5 : CUDA Toolkit 6.0
+ * v6 : CUDA Toolkit 6.5
+ * v7 : CUDA Toolkit 6.5 (with sm_52 support)
+ * v8 : CUDA Toolkit 7.0
+ * v9 : CUDA Toolkit 8.0
+ * v10 : CUDA Toolkit 9.0
+ * v11 : CUDA Toolkit 9.1
+ * v12 : CUDA Toolkit 10.0, 10.1 and 10.2
+ * v13 : CUDA Toolkit 11.0
+ * v14 : CUDA Toolkit 11.1
+ * v15 : CUDA Toolkit 11.2, 11.3 and 11.4
+ * v16 : CUDA Toolkit 11.5
+ * v17 : CUDA Toolkit 11.6
+ * v18 : CUDA Toolkit 11.8
+ */
+#define CUPTI_API_VERSION 18
+
+/**
+ * \brief Get the CUPTI API version.
+ *
+ * Writes the API version to \p *version.
+ *
+ * \param version [out] Receives the API version; must not be NULL
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p version is NULL
+ * \sa CUPTI_API_VERSION
+ */
+CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version);
+
+/** @} */ /* END CUPTI_VERSION_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /*_CUPTI_VERSION_H_*/
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa6cf0390e339d33e543896395746685efa010c4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h
@@ -0,0 +1,2941 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// No dependent includes
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cuda.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGetErrorString_params_st {  // Holds the parameters of cuGetErrorString.
+    CUresult error;
+    const char **pStr;
+} cuGetErrorString_params;
+
+typedef struct cuGetErrorName_params_st {  // Holds the parameters of cuGetErrorName.
+    CUresult error;
+    const char **pStr;
+} cuGetErrorName_params;
+
+typedef struct cuInit_params_st {  // Holds the parameters of cuInit.
+    unsigned int Flags;
+} cuInit_params;
+
+typedef struct cuDriverGetVersion_params_st {  // Holds the parameters of cuDriverGetVersion.
+    int *driverVersion;
+} cuDriverGetVersion_params;
+
+typedef struct cuDeviceGet_params_st {  // Holds the parameters of cuDeviceGet.
+    CUdevice *device;
+    int ordinal;
+} cuDeviceGet_params;
+
+typedef struct cuDeviceGetCount_params_st {  // Holds the parameters of cuDeviceGetCount.
+    int *count;
+} cuDeviceGetCount_params;
+
+typedef struct cuDeviceGetName_params_st {  // Holds the parameters of cuDeviceGetName.
+    char *name;
+    int len;
+    CUdevice dev;
+} cuDeviceGetName_params;
+
+typedef struct cuDeviceGetUuid_params_st {  // Holds the parameters of cuDeviceGetUuid.
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_params;
+
+typedef struct cuDeviceGetUuid_v2_params_st {  // Holds the parameters of cuDeviceGetUuid_v2.
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_v2_params;
+
+typedef struct cuDeviceGetLuid_params_st {  // Holds the parameters of cuDeviceGetLuid.
+    char *luid;
+    unsigned int *deviceNodeMask;
+    CUdevice dev;
+} cuDeviceGetLuid_params;
+
+typedef struct cuDeviceTotalMem_v2_params_st {  // Holds the parameters of cuDeviceTotalMem_v2.
+    size_t *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_v2_params;
+
+typedef struct cuDeviceGetTexture1DLinearMaxWidth_params_st {  // Holds the parameters of cuDeviceGetTexture1DLinearMaxWidth.
+    size_t *maxWidthInElements;
+    CUarray_format format;
+    unsigned numChannels;
+    CUdevice dev;
+} cuDeviceGetTexture1DLinearMaxWidth_params;
+
+typedef struct cuDeviceGetAttribute_params_st {  // Holds the parameters of cuDeviceGetAttribute.
+    int *pi;
+    CUdevice_attribute attrib;
+    CUdevice dev;
+} cuDeviceGetAttribute_params;
+
+typedef struct cuDeviceGetNvSciSyncAttributes_params_st {  // Holds the parameters of cuDeviceGetNvSciSyncAttributes.
+    void *nvSciSyncAttrList;
+    CUdevice dev;
+    int flags;
+} cuDeviceGetNvSciSyncAttributes_params;
+
+typedef struct cuDeviceSetMemPool_params_st {  // Holds the parameters of cuDeviceSetMemPool.
+    CUdevice dev;
+    CUmemoryPool pool;
+} cuDeviceSetMemPool_params;
+
+typedef struct cuDeviceGetMemPool_params_st {  // Holds the parameters of cuDeviceGetMemPool.
+    CUmemoryPool *pool;
+    CUdevice dev;
+} cuDeviceGetMemPool_params;
+
+typedef struct cuDeviceGetDefaultMemPool_params_st {  // Holds the parameters of cuDeviceGetDefaultMemPool.
+    CUmemoryPool *pool_out;
+    CUdevice dev;
+} cuDeviceGetDefaultMemPool_params;
+
+typedef struct cuFlushGPUDirectRDMAWrites_params_st {  // Holds the parameters of cuFlushGPUDirectRDMAWrites.
+    CUflushGPUDirectRDMAWritesTarget target;
+    CUflushGPUDirectRDMAWritesScope scope;
+} cuFlushGPUDirectRDMAWrites_params;
+
+typedef struct cuDeviceGetProperties_params_st {  // Holds the parameters of cuDeviceGetProperties.
+    CUdevprop *prop;
+    CUdevice dev;
+} cuDeviceGetProperties_params;
+
+typedef struct cuDeviceComputeCapability_params_st {  // Holds the parameters of cuDeviceComputeCapability.
+    int *major;
+    int *minor;
+    CUdevice dev;
+} cuDeviceComputeCapability_params;
+
+typedef struct cuDevicePrimaryCtxRetain_params_st {  // Holds the parameters of cuDevicePrimaryCtxRetain.
+    CUcontext *pctx;
+    CUdevice dev;
+} cuDevicePrimaryCtxRetain_params;
+
+typedef struct cuDevicePrimaryCtxRelease_v2_params_st {  // Holds the parameters of cuDevicePrimaryCtxRelease_v2.
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_v2_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_v2_params_st {  // Holds the parameters of cuDevicePrimaryCtxSetFlags_v2.
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_v2_params;
+
+typedef struct cuDevicePrimaryCtxGetState_params_st {  // Holds the parameters of cuDevicePrimaryCtxGetState.
+    CUdevice dev;
+    unsigned int *flags;
+    int *active;
+} cuDevicePrimaryCtxGetState_params;
+
+typedef struct cuDevicePrimaryCtxReset_v2_params_st {  // Holds the parameters of cuDevicePrimaryCtxReset_v2.
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_v2_params;
+
+typedef struct cuDeviceGetExecAffinitySupport_params_st {  // Holds the parameters of cuDeviceGetExecAffinitySupport.
+    int *pi;
+    CUexecAffinityType type;
+    CUdevice dev;
+} cuDeviceGetExecAffinitySupport_params;
+
+typedef struct cuCtxCreate_v2_params_st {  // Holds the parameters of cuCtxCreate_v2.
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v2_params;
+
+typedef struct cuCtxCreate_v3_params_st {  // Holds the parameters of cuCtxCreate_v3.
+    CUcontext *pctx;
+    CUexecAffinityParam *paramsArray;
+    int numParams;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v3_params;
+
+typedef struct cuCtxDestroy_v2_params_st {  // Holds the parameters of cuCtxDestroy_v2.
+    CUcontext ctx;
+} cuCtxDestroy_v2_params;
+
+typedef struct cuCtxPushCurrent_v2_params_st {  // Holds the parameters of cuCtxPushCurrent_v2.
+    CUcontext ctx;
+} cuCtxPushCurrent_v2_params;
+
+typedef struct cuCtxPopCurrent_v2_params_st {  // Holds the parameters of cuCtxPopCurrent_v2.
+    CUcontext *pctx;
+} cuCtxPopCurrent_v2_params;
+
+typedef struct cuCtxSetCurrent_params_st {  // Holds the parameters of cuCtxSetCurrent.
+    CUcontext ctx;
+} cuCtxSetCurrent_params;
+
+typedef struct cuCtxGetCurrent_params_st {  // Holds the parameters of cuCtxGetCurrent.
+    CUcontext *pctx;
+} cuCtxGetCurrent_params;
+
+typedef struct cuCtxGetDevice_params_st {  // Holds the parameters of cuCtxGetDevice.
+    CUdevice *device;
+} cuCtxGetDevice_params;
+
+typedef struct cuCtxGetFlags_params_st {  // Holds the parameters of cuCtxGetFlags.
+    unsigned int *flags;
+} cuCtxGetFlags_params;
+
+typedef struct cuCtxSetLimit_params_st {  // Holds the parameters of cuCtxSetLimit.
+    CUlimit limit;
+    size_t value;
+} cuCtxSetLimit_params;
+
+typedef struct cuCtxGetLimit_params_st {  // Holds the parameters of cuCtxGetLimit.
+    size_t *pvalue;
+    CUlimit limit;
+} cuCtxGetLimit_params;
+
+typedef struct cuCtxGetCacheConfig_params_st {  // Holds the parameters of cuCtxGetCacheConfig.
+    CUfunc_cache *pconfig;
+} cuCtxGetCacheConfig_params;
+
+typedef struct cuCtxSetCacheConfig_params_st {  // Holds the parameters of cuCtxSetCacheConfig.
+    CUfunc_cache config;
+} cuCtxSetCacheConfig_params;
+
+typedef struct cuCtxGetSharedMemConfig_params_st {  // Holds the parameters of cuCtxGetSharedMemConfig.
+    CUsharedconfig *pConfig;
+} cuCtxGetSharedMemConfig_params;
+
+typedef struct cuCtxSetSharedMemConfig_params_st {  // Holds the parameters of cuCtxSetSharedMemConfig.
+    CUsharedconfig config;
+} cuCtxSetSharedMemConfig_params;
+
+typedef struct cuCtxGetApiVersion_params_st {  // Holds the parameters of cuCtxGetApiVersion.
+    CUcontext ctx;
+    unsigned int *version;
+} cuCtxGetApiVersion_params;
+
+typedef struct cuCtxGetStreamPriorityRange_params_st {  // Holds the parameters of cuCtxGetStreamPriorityRange.
+    int *leastPriority;
+    int *greatestPriority;
+} cuCtxGetStreamPriorityRange_params;
+
+typedef struct cuCtxGetExecAffinity_params_st {  // Holds the parameters of cuCtxGetExecAffinity.
+    CUexecAffinityParam *pExecAffinity;
+    CUexecAffinityType type;
+} cuCtxGetExecAffinity_params;
+
+typedef struct cuCtxAttach_params_st {  // Holds the parameters of cuCtxAttach.
+    CUcontext *pctx;
+    unsigned int flags;
+} cuCtxAttach_params;
+
+typedef struct cuCtxDetach_params_st {  // Holds the parameters of cuCtxDetach.
+    CUcontext ctx;
+} cuCtxDetach_params;
+
+typedef struct cuModuleLoad_params_st {  // Holds the parameters of cuModuleLoad.
+    CUmodule *module;
+    const char *fname;
+} cuModuleLoad_params;
+
+typedef struct cuModuleLoadData_params_st {  // Holds the parameters of cuModuleLoadData.
+    CUmodule *module;
+    const void *image;
+} cuModuleLoadData_params;
+
+typedef struct cuModuleLoadDataEx_params_st {  // Holds the parameters of cuModuleLoadDataEx.
+    CUmodule *module;
+    const void *image;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuModuleLoadDataEx_params;
+
+typedef struct cuModuleLoadFatBinary_params_st {  // Holds the parameters of cuModuleLoadFatBinary.
+    CUmodule *module;
+    const void *fatCubin;
+} cuModuleLoadFatBinary_params;
+
+typedef struct cuModuleUnload_params_st {  // Holds the parameters of cuModuleUnload.
+    CUmodule hmod;
+} cuModuleUnload_params;
+
+typedef struct cuModuleGetFunction_params_st {  // Holds the parameters of cuModuleGetFunction.
+    CUfunction *hfunc;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetFunction_params;
+
+typedef struct cuModuleGetGlobal_v2_params_st {  // Holds the parameters of cuModuleGetGlobal_v2.
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_v2_params;
+
+typedef struct cuModuleGetTexRef_params_st {  // Holds the parameters of cuModuleGetTexRef.
+    CUtexref *pTexRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetTexRef_params;
+
+typedef struct cuModuleGetSurfRef_params_st {  // Holds the parameters of cuModuleGetSurfRef.
+    CUsurfref *pSurfRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetSurfRef_params;
+
+typedef struct cuLinkCreate_v2_params_st {  // Holds the parameters of cuLinkCreate_v2.
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_v2_params;
+
+typedef struct cuLinkAddData_v2_params_st {  // Holds the parameters of cuLinkAddData_v2.
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_v2_params;
+
+typedef struct cuLinkAddFile_v2_params_st {  // Holds the parameters of cuLinkAddFile_v2.
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_v2_params;
+
+typedef struct cuLinkComplete_params_st {  // Holds the parameters of cuLinkComplete.
+    CUlinkState state;
+    void **cubinOut;
+    size_t *sizeOut;
+} cuLinkComplete_params;
+
+typedef struct cuLinkDestroy_params_st {  // Holds the parameters of cuLinkDestroy.
+    CUlinkState state;
+} cuLinkDestroy_params;
+
+typedef struct cuMemGetInfo_v2_params_st {  // Holds the parameters of cuMemGetInfo_v2.
+    size_t *free;
+    size_t *total;
+} cuMemGetInfo_v2_params;
+
+typedef struct cuMemAlloc_v2_params_st {  // Holds the parameters of cuMemAlloc_v2.
+    CUdeviceptr *dptr;
+    size_t bytesize;
+} cuMemAlloc_v2_params;
+
+typedef struct cuMemAllocPitch_v2_params_st {  // Holds the parameters of cuMemAllocPitch_v2.
+    CUdeviceptr *dptr;
+    size_t *pPitch;
+    size_t WidthInBytes;
+    size_t Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_v2_params;
+
+typedef struct cuMemFree_v2_params_st {  // Holds the parameters of cuMemFree_v2.
+    CUdeviceptr dptr;
+} cuMemFree_v2_params;
+
+typedef struct cuMemGetAddressRange_v2_params_st {  // Holds the parameters of cuMemGetAddressRange_v2.
+    CUdeviceptr *pbase;
+    size_t *psize;
+    CUdeviceptr dptr;
+} cuMemGetAddressRange_v2_params;
+
+typedef struct cuMemAllocHost_v2_params_st {  // Holds the parameters of cuMemAllocHost_v2.
+    void **pp;
+    size_t bytesize;
+} cuMemAllocHost_v2_params;
+
+typedef struct cuMemFreeHost_params_st {  // Holds the parameters of cuMemFreeHost.
+    void *p;
+} cuMemFreeHost_params;
+
+typedef struct cuMemHostAlloc_params_st {  // Holds the parameters of cuMemHostAlloc.
+    void **pp;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostAlloc_params;
+
+typedef struct cuMemHostGetDevicePointer_v2_params_st {  // Holds the parameters of cuMemHostGetDevicePointer_v2.
+    CUdeviceptr *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_v2_params;
+
+typedef struct cuMemHostGetFlags_params_st {  // Holds the parameters of cuMemHostGetFlags.
+    unsigned int *pFlags;
+    void *p;
+} cuMemHostGetFlags_params;
+
+typedef struct cuMemAllocManaged_params_st {  // Holds the parameters of cuMemAllocManaged.
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    unsigned int flags;
+} cuMemAllocManaged_params;
+
+typedef struct cuDeviceGetByPCIBusId_params_st {  // Holds the parameters of cuDeviceGetByPCIBusId.
+    CUdevice *dev;
+    const char *pciBusId;
+} cuDeviceGetByPCIBusId_params;
+
+typedef struct cuDeviceGetPCIBusId_params_st {  // Holds the parameters of cuDeviceGetPCIBusId.
+    char *pciBusId;
+    int len;
+    CUdevice dev;
+} cuDeviceGetPCIBusId_params;
+
+typedef struct cuIpcGetEventHandle_params_st {  // Holds the parameters of cuIpcGetEventHandle.
+    CUipcEventHandle *pHandle;
+    CUevent event;
+} cuIpcGetEventHandle_params;
+
+typedef struct cuIpcOpenEventHandle_params_st {  // Holds the parameters of cuIpcOpenEventHandle.
+    CUevent *phEvent;
+    CUipcEventHandle handle;
+} cuIpcOpenEventHandle_params;
+
+typedef struct cuIpcGetMemHandle_params_st {  // Holds the parameters of cuIpcGetMemHandle.
+    CUipcMemHandle *pHandle;
+    CUdeviceptr dptr;
+} cuIpcGetMemHandle_params;
+
+typedef struct cuIpcOpenMemHandle_v2_params_st {  // Holds the parameters of cuIpcOpenMemHandle_v2.
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_v2_params;
+
+typedef struct cuIpcCloseMemHandle_params_st {  // Holds the parameters of cuIpcCloseMemHandle.
+    CUdeviceptr dptr;
+} cuIpcCloseMemHandle_params;
+
+typedef struct cuMemHostRegister_v2_params_st {  // Holds the parameters of cuMemHostRegister_v2.
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_v2_params;
+
+typedef struct cuMemHostUnregister_params_st {  // Holds the parameters of cuMemHostUnregister.
+    void *p;
+} cuMemHostUnregister_params;
+
+typedef struct cuMemcpy_ptds_params_st {  // Holds the parameters of cuMemcpy_ptds.
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_ptds_params;
+
+typedef struct cuMemcpyPeer_ptds_params_st {  // Holds the parameters of cuMemcpyPeer_ptds.
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_ptds_params;
+
+typedef struct cuMemcpyHtoD_v2_ptds_params_st {  // Holds the parameters of cuMemcpyHtoD_v2_ptds.
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoH_v2_ptds_params_st {  // Holds the parameters of cuMemcpyDtoH_v2_ptds.
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_ptds_params;
+
+typedef struct cuMemcpyDtoD_v2_ptds_params_st {  // Holds the parameters of cuMemcpyDtoD_v2_ptds.
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoA_v2_ptds_params_st {  // Holds the parameters of cuMemcpyDtoA_v2_ptds.
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoD_v2_ptds_params_st {  // Holds the parameters of cuMemcpyAtoD_v2_ptds.
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_ptds_params;
+
+typedef struct cuMemcpyHtoA_v2_ptds_params_st {  // Holds the parameters of cuMemcpyHtoA_v2_ptds.
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoH_v2_ptds_params_st {  // Holds the parameters of cuMemcpyAtoH_v2_ptds.
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_ptds_params;
+
+typedef struct cuMemcpyAtoA_v2_ptds_params_st {  // Holds the parameters of cuMemcpyAtoA_v2_ptds.
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_ptds_params;
+
+typedef struct cuMemcpy2D_v2_ptds_params_st {  // Holds the parameters of cuMemcpy2D_v2_ptds.
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_ptds_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_ptds_params_st {  // Holds the parameters of cuMemcpy2DUnaligned_v2_ptds.
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_ptds_params;
+
+typedef struct cuMemcpy3D_v2_ptds_params_st {  // Holds the parameters of cuMemcpy3D_v2_ptds.
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_ptds_params;
+
+typedef struct cuMemcpy3DPeer_ptds_params_st {  // Holds the parameters of cuMemcpy3DPeer_ptds.
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_ptds_params;
+
+typedef struct cuMemcpyAsync_ptsz_params_st {  // Holds the parameters of cuMemcpyAsync_ptsz.
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_ptsz_params;
+
+typedef struct cuMemcpyPeerAsync_ptsz_params_st { /* Argument record named for cuMemcpyPeerAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_ptsz_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpyHtoDAsync_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpyDtoHAsync_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpyDtoDAsync_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpyHtoAAsync_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpyAtoHAsync_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy2DAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpy2DAsync_v2_ptsz: a const CUDA_MEMCPY2D pointer plus a stream handle. */
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DAsync_v2_ptsz_params_st { /* Argument record named for cuMemcpy3DAsync_v2_ptsz: a const CUDA_MEMCPY3D pointer plus a stream handle. */
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DPeerAsync_ptsz_params_st { /* Argument record named for cuMemcpy3DPeerAsync_ptsz: a const CUDA_MEMCPY3D_PEER pointer plus a stream handle. */
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_ptsz_params;
+
+typedef struct cuMemsetD8_v2_ptds_params_st { /* Argument record named for cuMemsetD8_v2_ptds; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_ptds_params;
+
+typedef struct cuMemsetD16_v2_ptds_params_st { /* Argument record named for cuMemsetD16_v2_ptds; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_ptds_params;
+
+typedef struct cuMemsetD32_v2_ptds_params_st { /* Argument record named for cuMemsetD32_v2_ptds; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_ptds_params;
+
+typedef struct cuMemsetD2D8_v2_ptds_params_st { /* Argument record named for cuMemsetD2D8_v2_ptds; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_ptds_params;
+
+typedef struct cuMemsetD2D16_v2_ptds_params_st { /* Argument record named for cuMemsetD2D16_v2_ptds; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_ptds_params;
+
+typedef struct cuMemsetD2D32_v2_ptds_params_st { /* Argument record named for cuMemsetD2D32_v2_ptds; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_ptds_params;
+
+typedef struct cuMemsetD8Async_ptsz_params_st { /* Argument record named for cuMemsetD8Async_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_ptsz_params;
+
+typedef struct cuMemsetD16Async_ptsz_params_st { /* Argument record named for cuMemsetD16Async_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_ptsz_params;
+
+typedef struct cuMemsetD32Async_ptsz_params_st { /* Argument record named for cuMemsetD32Async_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_ptsz_params;
+
+typedef struct cuMemsetD2D8Async_ptsz_params_st { /* Argument record named for cuMemsetD2D8Async_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_ptsz_params;
+
+typedef struct cuMemsetD2D16Async_ptsz_params_st { /* Argument record named for cuMemsetD2D16Async_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_ptsz_params;
+
+typedef struct cuMemsetD2D32Async_ptsz_params_st { /* Argument record named for cuMemsetD2D32Async_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_ptsz_params;
+
+typedef struct cuArrayCreate_v2_params_st { /* Argument record named for cuArrayCreate_v2; presumably mirrors that call's parameter order — verify. */
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR *pAllocateArray;
+} cuArrayCreate_v2_params;
+
+typedef struct cuArrayGetDescriptor_v2_params_st { /* Argument record named for cuArrayGetDescriptor_v2; presumably mirrors that call's parameter order — verify. */
+    CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_v2_params;
+
+typedef struct cuArrayGetSparseProperties_params_st { /* Argument record named for cuArrayGetSparseProperties; presumably mirrors that call's parameter order — verify. */
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUarray array;
+} cuArrayGetSparseProperties_params;
+
+typedef struct cuMipmappedArrayGetSparseProperties_params_st { /* Argument record named for cuMipmappedArrayGetSparseProperties; presumably mirrors that call's parameter order — verify. */
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUmipmappedArray mipmap;
+} cuMipmappedArrayGetSparseProperties_params;
+
+typedef struct cuArrayGetMemoryRequirements_params_st { /* Argument record named for cuArrayGetMemoryRequirements; presumably mirrors that call's parameter order — verify. */
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUarray array;
+    CUdevice device;
+} cuArrayGetMemoryRequirements_params;
+
+typedef struct cuMipmappedArrayGetMemoryRequirements_params_st { /* Argument record named for cuMipmappedArrayGetMemoryRequirements; presumably mirrors that call's parameter order — verify. */
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUmipmappedArray mipmap;
+    CUdevice device;
+} cuMipmappedArrayGetMemoryRequirements_params;
+
+typedef struct cuArrayGetPlane_params_st { /* Argument record named for cuArrayGetPlane; presumably mirrors that call's parameter order — verify. */
+    CUarray *pPlaneArray;
+    CUarray hArray;
+    unsigned int planeIdx;
+} cuArrayGetPlane_params;
+
+typedef struct cuArrayDestroy_params_st { /* Argument record named for cuArrayDestroy; carries a single CUarray handle. */
+    CUarray hArray;
+} cuArrayDestroy_params;
+
+typedef struct cuArray3DCreate_v2_params_st { /* Argument record named for cuArray3DCreate_v2; presumably mirrors that call's parameter order — verify. */
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray;
+} cuArray3DCreate_v2_params;
+
+typedef struct cuArray3DGetDescriptor_v2_params_st { /* Argument record named for cuArray3DGetDescriptor_v2; presumably mirrors that call's parameter order — verify. */
+    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_v2_params;
+
+typedef struct cuMipmappedArrayCreate_params_st { /* Argument record named for cuMipmappedArrayCreate; presumably mirrors that call's parameter order — verify. */
+    CUmipmappedArray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc;
+    unsigned int numMipmapLevels;
+} cuMipmappedArrayCreate_params;
+
+typedef struct cuMipmappedArrayGetLevel_params_st { /* Argument record named for cuMipmappedArrayGetLevel; presumably mirrors that call's parameter order — verify. */
+    CUarray *pLevelArray;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int level;
+} cuMipmappedArrayGetLevel_params;
+
+typedef struct cuMipmappedArrayDestroy_params_st { /* Argument record named for cuMipmappedArrayDestroy; carries a single CUmipmappedArray handle. */
+    CUmipmappedArray hMipmappedArray;
+} cuMipmappedArrayDestroy_params;
+
+typedef struct cuMemAddressReserve_params_st { /* Argument record named for cuMemAddressReserve; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr *ptr;
+    size_t size;
+    size_t alignment;
+    CUdeviceptr addr;
+    unsigned long long flags;
+} cuMemAddressReserve_params;
+
+typedef struct cuMemAddressFree_params_st { /* Argument record named for cuMemAddressFree; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemAddressFree_params;
+
+typedef struct cuMemCreate_params_st { /* Argument record named for cuMemCreate; presumably mirrors that call's parameter order — verify. */
+    CUmemGenericAllocationHandle *handle;
+    size_t size;
+    const CUmemAllocationProp *prop;
+    unsigned long long flags;
+} cuMemCreate_params;
+
+typedef struct cuMemRelease_params_st { /* Argument record named for cuMemRelease; carries a single allocation handle by value. */
+    CUmemGenericAllocationHandle handle;
+} cuMemRelease_params;
+
+typedef struct cuMemMap_params_st { /* Argument record named for cuMemMap; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr ptr;
+    size_t size;
+    size_t offset;
+    CUmemGenericAllocationHandle handle;
+    unsigned long long flags;
+} cuMemMap_params;
+
+typedef struct cuMemMapArrayAsync_ptsz_params_st { /* Argument record named for cuMemMapArrayAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count; /* presumably the number of mapInfoList entries — verify */
+    CUstream hStream;
+} cuMemMapArrayAsync_ptsz_params;
+
+typedef struct cuMemUnmap_params_st { /* Argument record named for cuMemUnmap; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemUnmap_params;
+
+typedef struct cuMemSetAccess_params_st { /* Argument record named for cuMemSetAccess; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr ptr;
+    size_t size;
+    const CUmemAccessDesc *desc;
+    size_t count; /* presumably the number of desc entries — verify */
+} cuMemSetAccess_params;
+
+typedef struct cuMemGetAccess_params_st { /* Argument record named for cuMemGetAccess; presumably mirrors that call's parameter order — verify. */
+    unsigned long long *flags;
+    const CUmemLocation *location;
+    CUdeviceptr ptr;
+} cuMemGetAccess_params;
+
+typedef struct cuMemExportToShareableHandle_params_st { /* Argument record named for cuMemExportToShareableHandle; presumably mirrors that call's parameter order — verify. */
+    void *shareableHandle;
+    CUmemGenericAllocationHandle handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemExportToShareableHandle_params;
+
+typedef struct cuMemImportFromShareableHandle_params_st { /* Argument record named for cuMemImportFromShareableHandle; presumably mirrors that call's parameter order — verify. */
+    CUmemGenericAllocationHandle *handle;
+    void *osHandle;
+    CUmemAllocationHandleType shHandleType;
+} cuMemImportFromShareableHandle_params;
+
+typedef struct cuMemGetAllocationGranularity_params_st { /* Argument record named for cuMemGetAllocationGranularity; presumably mirrors that call's parameter order — verify. */
+    size_t *granularity;
+    const CUmemAllocationProp *prop;
+    CUmemAllocationGranularity_flags option;
+} cuMemGetAllocationGranularity_params;
+
+typedef struct cuMemGetAllocationPropertiesFromHandle_params_st { /* Argument record named for cuMemGetAllocationPropertiesFromHandle; presumably mirrors that call's parameter order — verify. */
+    CUmemAllocationProp *prop;
+    CUmemGenericAllocationHandle handle;
+} cuMemGetAllocationPropertiesFromHandle_params;
+
+typedef struct cuMemRetainAllocationHandle_params_st { /* Argument record named for cuMemRetainAllocationHandle; presumably mirrors that call's parameter order — verify. */
+    CUmemGenericAllocationHandle *handle;
+    void *addr;
+} cuMemRetainAllocationHandle_params;
+
+typedef struct cuMemFreeAsync_ptsz_params_st { /* Argument record named for cuMemFreeAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_ptsz_params;
+
+typedef struct cuMemAllocAsync_ptsz_params_st { /* Argument record named for cuMemAllocAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_ptsz_params;
+
+typedef struct cuMemPoolTrimTo_params_st { /* Argument record named for cuMemPoolTrimTo; presumably mirrors that call's parameter order — verify. */
+    CUmemoryPool pool;
+    size_t minBytesToKeep;
+} cuMemPoolTrimTo_params;
+
+typedef struct cuMemPoolSetAttribute_params_st { /* Argument record named for cuMemPoolSetAttribute; presumably mirrors that call's parameter order — verify. */
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value; /* untyped pointer; pointee type presumably depends on attr — verify */
+} cuMemPoolSetAttribute_params;
+
+typedef struct cuMemPoolGetAttribute_params_st { /* Argument record named for cuMemPoolGetAttribute; presumably mirrors that call's parameter order — verify. */
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value; /* untyped pointer; pointee type presumably depends on attr — verify */
+} cuMemPoolGetAttribute_params;
+
+typedef struct cuMemPoolSetAccess_params_st { /* Argument record named for cuMemPoolSetAccess; presumably mirrors that call's parameter order — verify. */
+    CUmemoryPool pool;
+    const CUmemAccessDesc *map;
+    size_t count; /* presumably the number of map entries — verify */
+} cuMemPoolSetAccess_params;
+
+typedef struct cuMemPoolGetAccess_params_st { /* Argument record named for cuMemPoolGetAccess; presumably mirrors that call's parameter order — verify. */
+    CUmemAccess_flags *flags;
+    CUmemoryPool memPool;
+    CUmemLocation *location;
+} cuMemPoolGetAccess_params;
+
+typedef struct cuMemPoolCreate_params_st { /* Argument record named for cuMemPoolCreate; presumably mirrors that call's parameter order — verify. */
+    CUmemoryPool *pool;
+    const CUmemPoolProps *poolProps;
+} cuMemPoolCreate_params;
+
+typedef struct cuMemPoolDestroy_params_st { /* Argument record named for cuMemPoolDestroy; carries a single CUmemoryPool handle. */
+    CUmemoryPool pool;
+} cuMemPoolDestroy_params;
+
+typedef struct cuMemAllocFromPoolAsync_ptsz_params_st { /* Argument record named for cuMemAllocFromPoolAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_ptsz_params;
+
+typedef struct cuMemPoolExportToShareableHandle_params_st { /* Argument record named for cuMemPoolExportToShareableHandle; presumably mirrors that call's parameter order — verify. */
+    void *handle_out;
+    CUmemoryPool pool;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolExportToShareableHandle_params;
+
+typedef struct cuMemPoolImportFromShareableHandle_params_st { /* Argument record named for cuMemPoolImportFromShareableHandle; presumably mirrors that call's parameter order — verify. */
+    CUmemoryPool *pool_out;
+    void *handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolImportFromShareableHandle_params;
+
+typedef struct cuMemPoolExportPointer_params_st { /* Argument record named for cuMemPoolExportPointer; presumably mirrors that call's parameter order — verify. */
+    CUmemPoolPtrExportData *shareData_out;
+    CUdeviceptr ptr;
+} cuMemPoolExportPointer_params;
+
+typedef struct cuMemPoolImportPointer_params_st { /* Argument record named for cuMemPoolImportPointer; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr *ptr_out;
+    CUmemoryPool pool;
+    CUmemPoolPtrExportData *shareData;
+} cuMemPoolImportPointer_params;
+
+typedef struct cuPointerGetAttribute_params_st { /* Argument record named for cuPointerGetAttribute; presumably mirrors that call's parameter order — verify. */
+    void *data; /* untyped pointer; pointee type presumably depends on attribute — verify */
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerGetAttribute_params;
+
+typedef struct cuMemPrefetchAsync_ptsz_params_st { /* Argument record named for cuMemPrefetchAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_ptsz_params;
+
+typedef struct cuMemAdvise_params_st { /* Argument record named for cuMemAdvise; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUdevice device;
+} cuMemAdvise_params;
+
+typedef struct cuMemRangeGetAttribute_params_st { /* Argument record named for cuMemRangeGetAttribute; presumably mirrors that call's parameter order — verify. */
+    void *data;
+    size_t dataSize;
+    CUmem_range_attribute attribute;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttribute_params;
+
+typedef struct cuMemRangeGetAttributes_params_st { /* Argument record named for cuMemRangeGetAttributes; presumably mirrors that call's parameter order — verify. */
+    void **data;
+    size_t *dataSizes;
+    CUmem_range_attribute *attributes;
+    size_t numAttributes; /* presumably the length of the three arrays above — verify */
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttributes_params;
+
+typedef struct cuPointerSetAttribute_params_st { /* Argument record named for cuPointerSetAttribute; presumably mirrors that call's parameter order — verify. */
+    const void *value;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerSetAttribute_params;
+
+typedef struct cuPointerGetAttributes_params_st { /* Argument record named for cuPointerGetAttributes; presumably mirrors that call's parameter order — verify. */
+    unsigned int numAttributes; /* presumably the length of attributes and data — verify */
+    CUpointer_attribute *attributes;
+    void **data;
+    CUdeviceptr ptr;
+} cuPointerGetAttributes_params;
+
+typedef struct cuStreamCreate_params_st { /* Argument record named for cuStreamCreate; presumably mirrors that call's parameter order — verify. */
+    CUstream *phStream;
+    unsigned int Flags;
+} cuStreamCreate_params;
+
+typedef struct cuStreamCreateWithPriority_params_st { /* Argument record named for cuStreamCreateWithPriority; presumably mirrors that call's parameter order — verify. */
+    CUstream *phStream;
+    unsigned int flags;
+    int priority;
+} cuStreamCreateWithPriority_params;
+
+typedef struct cuStreamGetPriority_ptsz_params_st { /* Argument record named for cuStreamGetPriority_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_ptsz_params;
+
+typedef struct cuStreamGetFlags_ptsz_params_st { /* Argument record named for cuStreamGetFlags_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_ptsz_params;
+
+typedef struct cuStreamGetCtx_ptsz_params_st { /* Argument record named for cuStreamGetCtx_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_ptsz_params;
+
+typedef struct cuStreamWaitEvent_ptsz_params_st { /* Argument record named for cuStreamWaitEvent_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_ptsz_params;
+
+typedef struct cuStreamAddCallback_ptsz_params_st { /* Argument record named for cuStreamAddCallback_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_ptsz_params_st { /* Argument record named for cuStreamBeginCapture_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_ptsz_params;
+
+typedef struct cuThreadExchangeStreamCaptureMode_params_st { /* Argument record named for cuThreadExchangeStreamCaptureMode; carries a single pointer to a CUstreamCaptureMode. */
+    CUstreamCaptureMode *mode;
+} cuThreadExchangeStreamCaptureMode_params;
+
+typedef struct cuStreamEndCapture_ptsz_params_st { /* Argument record named for cuStreamEndCapture_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_ptsz_params;
+
+typedef struct cuStreamIsCapturing_ptsz_params_st { /* Argument record named for cuStreamIsCapturing_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_ptsz_params_st { /* Argument record named for cuStreamGetCaptureInfo_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_ptsz_params_st { /* Argument record named for cuStreamGetCaptureInfo_v2_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_ptsz_params_st { /* Argument record named for cuStreamUpdateCaptureDependencies_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_ptsz_params;
+
+typedef struct cuStreamAttachMemAsync_ptsz_params_st { /* Argument record named for cuStreamAttachMemAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_ptsz_params;
+
+typedef struct cuStreamQuery_ptsz_params_st { /* Argument record named for cuStreamQuery_ptsz; carries a single CUstream handle. */
+    CUstream hStream;
+} cuStreamQuery_ptsz_params;
+
+typedef struct cuStreamSynchronize_ptsz_params_st { /* Argument record named for cuStreamSynchronize_ptsz; carries a single CUstream handle. */
+    CUstream hStream;
+} cuStreamSynchronize_ptsz_params;
+
+typedef struct cuStreamDestroy_v2_params_st { /* Argument record named for cuStreamDestroy_v2; carries a single CUstream handle. */
+    CUstream hStream;
+} cuStreamDestroy_v2_params;
+
+typedef struct cuStreamCopyAttributes_ptsz_params_st { /* Argument record named for cuStreamCopyAttributes_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream dst;
+    CUstream src;
+} cuStreamCopyAttributes_ptsz_params;
+
+typedef struct cuStreamGetAttribute_ptsz_params_st { /* Argument record named for cuStreamGetAttribute_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value_out;
+} cuStreamGetAttribute_ptsz_params;
+
+typedef struct cuStreamSetAttribute_ptsz_params_st { /* Argument record named for cuStreamSetAttribute_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *value;
+} cuStreamSetAttribute_ptsz_params;
+
+typedef struct cuEventCreate_params_st { /* Argument record named for cuEventCreate; presumably mirrors that call's parameter order — verify. */
+    CUevent *phEvent;
+    unsigned int Flags;
+} cuEventCreate_params;
+
+typedef struct cuEventRecord_ptsz_params_st { /* Argument record named for cuEventRecord_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_ptsz_params;
+
+typedef struct cuEventRecordWithFlags_ptsz_params_st { /* Argument record named for cuEventRecordWithFlags_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_ptsz_params;
+
+typedef struct cuEventQuery_params_st { /* Argument record named for cuEventQuery; carries a single CUevent handle. */
+    CUevent hEvent;
+} cuEventQuery_params;
+
+typedef struct cuEventSynchronize_params_st { /* Argument record named for cuEventSynchronize; carries a single CUevent handle. */
+    CUevent hEvent;
+} cuEventSynchronize_params;
+
+typedef struct cuEventDestroy_v2_params_st { /* Argument record named for cuEventDestroy_v2; carries a single CUevent handle. */
+    CUevent hEvent;
+} cuEventDestroy_v2_params;
+
+typedef struct cuEventElapsedTime_params_st { /* Argument record named for cuEventElapsedTime; presumably mirrors that call's parameter order — verify. */
+    float *pMilliseconds;
+    CUevent hStart;
+    CUevent hEnd;
+} cuEventElapsedTime_params;
+
+typedef struct cuImportExternalMemory_params_st { /* Argument record named for cuImportExternalMemory; presumably mirrors that call's parameter order — verify. */
+    CUexternalMemory *extMem_out;
+    const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc;
+} cuImportExternalMemory_params;
+
+typedef struct cuExternalMemoryGetMappedBuffer_params_st { /* Argument record named for cuExternalMemoryGetMappedBuffer; presumably mirrors that call's parameter order — verify. */
+    CUdeviceptr *devPtr;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc;
+} cuExternalMemoryGetMappedBuffer_params;
+
+typedef struct cuExternalMemoryGetMappedMipmappedArray_params_st { /* Argument record named for cuExternalMemoryGetMappedMipmappedArray; presumably mirrors that call's parameter order — verify. */
+    CUmipmappedArray *mipmap;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc;
+} cuExternalMemoryGetMappedMipmappedArray_params;
+
+typedef struct cuDestroyExternalMemory_params_st { /* Argument record named for cuDestroyExternalMemory; carries a single CUexternalMemory handle. */
+    CUexternalMemory extMem;
+} cuDestroyExternalMemory_params;
+
+typedef struct cuImportExternalSemaphore_params_st { /* Argument record named for cuImportExternalSemaphore; presumably mirrors that call's parameter order — verify. */
+    CUexternalSemaphore *extSem_out;
+    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc;
+} cuImportExternalSemaphore_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_ptsz_params_st { /* Argument record named for cuSignalExternalSemaphoresAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems; /* presumably the length of both arrays above — verify */
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_ptsz_params_st { /* Argument record named for cuWaitExternalSemaphoresAsync_ptsz; presumably mirrors that call's parameter order — verify. */
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems; /* presumably the length of both arrays above — verify */
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuDestroyExternalSemaphore_params_st { /* Argument record named for cuDestroyExternalSemaphore; carries a single CUexternalSemaphore handle. */
+    CUexternalSemaphore extSem;
+} cuDestroyExternalSemaphore_params;
+
+typedef struct cuStreamWaitValue32_ptsz_params_st { /* Argument record named for cuStreamWaitValue32_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_ptsz_params;
+
+typedef struct cuStreamWaitValue64_ptsz_params_st { /* Argument record named for cuStreamWaitValue64_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_ptsz_params;
+
+typedef struct cuStreamWriteValue32_ptsz_params_st { /* Argument record named for cuStreamWriteValue32_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_ptsz_params;
+
+typedef struct cuStreamWriteValue64_ptsz_params_st { /* Argument record named for cuStreamWriteValue64_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_ptsz_params_st { /* Argument record named for cuStreamBatchMemOp_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream stream;
+    unsigned int count; /* presumably the number of paramArray entries — verify */
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_ptsz_params;
+
+typedef struct cuFuncGetAttribute_params_st { /* Argument record named for cuFuncGetAttribute; presumably mirrors that call's parameter order — verify. */
+    int *pi;
+    CUfunction_attribute attrib;
+    CUfunction hfunc;
+} cuFuncGetAttribute_params;
+
+typedef struct cuFuncSetAttribute_params_st { /* Argument record named for cuFuncSetAttribute; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    CUfunction_attribute attrib;
+    int value;
+} cuFuncSetAttribute_params;
+
+typedef struct cuFuncSetCacheConfig_params_st { /* Argument record named for cuFuncSetCacheConfig; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    CUfunc_cache config;
+} cuFuncSetCacheConfig_params;
+
+typedef struct cuFuncSetSharedMemConfig_params_st { /* Argument record named for cuFuncSetSharedMemConfig; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    CUsharedconfig config;
+} cuFuncSetSharedMemConfig_params;
+
+typedef struct cuFuncGetModule_params_st { /* Argument record named for cuFuncGetModule; presumably mirrors that call's parameter order — verify. */
+    CUmodule *hmod;
+    CUfunction hfunc;
+} cuFuncGetModule_params;
+
+typedef struct cuLaunchKernel_ptsz_params_st { /* Argument record named for cuLaunchKernel_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUfunction f;
+    unsigned int gridDimX; /* three grid-dimension fields (X, Y, Z) */
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX; /* three block-dimension fields (X, Y, Z) */
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_ptsz_params;
+
+typedef struct cuLaunchKernelEx_ptsz_params_st { /* Argument record named for cuLaunchKernelEx_ptsz; presumably mirrors that call's parameter order — verify. */
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernel_ptsz_params_st { /* Argument record named for cuLaunchCooperativeKernel_ptsz; same fields as cuLaunchKernel_ptsz_params minus the trailing `extra`. */
+    CUfunction f;
+    unsigned int gridDimX; /* three grid-dimension fields (X, Y, Z) */
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX; /* three block-dimension fields (X, Y, Z) */
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernelMultiDevice_params_st { /* Argument record named for cuLaunchCooperativeKernelMultiDevice; presumably mirrors that call's parameter order — verify. */
+    CUDA_LAUNCH_PARAMS *launchParamsList;
+    unsigned int numDevices; /* presumably the number of launchParamsList entries — verify */
+    unsigned int flags;
+} cuLaunchCooperativeKernelMultiDevice_params;
+
+typedef struct cuLaunchHostFunc_ptsz_params_st { /* Argument record named for cuLaunchHostFunc_ptsz; presumably mirrors that call's parameter order — verify. */
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_ptsz_params;
+
+typedef struct cuFuncSetBlockShape_params_st { /* Argument record named for cuFuncSetBlockShape; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    int x;
+    int y;
+    int z;
+} cuFuncSetBlockShape_params;
+
+typedef struct cuFuncSetSharedSize_params_st { /* Argument record named for cuFuncSetSharedSize; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    unsigned int bytes;
+} cuFuncSetSharedSize_params;
+
+typedef struct cuParamSetSize_params_st { /* Argument record named for cuParamSetSize; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    unsigned int numbytes;
+} cuParamSetSize_params;
+
+typedef struct cuParamSeti_params_st { /* Argument record named for cuParamSeti; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    int offset;
+    unsigned int value;
+} cuParamSeti_params;
+
+typedef struct cuParamSetf_params_st { /* Argument record named for cuParamSetf; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    int offset;
+    float value;
+} cuParamSetf_params;
+
+typedef struct cuParamSetv_params_st { /* Argument record named for cuParamSetv; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    int offset;
+    void *ptr;
+    unsigned int numbytes;
+} cuParamSetv_params;
+
+typedef struct cuLaunch_params_st { /* Argument record named for cuLaunch; carries a single CUfunction handle. */
+    CUfunction f;
+} cuLaunch_params;
+
+typedef struct cuLaunchGrid_params_st { /* Argument record named for cuLaunchGrid; presumably mirrors that call's parameter order — verify. */
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+} cuLaunchGrid_params;
+
+typedef struct cuLaunchGridAsync_params_st { /* Argument record named for cuLaunchGridAsync; same fields as cuLaunchGrid_params plus a trailing stream handle. */
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+    CUstream hStream;
+} cuLaunchGridAsync_params;
+
+typedef struct cuParamSetTexRef_params_st { /* Argument record named for cuParamSetTexRef; presumably mirrors that call's parameter order — verify. */
+    CUfunction hfunc;
+    int texunit;
+    CUtexref hTexRef;
+} cuParamSetTexRef_params;
+
+typedef struct cuGraphCreate_params_st { /* Argument record named for cuGraphCreate; presumably mirrors that call's parameter order — verify. */
+    CUgraph *phGraph;
+    unsigned int flags;
+} cuGraphCreate_params;
+
+typedef struct cuGraphAddKernelNode_params_st { /* Argument record named for cuGraphAddKernelNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphAddKernelNode_params;
+
+typedef struct cuGraphKernelNodeGetParams_params_st { /* Argument record named for cuGraphKernelNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeGetParams_params;
+
+typedef struct cuGraphKernelNodeSetParams_params_st { /* Argument record named for cuGraphKernelNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeSetParams_params;
+
+typedef struct cuGraphAddMemcpyNode_params_st { /* Argument record named for cuGraphAddMemcpyNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphAddMemcpyNode_params;
+
+typedef struct cuGraphMemcpyNodeGetParams_params_st { /* Argument record named for cuGraphMemcpyNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeGetParams_params;
+
+typedef struct cuGraphMemcpyNodeSetParams_params_st { /* Argument record named for cuGraphMemcpyNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeSetParams_params;
+
+typedef struct cuGraphAddMemsetNode_params_st { /* Argument record named for cuGraphAddMemsetNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphAddMemsetNode_params;
+
+typedef struct cuGraphMemsetNodeGetParams_params_st { /* Argument record named for cuGraphMemsetNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeGetParams_params;
+
+typedef struct cuGraphMemsetNodeSetParams_params_st { /* Argument record named for cuGraphMemsetNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeSetParams_params;
+
+typedef struct cuGraphAddHostNode_params_st { /* Argument record named for cuGraphAddHostNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphAddHostNode_params;
+
+typedef struct cuGraphHostNodeGetParams_params_st { /* Argument record named for cuGraphHostNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeGetParams_params;
+
+typedef struct cuGraphHostNodeSetParams_params_st { /* Argument record named for cuGraphHostNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeSetParams_params;
+
+typedef struct cuGraphAddChildGraphNode_params_st { /* Argument record named for cuGraphAddChildGraphNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    CUgraph childGraph;
+} cuGraphAddChildGraphNode_params;
+
+typedef struct cuGraphChildGraphNodeGetGraph_params_st { /* Argument record named for cuGraphChildGraphNodeGetGraph; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUgraph *phGraph;
+} cuGraphChildGraphNodeGetGraph_params;
+
+typedef struct cuGraphAddEmptyNode_params_st { /* Argument record named for cuGraphAddEmptyNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+} cuGraphAddEmptyNode_params;
+
+typedef struct cuGraphAddEventRecordNode_params_st { /* Argument record named for cuGraphAddEventRecordNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    CUevent event;
+} cuGraphAddEventRecordNode_params;
+
+typedef struct cuGraphEventRecordNodeGetEvent_params_st { /* Argument record named for cuGraphEventRecordNodeGetEvent; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventRecordNodeGetEvent_params;
+
+typedef struct cuGraphEventRecordNodeSetEvent_params_st { /* Argument record named for cuGraphEventRecordNodeSetEvent; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphAddEventWaitNode_params_st { /* Argument record named for cuGraphAddEventWaitNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    CUevent event;
+} cuGraphAddEventWaitNode_params;
+
+typedef struct cuGraphEventWaitNodeGetEvent_params_st { /* Argument record named for cuGraphEventWaitNodeGetEvent; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventWaitNodeGetEvent_params;
+
+typedef struct cuGraphEventWaitNodeSetEvent_params_st { /* Argument record named for cuGraphEventWaitNodeSetEvent; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphAddExternalSemaphoresSignalNode_params_st { /* Argument record named for cuGraphAddExternalSemaphoresSignalNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresSignalNode_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeGetParams_params_st { /* Argument record named for cuGraphExternalSemaphoresSignalNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresSignalNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeSetParams_params_st { /* Argument record named for cuGraphExternalSemaphoresSignalNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphAddExternalSemaphoresWaitNode_params_st { /* Argument record named for cuGraphAddExternalSemaphoresWaitNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresWaitNode_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeGetParams_params_st { /* Argument record named for cuGraphExternalSemaphoresWaitNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresWaitNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeSetParams_params_st { /* Argument record named for cuGraphExternalSemaphoresWaitNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphAddMemAllocNode_params_st { /* Argument record named for cuGraphAddMemAllocNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams; /* non-const, unlike nodeParams in the other cuGraphAdd*Node records */
+} cuGraphAddMemAllocNode_params;
+
+typedef struct cuGraphMemAllocNodeGetParams_params_st { /* Argument record named for cuGraphMemAllocNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUDA_MEM_ALLOC_NODE_PARAMS *params_out;
+} cuGraphMemAllocNodeGetParams_params;
+
+typedef struct cuGraphAddMemFreeNode_params_st { /* Argument record named for cuGraphAddMemFreeNode; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies; /* presumably the number of dependencies entries — verify */
+    CUdeviceptr dptr;
+} cuGraphAddMemFreeNode_params;
+
+typedef struct cuGraphMemFreeNodeGetParams_params_st { /* Argument record named for cuGraphMemFreeNodeGetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUdeviceptr *dptr_out;
+} cuGraphMemFreeNodeGetParams_params;
+
+typedef struct cuDeviceGraphMemTrim_params_st { /* Argument record named for cuDeviceGraphMemTrim; carries a single CUdevice. */
+    CUdevice device;
+} cuDeviceGraphMemTrim_params;
+
+typedef struct cuDeviceGetGraphMemAttribute_params_st { /* Argument record named for cuDeviceGetGraphMemAttribute; presumably mirrors that call's parameter order — verify. */
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value; /* untyped pointer; pointee type presumably depends on attr — verify */
+} cuDeviceGetGraphMemAttribute_params;
+
+typedef struct cuDeviceSetGraphMemAttribute_params_st { /* Argument record named for cuDeviceSetGraphMemAttribute; presumably mirrors that call's parameter order — verify. */
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value; /* untyped pointer; pointee type presumably depends on attr — verify */
+} cuDeviceSetGraphMemAttribute_params;
+
+typedef struct cuGraphClone_params_st { /* Argument record named for cuGraphClone; presumably mirrors that call's parameter order — verify. */
+    CUgraph *phGraphClone;
+    CUgraph originalGraph;
+} cuGraphClone_params;
+
+typedef struct cuGraphNodeFindInClone_params_st { /* Argument record named for cuGraphNodeFindInClone; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode *phNode;
+    CUgraphNode hOriginalNode;
+    CUgraph hClonedGraph;
+} cuGraphNodeFindInClone_params;
+
+typedef struct cuGraphNodeGetType_params_st { /* Argument record named for cuGraphNodeGetType; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUgraphNodeType *type;
+} cuGraphNodeGetType_params;
+
+typedef struct cuGraphGetNodes_params_st { /* Argument record named for cuGraphGetNodes; presumably mirrors that call's parameter order — verify. */
+    CUgraph hGraph;
+    CUgraphNode *nodes;
+    size_t *numNodes;
+} cuGraphGetNodes_params;
+
+typedef struct cuGraphGetRootNodes_params_st { /* Argument record named for cuGraphGetRootNodes; presumably mirrors that call's parameter order — verify. */
+    CUgraph hGraph;
+    CUgraphNode *rootNodes;
+    size_t *numRootNodes;
+} cuGraphGetRootNodes_params;
+
+typedef struct cuGraphGetEdges_params_st { /* Argument record named for cuGraphGetEdges; presumably mirrors that call's parameter order — verify. */
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    size_t *numEdges;
+} cuGraphGetEdges_params;
+
+typedef struct cuGraphNodeGetDependencies_params_st { /* Argument record named for cuGraphNodeGetDependencies; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_params;
+
+typedef struct cuGraphNodeGetDependentNodes_params_st { /* Argument record named for cuGraphNodeGetDependentNodes; presumably mirrors that call's parameter order — verify. */
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_params;
+
+typedef struct cuGraphAddDependencies_params_st { /* Argument record named for cuGraphAddDependencies; presumably mirrors that call's parameter order — verify. */
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies; /* presumably the length of both from and to — verify */
+} cuGraphAddDependencies_params;
+
+typedef struct cuGraphRemoveDependencies_params_st { /* Argument record named for cuGraphRemoveDependencies; presumably mirrors that call's parameter order — verify. */
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies; /* presumably the length of both from and to — verify */
+} cuGraphRemoveDependencies_params;
+
+typedef struct cuGraphDestroyNode_params_st { /* Argument record named for cuGraphDestroyNode; carries a single CUgraphNode handle. */
+    CUgraphNode hNode;
+} cuGraphDestroyNode_params;
+
+typedef struct cuGraphInstantiate_v2_params_st { /* Argument record named for cuGraphInstantiate_v2; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize; /* presumably the capacity of logBuffer — verify */
+} cuGraphInstantiate_v2_params;
+
+typedef struct cuGraphInstantiateWithFlags_params_st { /* Argument record named for cuGraphInstantiateWithFlags; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    unsigned long long flags;
+} cuGraphInstantiateWithFlags_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_params_st { /* Argument record named for cuGraphExecKernelNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphExecKernelNodeSetParams_params;
+
+typedef struct cuGraphExecMemcpyNodeSetParams_params_st { /* Argument record named for cuGraphExecMemcpyNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphExecMemcpyNodeSetParams_params;
+
+typedef struct cuGraphExecMemsetNodeSetParams_params_st { /* Argument record named for cuGraphExecMemsetNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphExecMemsetNodeSetParams_params;
+
+typedef struct cuGraphExecHostNodeSetParams_params_st { /* Argument record named for cuGraphExecHostNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphExecHostNodeSetParams_params;
+
+typedef struct cuGraphExecChildGraphNodeSetParams_params_st { /* Argument record named for cuGraphExecChildGraphNodeSetParams; presumably mirrors that call's parameter order — verify. */
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraph childGraph;
+} cuGraphExecChildGraphNodeSetParams_params;
+
+typedef struct cuGraphExecEventRecordNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphExecEventWaitNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphExecExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphExecExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphNodeSetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int isEnabled;
+} cuGraphNodeSetEnabled_params;
+
+typedef struct cuGraphNodeGetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int *isEnabled;
+} cuGraphNodeGetEnabled_params;
+
+typedef struct cuGraphUpload_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphUpload_ptsz_params;
+
+typedef struct cuGraphLaunch_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphLaunch_ptsz_params;
+
+typedef struct cuGraphExecDestroy_params_st {
+    CUgraphExec hGraphExec;
+} cuGraphExecDestroy_params;
+
+typedef struct cuGraphDestroy_params_st {
+    CUgraph hGraph;
+} cuGraphDestroy_params;
+
+typedef struct cuGraphExecUpdate_params_st {
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *hErrorNode_out;
+    CUgraphExecUpdateResult *updateResult_out;
+} cuGraphExecUpdate_params;
+
+typedef struct cuGraphKernelNodeCopyAttributes_params_st {
+    CUgraphNode dst;
+    CUgraphNode src;
+} cuGraphKernelNodeCopyAttributes_params;
+
+typedef struct cuGraphKernelNodeGetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    CUkernelNodeAttrValue *value_out;
+} cuGraphKernelNodeGetAttribute_params;
+
+typedef struct cuGraphKernelNodeSetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    const CUkernelNodeAttrValue *value;
+} cuGraphKernelNodeSetAttribute_params;
+
+typedef struct cuGraphDebugDotPrint_params_st {
+    CUgraph hGraph;
+    const char *path;
+    unsigned int flags;
+} cuGraphDebugDotPrint_params;
+
+typedef struct cuUserObjectCreate_params_st {
+    CUuserObject *object_out;
+    void *ptr;
+    CUhostFn destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cuUserObjectCreate_params;
+
+typedef struct cuUserObjectRetain_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRetain_params;
+
+typedef struct cuUserObjectRelease_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRelease_params;
+
+typedef struct cuGraphRetainUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+    unsigned int flags;
+} cuGraphRetainUserObject_params;
+
+typedef struct cuGraphReleaseUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+} cuGraphReleaseUserObject_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessor_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cuOccupancyMaxActiveBlocksPerMultiprocessor_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSize_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+} cuOccupancyMaxPotentialBlockSize_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSizeWithFlags_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+    unsigned int flags;
+} cuOccupancyMaxPotentialBlockSizeWithFlags_params;
+
+typedef struct cuOccupancyAvailableDynamicSMemPerBlock_params_st {
+    size_t *dynamicSmemSize;
+    CUfunction func;
+    int numBlocks;
+    int blockSize;
+} cuOccupancyAvailableDynamicSMemPerBlock_params;
+
+typedef struct cuOccupancyMaxPotentialClusterSize_params_st {
+    int *clusterSize;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxPotentialClusterSize_params;
+
+typedef struct cuOccupancyMaxActiveClusters_params_st {
+    int *numClusters;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxActiveClusters_params;
+
+typedef struct cuTexRefSetArray_params_st {
+    CUtexref hTexRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuTexRefSetArray_params;
+
+typedef struct cuTexRefSetMipmappedArray_params_st {
+    CUtexref hTexRef;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int Flags;
+} cuTexRefSetMipmappedArray_params;
+
+typedef struct cuTexRefSetAddress_v2_params_st {
+    size_t *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr dptr;
+    size_t bytes;
+} cuTexRefSetAddress_v2_params;
+
+typedef struct cuTexRefSetAddress2D_v3_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v3_params;
+
+typedef struct cuTexRefSetFormat_params_st {
+    CUtexref hTexRef;
+    CUarray_format fmt;
+    int NumPackedComponents;
+} cuTexRefSetFormat_params;
+
+typedef struct cuTexRefSetAddressMode_params_st {
+    CUtexref hTexRef;
+    int dim;
+    CUaddress_mode am;
+} cuTexRefSetAddressMode_params;
+
+typedef struct cuTexRefSetFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetFilterMode_params;
+
+typedef struct cuTexRefSetMipmapFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetMipmapFilterMode_params;
+
+typedef struct cuTexRefSetMipmapLevelBias_params_st {
+    CUtexref hTexRef;
+    float bias;
+} cuTexRefSetMipmapLevelBias_params;
+
+typedef struct cuTexRefSetMipmapLevelClamp_params_st {
+    CUtexref hTexRef;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+} cuTexRefSetMipmapLevelClamp_params;
+
+typedef struct cuTexRefSetMaxAnisotropy_params_st {
+    CUtexref hTexRef;
+    unsigned int maxAniso;
+} cuTexRefSetMaxAnisotropy_params;
+
+typedef struct cuTexRefSetBorderColor_params_st {
+    CUtexref hTexRef;
+    float *pBorderColor;
+} cuTexRefSetBorderColor_params;
+
+typedef struct cuTexRefSetFlags_params_st {
+    CUtexref hTexRef;
+    unsigned int Flags;
+} cuTexRefSetFlags_params;
+
+typedef struct cuTexRefGetAddress_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_v2_params;
+
+typedef struct cuTexRefGetArray_params_st {
+    CUarray *phArray;
+    CUtexref hTexRef;
+} cuTexRefGetArray_params;
+
+typedef struct cuTexRefGetMipmappedArray_params_st {
+    CUmipmappedArray *phMipmappedArray;
+    CUtexref hTexRef;
+} cuTexRefGetMipmappedArray_params;
+
+typedef struct cuTexRefGetAddressMode_params_st {
+    CUaddress_mode *pam;
+    CUtexref hTexRef;
+    int dim;
+} cuTexRefGetAddressMode_params;
+
+typedef struct cuTexRefGetFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetFilterMode_params;
+
+typedef struct cuTexRefGetFormat_params_st {
+    CUarray_format *pFormat;
+    int *pNumChannels;
+    CUtexref hTexRef;
+} cuTexRefGetFormat_params;
+
+typedef struct cuTexRefGetMipmapFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapFilterMode_params;
+
+typedef struct cuTexRefGetMipmapLevelBias_params_st {
+    float *pbias;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelBias_params;
+
+typedef struct cuTexRefGetMipmapLevelClamp_params_st {
+    float *pminMipmapLevelClamp;
+    float *pmaxMipmapLevelClamp;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelClamp_params;
+
+typedef struct cuTexRefGetMaxAnisotropy_params_st {
+    int *pmaxAniso;
+    CUtexref hTexRef;
+} cuTexRefGetMaxAnisotropy_params;
+
+typedef struct cuTexRefGetBorderColor_params_st {
+    float *pBorderColor;
+    CUtexref hTexRef;
+} cuTexRefGetBorderColor_params;
+
+typedef struct cuTexRefGetFlags_params_st {
+    unsigned int *pFlags;
+    CUtexref hTexRef;
+} cuTexRefGetFlags_params;
+
+typedef struct cuTexRefCreate_params_st {
+    CUtexref *pTexRef;
+} cuTexRefCreate_params;
+
+typedef struct cuTexRefDestroy_params_st {
+    CUtexref hTexRef;
+} cuTexRefDestroy_params;
+
+typedef struct cuSurfRefSetArray_params_st {
+    CUsurfref hSurfRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuSurfRefSetArray_params;
+
+typedef struct cuSurfRefGetArray_params_st {
+    CUarray *phArray;
+    CUsurfref hSurfRef;
+} cuSurfRefGetArray_params;
+
+typedef struct cuTexObjectCreate_params_st {
+    CUtexObject *pTexObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+    const CUDA_TEXTURE_DESC *pTexDesc;
+    const CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+} cuTexObjectCreate_params;
+
+typedef struct cuTexObjectDestroy_params_st {
+    CUtexObject texObject;
+} cuTexObjectDestroy_params;
+
+typedef struct cuTexObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceDesc_params;
+
+typedef struct cuTexObjectGetTextureDesc_params_st {
+    CUDA_TEXTURE_DESC *pTexDesc;
+    CUtexObject texObject;
+} cuTexObjectGetTextureDesc_params;
+
+typedef struct cuTexObjectGetResourceViewDesc_params_st {
+    CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceViewDesc_params;
+
+typedef struct cuSurfObjectCreate_params_st {
+    CUsurfObject *pSurfObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+} cuSurfObjectCreate_params;
+
+typedef struct cuSurfObjectDestroy_params_st {
+    CUsurfObject surfObject;
+} cuSurfObjectDestroy_params;
+
+typedef struct cuSurfObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUsurfObject surfObject;
+} cuSurfObjectGetResourceDesc_params;
+
+typedef struct cuDeviceCanAccessPeer_params_st {
+    int *canAccessPeer;
+    CUdevice dev;
+    CUdevice peerDev;
+} cuDeviceCanAccessPeer_params;
+
+typedef struct cuCtxEnablePeerAccess_params_st {
+    CUcontext peerContext;
+    unsigned int Flags;
+} cuCtxEnablePeerAccess_params;
+
+typedef struct cuCtxDisablePeerAccess_params_st {
+    CUcontext peerContext;
+} cuCtxDisablePeerAccess_params;
+
+typedef struct cuDeviceGetP2PAttribute_params_st {
+    int *value;
+    CUdevice_P2PAttribute attrib;
+    CUdevice srcDevice;
+    CUdevice dstDevice;
+} cuDeviceGetP2PAttribute_params;
+
+typedef struct cuGraphicsUnregisterResource_params_st {
+    CUgraphicsResource resource;
+} cuGraphicsUnregisterResource_params;
+
+typedef struct cuGraphicsSubResourceGetMappedArray_params_st {
+    CUarray *pArray;
+    CUgraphicsResource resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cuGraphicsSubResourceGetMappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *pMipmappedArray;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedMipmappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_v2_params_st {
+    CUdeviceptr *pDevPtr;
+    size_t *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_v2_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_v2_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_v2_params;
+
+typedef struct cuGraphicsMapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_ptsz_params;
+
+typedef struct cuGraphicsUnmapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_ptsz_params;
+
+typedef struct cuGetProcAddress_params_st {
+    const char *symbol;
+    void **pfn;
+    int cudaVersion;
+    cuuint64_t flags;
+} cuGetProcAddress_params;
+
+typedef struct cuModuleGetLoadingMode_params_st {
+    CUmoduleLoadingMode *mode;
+} cuModuleGetLoadingMode_params;
+
+typedef struct cuMemGetHandleForAddressRange_params_st {
+    void *handle;
+    CUdeviceptr dptr;
+    size_t size;
+    CUmemRangeHandleType handleType;
+    unsigned long long flags;
+} cuMemGetHandleForAddressRange_params;
+
+typedef struct cuGetExportTable_params_st {
+    const void **ppExportTable;
+    const CUuuid *pExportTableId;
+} cuGetExportTable_params;
+
+typedef struct cuMemHostRegister_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_params;
+
+typedef struct cuLinkCreate_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_params;
+
+typedef struct cuLinkAddData_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_params;
+
+typedef struct cuLinkAddFile_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_params;
+
+typedef struct cuTexRefSetAddress2D_v2_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v2_params;
+
+typedef struct cuDeviceTotalMem_params_st {
+    unsigned int *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_params;
+
+typedef struct cuCtxCreate_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_params;
+
+typedef struct cuModuleGetGlobal_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_params;
+
+typedef struct cuMemGetInfo_params_st {
+    unsigned int *free;
+    unsigned int *total;
+} cuMemGetInfo_params;
+
+typedef struct cuMemAlloc_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int bytesize;
+} cuMemAlloc_params;
+
+typedef struct cuMemAllocPitch_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *pPitch;
+    unsigned int WidthInBytes;
+    unsigned int Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_params;
+
+typedef struct cuMemFree_params_st {
+    CUdeviceptr_v1 dptr;
+} cuMemFree_params;
+
+typedef struct cuMemGetAddressRange_params_st {
+    CUdeviceptr_v1 *pbase;
+    unsigned int *psize;
+    CUdeviceptr_v1 dptr;
+} cuMemGetAddressRange_params;
+
+typedef struct cuMemAllocHost_params_st {
+    void **pp;
+    unsigned int bytesize;
+} cuMemAllocHost_params;
+
+typedef struct cuMemHostGetDevicePointer_params_st {
+    CUdeviceptr_v1 *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_params;
+
+typedef struct cuMemcpyHtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoD_params;
+
+typedef struct cuMemcpyDtoH_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoH_params;
+
+typedef struct cuMemcpyDtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoD_params;
+
+typedef struct cuMemcpyDtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoA_params;
+
+typedef struct cuMemcpyAtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoD_params;
+
+typedef struct cuMemcpyHtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoA_params;
+
+typedef struct cuMemcpyAtoH_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoH_params;
+
+typedef struct cuMemcpyAtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoA_params;
+
+typedef struct cuMemcpyHtoAAsync_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_params;
+
+typedef struct cuMemcpyAtoHAsync_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_params;
+
+typedef struct cuMemcpy2D_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2D_params;
+
+typedef struct cuMemcpy2DUnaligned_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2DUnaligned_params;
+
+typedef struct cuMemcpy3D_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+} cuMemcpy3D_params;
+
+typedef struct cuMemcpyHtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_params;
+
+typedef struct cuMemcpyDtoHAsync_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_params;
+
+typedef struct cuMemcpyDtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_params;
+
+typedef struct cuMemcpy2DAsync_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_params;
+
+typedef struct cuMemcpy3DAsync_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_params;
+
+typedef struct cuMemsetD8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned char uc;
+    unsigned int N;
+} cuMemsetD8_params;
+
+typedef struct cuMemsetD16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned short us;
+    unsigned int N;
+} cuMemsetD16_params;
+
+typedef struct cuMemsetD32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int ui;
+    unsigned int N;
+} cuMemsetD32_params;
+
+typedef struct cuMemsetD2D8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned char uc;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D8_params;
+
+typedef struct cuMemsetD2D16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned short us;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D16_params;
+
+typedef struct cuMemsetD2D32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned int ui;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D32_params;
+
+typedef struct cuArrayCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray;
+} cuArrayCreate_params;
+
+typedef struct cuArrayGetDescriptor_params_st {
+    CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_params;
+
+typedef struct cuArray3DCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray;
+} cuArray3DCreate_params;
+
+typedef struct cuArray3DGetDescriptor_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_params;
+
+typedef struct cuTexRefSetAddress_params_st {
+    unsigned int *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr_v1 dptr;
+    unsigned int bytes;
+} cuTexRefSetAddress_params;
+
+typedef struct cuTexRefSetAddress2D_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *desc;
+    CUdeviceptr_v1 dptr;
+    unsigned int Pitch;
+} cuTexRefSetAddress2D_params;
+
+typedef struct cuTexRefGetAddress_params_st {
+    CUdeviceptr_v1 *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_params_st {
+    CUdeviceptr_v1 *pDevPtr;
+    unsigned int *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_params;
+
+typedef struct cuCtxDestroy_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_params;
+
+typedef struct cuCtxPopCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_params;
+
+typedef struct cuCtxPushCurrent_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_params;
+
+typedef struct cuStreamDestroy_params_st {
+    CUstream hStream;
+} cuStreamDestroy_params;
+
+typedef struct cuEventDestroy_params_st {
+    CUevent hEvent;
+} cuEventDestroy_params;
+
+typedef struct cuDevicePrimaryCtxRelease_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_params;
+
+typedef struct cuDevicePrimaryCtxReset_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_params;
+
+typedef struct cuMemcpyHtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_params;
+
+typedef struct cuMemcpyDtoH_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_params;
+
+typedef struct cuMemcpyDtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_params;
+
+typedef struct cuMemcpyDtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_params;
+
+typedef struct cuMemcpyAtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_params;
+
+typedef struct cuMemcpyHtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_params;
+
+typedef struct cuMemcpyAtoH_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_params;
+
+typedef struct cuMemcpyAtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_params;
+
+typedef struct cuMemcpy2D_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_params;
+
+typedef struct cuMemcpy3D_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_params;
+
+typedef struct cuMemcpy2DAsync_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_params;
+
+typedef struct cuMemcpy3DAsync_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_params;
+
+typedef struct cuMemsetD8_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_params;
+
+typedef struct cuMemsetD16_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_params;
+
+typedef struct cuMemsetD32_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_params;
+
+typedef struct cuMemsetD2D8_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_params;
+
+typedef struct cuMemsetD2D16_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_params;
+
+typedef struct cuMemsetD2D32_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_params;
+
+typedef struct cuMemcpy_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_params;
+
+typedef struct cuMemcpyAsync_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_params;
+
+typedef struct cuMemcpyPeer_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_params;
+
+typedef struct cuMemcpyPeerAsync_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_params;
+
+typedef struct cuMemcpy3DPeer_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_params;
+
+typedef struct cuMemcpy3DPeerAsync_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_params;
+
+typedef struct cuMemsetD8Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_params;
+
+typedef struct cuMemsetD16Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_params;
+
+typedef struct cuMemsetD32Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_params;
+
+typedef struct cuMemsetD2D8Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_params;
+
+typedef struct cuMemsetD2D16Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_params;
+
+typedef struct cuMemsetD2D32Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_params;
+
+typedef struct cuStreamGetPriority_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_params;
+
+typedef struct cuStreamGetFlags_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_params;
+
+typedef struct cuStreamGetCtx_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_params;
+
+typedef struct cuStreamWaitEvent_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_params;
+
+typedef struct cuStreamAddCallback_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_params;
+
+typedef struct cuStreamAttachMemAsync_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_params;
+
+typedef struct cuStreamQuery_params_st {
+    CUstream hStream;
+} cuStreamQuery_params;
+
+typedef struct cuStreamSynchronize_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_params;
+
+typedef struct cuEventRecord_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_params;
+
+typedef struct cuEventRecordWithFlags_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_params;
+
+typedef struct cuLaunchKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_params;
+
+typedef struct cuLaunchKernelEx_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_params;
+
+typedef struct cuLaunchHostFunc_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_params;
+
+typedef struct cuGraphicsMapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_params;
+
+typedef struct cuGraphicsUnmapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_params;
+
+typedef struct cuStreamWriteValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_params;
+
+typedef struct cuStreamWaitValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_params;
+
+typedef struct cuStreamWriteValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_params;
+
+typedef struct cuStreamWaitValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_params;
+
+typedef struct cuStreamBatchMemOp_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_params;
+
+typedef struct cuMemPrefetchAsync_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_params;
+
+typedef struct cuLaunchCooperativeKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_params;
+
+typedef struct cuStreamBeginCapture_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_params;
+
+typedef struct cuStreamBeginCapture_ptsz_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_params;
+
+typedef struct cuStreamEndCapture_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_params;
+
+typedef struct cuStreamIsCapturing_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_params;
+
+typedef struct cuStreamGetCaptureInfo_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_params;
+
+typedef struct cuGraphUpload_params_st {
+    CUgraphExec hGraph;
+    CUstream hStream;
+} cuGraphUpload_params;
+
+typedef struct cuGraphLaunch_params_st {
+    CUgraphExec hGraph;
+    CUstream hStream;
+} cuGraphLaunch_params;
+
+typedef struct cuStreamCopyAttributes_params_st {
+    CUstream dstStream;
+    CUstream srcStream;
+} cuStreamCopyAttributes_params;
+
+typedef struct cuStreamGetAttribute_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value;
+} cuStreamGetAttribute_params;
+
+typedef struct cuStreamSetAttribute_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *param;
+} cuStreamSetAttribute_params;
+
+typedef struct cuIpcOpenMemHandle_params_st {
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_params;
+
+typedef struct cuGraphInstantiate_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;
+} cuGraphInstantiate_params;
+
+typedef struct cuMemMapArrayAsync_params_st {
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;
+    CUstream hStream;
+} cuMemMapArrayAsync_params;
+
+typedef struct cuMemFreeAsync_params_st {
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_params;
+
+typedef struct cuMemAllocAsync_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_params;
+
+typedef struct cuMemAllocFromPoolAsync_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_params;
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..69cea3735f87455a668bae914f86b06deb157106
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h
@@ -0,0 +1,2139 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_runtime_api.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaDeviceSetLimit_v3020_params_st {
+    enum cudaLimit limit;
+    size_t value;
+} cudaDeviceSetLimit_v3020_params;
+
+typedef struct cudaDeviceGetLimit_v3020_params_st {
+    size_t *pValue;
+    enum cudaLimit limit;
+} cudaDeviceGetLimit_v3020_params;
+
+typedef struct cudaDeviceGetTexture1DLinearMaxWidth_v11010_params_st {
+    size_t *maxWidthInElements;
+    const struct cudaChannelFormatDesc *fmtDesc;
+    int device;
+} cudaDeviceGetTexture1DLinearMaxWidth_v11010_params;
+
+typedef struct cudaDeviceGetCacheConfig_v3020_params_st {
+    enum cudaFuncCache *pCacheConfig;
+} cudaDeviceGetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetStreamPriorityRange_v5050_params_st {
+    int *leastPriority;
+    int *greatestPriority;
+} cudaDeviceGetStreamPriorityRange_v5050_params;
+
+typedef struct cudaDeviceSetCacheConfig_v3020_params_st {
+    enum cudaFuncCache cacheConfig;
+} cudaDeviceSetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetSharedMemConfig_v4020_params_st {
+    enum cudaSharedMemConfig *pConfig;
+} cudaDeviceGetSharedMemConfig_v4020_params;
+
+typedef struct cudaDeviceSetSharedMemConfig_v4020_params_st {
+    enum cudaSharedMemConfig config;
+} cudaDeviceSetSharedMemConfig_v4020_params;
+
+typedef struct cudaDeviceGetByPCIBusId_v4010_params_st {
+    int *device;
+    const char *pciBusId;
+} cudaDeviceGetByPCIBusId_v4010_params;
+
+typedef struct cudaDeviceGetPCIBusId_v4010_params_st {
+    char *pciBusId;
+    int len;
+    int device;
+} cudaDeviceGetPCIBusId_v4010_params;
+
+typedef struct cudaIpcGetEventHandle_v4010_params_st {
+    cudaIpcEventHandle_t *handle;
+    cudaEvent_t event;
+} cudaIpcGetEventHandle_v4010_params;
+
+typedef struct cudaIpcOpenEventHandle_v4010_params_st {
+    cudaEvent_t *event;
+    cudaIpcEventHandle_t handle;
+} cudaIpcOpenEventHandle_v4010_params;
+
+typedef struct cudaIpcGetMemHandle_v4010_params_st {
+    cudaIpcMemHandle_t *handle;
+    void *devPtr;
+} cudaIpcGetMemHandle_v4010_params;
+
+typedef struct cudaIpcOpenMemHandle_v4010_params_st {
+    void **devPtr;
+    cudaIpcMemHandle_t handle;
+    unsigned int flags;
+} cudaIpcOpenMemHandle_v4010_params;
+
+typedef struct cudaIpcCloseMemHandle_v4010_params_st {
+    void *devPtr;
+} cudaIpcCloseMemHandle_v4010_params;
+
+typedef struct cudaDeviceFlushGPUDirectRDMAWrites_v11030_params_st {
+    enum cudaFlushGPUDirectRDMAWritesTarget target;
+    enum cudaFlushGPUDirectRDMAWritesScope scope;
+} cudaDeviceFlushGPUDirectRDMAWrites_v11030_params;
+
+typedef struct cudaGetErrorName_v6050_params_st {
+    cudaError_t error;
+} cudaGetErrorName_v6050_params;
+
+typedef struct cudaGetErrorString_v3020_params_st {
+    cudaError_t error;
+} cudaGetErrorString_v3020_params;
+
+typedef struct cudaGetDeviceCount_v3020_params_st {
+    int *count;
+} cudaGetDeviceCount_v3020_params;
+
+typedef struct cudaGetDeviceProperties_v3020_params_st {
+    struct cudaDeviceProp *prop;
+    int device;
+} cudaGetDeviceProperties_v3020_params;
+
+typedef struct cudaDeviceGetAttribute_v5000_params_st {
+    int *value;
+    enum cudaDeviceAttr attr;
+    int device;
+} cudaDeviceGetAttribute_v5000_params;
+
+typedef struct cudaDeviceGetDefaultMemPool_v11020_params_st {
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetDefaultMemPool_v11020_params;
+
+typedef struct cudaDeviceSetMemPool_v11020_params_st {
+    int device;
+    cudaMemPool_t memPool;
+} cudaDeviceSetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetMemPool_v11020_params_st {
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetNvSciSyncAttributes_v10020_params_st {
+    void *nvSciSyncAttrList;
+    int device;
+    int flags;
+} cudaDeviceGetNvSciSyncAttributes_v10020_params;
+
+typedef struct cudaDeviceGetP2PAttribute_v8000_params_st {
+    int *value;
+    enum cudaDeviceP2PAttr attr;
+    int srcDevice;
+    int dstDevice;
+} cudaDeviceGetP2PAttribute_v8000_params;
+
+typedef struct cudaChooseDevice_v3020_params_st {
+    int *device;
+    const struct cudaDeviceProp *prop;
+} cudaChooseDevice_v3020_params;
+
+typedef struct cudaSetDevice_v3020_params_st {
+    int device;
+} cudaSetDevice_v3020_params;
+
+typedef struct cudaGetDevice_v3020_params_st {
+    int *device;
+} cudaGetDevice_v3020_params;
+
+typedef struct cudaSetValidDevices_v3020_params_st {
+    int *device_arr;
+    int len;
+} cudaSetValidDevices_v3020_params;
+
+typedef struct cudaSetDeviceFlags_v3020_params_st {
+    unsigned int flags;
+} cudaSetDeviceFlags_v3020_params;
+
+typedef struct cudaGetDeviceFlags_v7000_params_st {
+    unsigned int *flags;
+} cudaGetDeviceFlags_v7000_params;
+
+typedef struct cudaStreamCreate_v3020_params_st {
+    cudaStream_t *pStream;
+} cudaStreamCreate_v3020_params;
+
+typedef struct cudaStreamCreateWithFlags_v5000_params_st {
+    cudaStream_t *pStream;
+    unsigned int flags;
+} cudaStreamCreateWithFlags_v5000_params;
+
+typedef struct cudaStreamCreateWithPriority_v5050_params_st {
+    cudaStream_t *pStream;
+    unsigned int flags;
+    int priority;
+} cudaStreamCreateWithPriority_v5050_params;
+
+typedef struct cudaStreamGetPriority_ptsz_v7000_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_ptsz_v7000_params;
+
+typedef struct cudaStreamGetFlags_ptsz_v7000_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_ptsz_v7000_params;
+
+typedef struct cudaStreamCopyAttributes_ptsz_v11000_params_st {
+    cudaStream_t dst;
+    cudaStream_t src;
+} cudaStreamCopyAttributes_ptsz_v11000_params;
+
+typedef struct cudaStreamGetAttribute_ptsz_v11000_params_st {
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value_out;
+} cudaStreamGetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamSetAttribute_ptsz_v11000_params_st {
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *value;
+} cudaStreamSetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamDestroy_v5050_params_st {
+    cudaStream_t stream;
+} cudaStreamDestroy_v5050_params;
+
+typedef struct cudaStreamWaitEvent_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_ptsz_v7000_params;
+
+typedef struct cudaStreamAddCallback_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_ptsz_v7000_params;
+
+typedef struct cudaStreamSynchronize_ptsz_v7000_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_ptsz_v7000_params;
+
+typedef struct cudaStreamQuery_ptsz_v7000_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_ptsz_v7000_params;
+
+typedef struct cudaStreamAttachMemAsync_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_ptsz_v7000_params;
+
+typedef struct cudaStreamBeginCapture_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_ptsz_v10000_params;
+
+typedef struct cudaThreadExchangeStreamCaptureMode_v10010_params_st {
+    enum cudaStreamCaptureMode *mode;
+} cudaThreadExchangeStreamCaptureMode_v10010_params;
+
+typedef struct cudaStreamEndCapture_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamIsCapturing_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_ptsz_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_ptsz_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+    unsigned long long *pId;
+} cudaStreamGetCaptureInfo_ptsz_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_ptsz_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_ptsz_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v11030_params;
+
+typedef struct cudaEventCreate_v3020_params_st {
+    cudaEvent_t *event;
+} cudaEventCreate_v3020_params;
+
+typedef struct cudaEventCreateWithFlags_v3020_params_st {
+    cudaEvent_t *event;
+    unsigned int flags;
+} cudaEventCreateWithFlags_v3020_params;
+
+typedef struct cudaEventRecord_ptsz_v7000_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_ptsz_v7000_params;
+
+typedef struct cudaEventRecordWithFlags_ptsz_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_ptsz_v11010_params;
+
+typedef struct cudaEventQuery_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventQuery_v3020_params;
+
+typedef struct cudaEventSynchronize_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventSynchronize_v3020_params;
+
+typedef struct cudaEventDestroy_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventDestroy_v3020_params;
+
+typedef struct cudaEventElapsedTime_v3020_params_st {
+    float *ms;
+    cudaEvent_t start;
+    cudaEvent_t end;
+} cudaEventElapsedTime_v3020_params;
+
+typedef struct cudaImportExternalMemory_v10000_params_st {
+    cudaExternalMemory_t *extMem_out;
+    const struct cudaExternalMemoryHandleDesc *memHandleDesc;
+} cudaImportExternalMemory_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedBuffer_v10000_params_st {
+    void **devPtr;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryBufferDesc *bufferDesc;
+} cudaExternalMemoryGetMappedBuffer_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedMipmappedArray_v10000_params_st {
+    cudaMipmappedArray_t *mipmap;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc;
+} cudaExternalMemoryGetMappedMipmappedArray_v10000_params;
+
+typedef struct cudaDestroyExternalMemory_v10000_params_st {
+    cudaExternalMemory_t extMem;
+} cudaDestroyExternalMemory_v10000_params;
+
+typedef struct cudaImportExternalSemaphore_v10000_params_st {
+    cudaExternalSemaphore_t *extSem_out;
+    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc;
+} cudaImportExternalSemaphore_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaDestroyExternalSemaphore_v10000_params_st {
+    cudaExternalSemaphore_t extSem;
+} cudaDestroyExternalSemaphore_v10000_params;
+
+typedef struct cudaLaunchKernel_ptsz_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_ptsz_v7000_params;
+
+typedef struct cudaLaunchKernelExC_ptsz_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_ptsz_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_ptsz_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_ptsz_v9000_params;
+
+typedef struct cudaLaunchCooperativeKernelMultiDevice_v9000_params_st {
+    struct cudaLaunchParams *launchParamsList;
+    unsigned int numDevices;
+    unsigned int flags;
+} cudaLaunchCooperativeKernelMultiDevice_v9000_params;
+
+typedef struct cudaFuncSetCacheConfig_v3020_params_st {
+    const void *func;
+    enum cudaFuncCache cacheConfig;
+} cudaFuncSetCacheConfig_v3020_params;
+
+typedef struct cudaFuncSetSharedMemConfig_v4020_params_st {
+    const void *func;
+    enum cudaSharedMemConfig config;
+} cudaFuncSetSharedMemConfig_v4020_params;
+
+typedef struct cudaFuncGetAttributes_v3020_params_st {
+    struct cudaFuncAttributes *attr;
+    const void *func;
+} cudaFuncGetAttributes_v3020_params;
+
+typedef struct cudaFuncSetAttribute_v9000_params_st {
+    const void *func;
+    enum cudaFuncAttribute attr;
+    int value;
+} cudaFuncSetAttribute_v9000_params;
+
+typedef struct cudaLaunchHostFunc_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_ptsz_v10000_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params_st {
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params;
+
+typedef struct cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params_st {
+    size_t *dynamicSmemSize;
+    const void *func;
+    int numBlocks;
+    int blockSize;
+} cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params_st {
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params;
+
+typedef struct cudaOccupancyMaxPotentialClusterSize_v11070_params_st {
+    int *clusterSize;
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxPotentialClusterSize_v11070_params;
+
+typedef struct cudaOccupancyMaxActiveClusters_v11070_params_st {
+    int *numClusters;
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxActiveClusters_v11070_params;
+
+typedef struct cudaMallocManaged_v6000_params_st {
+    void **devPtr;
+    size_t size;
+    unsigned int flags;
+} cudaMallocManaged_v6000_params;
+
+typedef struct cudaMalloc_v3020_params_st {
+    void **devPtr;
+    size_t size;
+} cudaMalloc_v3020_params;
+
+typedef struct cudaMallocHost_v3020_params_st {
+    void **ptr;
+    size_t size;
+} cudaMallocHost_v3020_params;
+
+typedef struct cudaMallocPitch_v3020_params_st {
+    void **devPtr;
+    size_t *pitch;
+    size_t width;
+    size_t height;
+} cudaMallocPitch_v3020_params;
+
+typedef struct cudaMallocArray_v3020_params_st {
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    unsigned int flags;
+} cudaMallocArray_v3020_params;
+
+typedef struct cudaFree_v3020_params_st {
+    void *devPtr;
+} cudaFree_v3020_params;
+
+typedef struct cudaFreeHost_v3020_params_st {
+    void *ptr;
+} cudaFreeHost_v3020_params;
+
+typedef struct cudaFreeArray_v3020_params_st {
+    cudaArray_t array;
+} cudaFreeArray_v3020_params;
+
+typedef struct cudaFreeMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t mipmappedArray;
+} cudaFreeMipmappedArray_v5000_params;
+
+typedef struct cudaHostAlloc_v3020_params_st {
+    void **pHost;
+    size_t size;
+    unsigned int flags;
+} cudaHostAlloc_v3020_params;
+
+typedef struct cudaHostRegister_v4000_params_st {
+    void *ptr;
+    size_t size;
+    unsigned int flags;
+} cudaHostRegister_v4000_params;
+
+typedef struct cudaHostUnregister_v4000_params_st {
+    void *ptr;
+} cudaHostUnregister_v4000_params;
+
+typedef struct cudaHostGetDevicePointer_v3020_params_st {
+    void **pDevice;
+    void *pHost;
+    unsigned int flags;
+} cudaHostGetDevicePointer_v3020_params;
+
+typedef struct cudaHostGetFlags_v3020_params_st {
+    unsigned int *pFlags;
+    void *pHost;
+} cudaHostGetFlags_v3020_params;
+
+typedef struct cudaMalloc3D_v3020_params_st {
+    struct cudaPitchedPtr *pitchedDevPtr;
+    struct cudaExtent extent;
+} cudaMalloc3D_v3020_params;
+
+typedef struct cudaMalloc3DArray_v3020_params_st {
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;
+    unsigned int flags;
+} cudaMalloc3DArray_v3020_params;
+
+typedef struct cudaMallocMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;
+    unsigned int numLevels;
+    unsigned int flags;
+} cudaMallocMipmappedArray_v5000_params;
+
+typedef struct cudaGetMipmappedArrayLevel_v5000_params_st {
+    cudaArray_t *levelArray;
+    cudaMipmappedArray_const_t mipmappedArray;
+    unsigned int level;
+} cudaGetMipmappedArrayLevel_v5000_params;
+
+typedef struct cudaMemcpy3D_ptds_v7000_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DPeer_ptds_v7000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DAsync_ptsz_v7000_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy3DPeerAsync_ptsz_v7000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_ptsz_v7000_params;
+
+typedef struct cudaMemGetInfo_v3020_params_st {
+    size_t *free;
+    size_t *total;
+} cudaMemGetInfo_v3020_params;
+
+typedef struct cudaArrayGetInfo_v4010_params_st {
+    struct cudaChannelFormatDesc *desc;
+    struct cudaExtent *extent;
+    unsigned int *flags;
+    cudaArray_t array;
+} cudaArrayGetInfo_v4010_params;
+
+typedef struct cudaArrayGetPlane_v11020_params_st {
+    cudaArray_t *pPlaneArray;
+    cudaArray_t hArray;
+    unsigned int planeIdx;
+} cudaArrayGetPlane_v11020_params;
+
+typedef struct cudaArrayGetMemoryRequirements_v11060_params_st {
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaArray_t array;
+    int device;
+} cudaArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaMipmappedArrayGetMemoryRequirements_v11060_params_st {
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaMipmappedArray_t mipmap;
+    int device;
+} cudaMipmappedArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaArrayGetSparseProperties_v11010_params_st {
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaArray_t array;
+} cudaArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMipmappedArrayGetSparseProperties_v11010_params_st {
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaMipmappedArray_t mipmap;
+} cudaMipmappedArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMemcpy_ptds_v7000_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_ptds_v7000_params;
+
+typedef struct cudaMemcpyPeer_v4000_params_st {
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+} cudaMemcpyPeer_v4000_params;
+
+typedef struct cudaMemcpy2D_ptds_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DFromArray_ptds_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToSymbol_ptds_v7000_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromSymbol_ptds_v7000_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyAsync_ptsz_v7000_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyPeerAsync_v4000_params_st {
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemcpyPeerAsync_v4000_params;
+
+typedef struct cudaMemcpy2DAsync_ptsz_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyToSymbolAsync_ptsz_v7000_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_ptsz_v7000_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset_ptds_v7000_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_ptds_v7000_params;
+
+typedef struct cudaMemset2D_ptds_v7000_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_ptds_v7000_params;
+
+typedef struct cudaMemset3D_ptds_v7000_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_ptds_v7000_params;
+
+typedef struct cudaMemsetAsync_ptsz_v7000_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset2DAsync_ptsz_v7000_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset3DAsync_ptsz_v7000_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_ptsz_v7000_params;
+
+typedef struct cudaGetSymbolAddress_v3020_params_st {
+    void **devPtr;
+    const void *symbol;
+} cudaGetSymbolAddress_v3020_params;
+
+typedef struct cudaGetSymbolSize_v3020_params_st {
+    size_t *size;
+    const void *symbol;
+} cudaGetSymbolSize_v3020_params;
+
+typedef struct cudaMemPrefetchAsync_ptsz_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_ptsz_v8000_params;
+
+typedef struct cudaMemAdvise_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    int device;
+} cudaMemAdvise_v8000_params;
+
+typedef struct cudaMemRangeGetAttribute_v8000_params_st {
+    void *data;
+    size_t dataSize;
+    enum cudaMemRangeAttribute attribute;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttribute_v8000_params;
+
+typedef struct cudaMemRangeGetAttributes_v8000_params_st {
+    void **data;
+    size_t *dataSizes;
+    enum cudaMemRangeAttribute *attributes;
+    size_t numAttributes;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttributes_v8000_params;
+
+typedef struct cudaMemcpyToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromArray_ptds_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMallocAsync_ptsz_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_ptsz_v11020_params;
+
+typedef struct cudaFreeAsync_ptsz_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolTrimTo_v11020_params_st {
+    cudaMemPool_t memPool;
+    size_t minBytesToKeep;
+} cudaMemPoolTrimTo_v11020_params;
+
+typedef struct cudaMemPoolSetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolSetAttribute_v11020_params;
+
+typedef struct cudaMemPoolGetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolGetAttribute_v11020_params;
+
+typedef struct cudaMemPoolSetAccess_v11020_params_st {
+    cudaMemPool_t memPool;
+    const struct cudaMemAccessDesc *descList;
+    size_t count;
+} cudaMemPoolSetAccess_v11020_params;
+
+typedef struct cudaMemPoolGetAccess_v11020_params_st {
+    enum cudaMemAccessFlags *flags;
+    cudaMemPool_t memPool;
+    struct cudaMemLocation *location;
+} cudaMemPoolGetAccess_v11020_params;
+
+typedef struct cudaMemPoolCreate_v11020_params_st {
+    cudaMemPool_t *memPool;
+    const struct cudaMemPoolProps *poolProps;
+} cudaMemPoolCreate_v11020_params;
+
+typedef struct cudaMemPoolDestroy_v11020_params_st {
+    cudaMemPool_t memPool;
+} cudaMemPoolDestroy_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_ptsz_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolExportToShareableHandle_v11020_params_st {
+    void *shareableHandle;
+    cudaMemPool_t memPool;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolExportToShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolImportFromShareableHandle_v11020_params_st {
+    cudaMemPool_t *memPool;
+    void *shareableHandle;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolImportFromShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolExportPointer_v11020_params_st {
+    struct cudaMemPoolPtrExportData *exportData;
+    void *ptr;
+} cudaMemPoolExportPointer_v11020_params;
+
+typedef struct cudaMemPoolImportPointer_v11020_params_st {
+    void **ptr;
+    cudaMemPool_t memPool;
+    struct cudaMemPoolPtrExportData *exportData;
+} cudaMemPoolImportPointer_v11020_params;
+
+typedef struct cudaPointerGetAttributes_v4000_params_st {
+    struct cudaPointerAttributes *attributes;
+    const void *ptr;
+} cudaPointerGetAttributes_v4000_params;
+
+typedef struct cudaDeviceCanAccessPeer_v4000_params_st {
+    int *canAccessPeer;
+    int device;
+    int peerDevice;
+} cudaDeviceCanAccessPeer_v4000_params;
+
+typedef struct cudaDeviceEnablePeerAccess_v4000_params_st {
+    int peerDevice;
+    unsigned int flags;
+} cudaDeviceEnablePeerAccess_v4000_params;
+
+typedef struct cudaDeviceDisablePeerAccess_v4000_params_st {
+    int peerDevice;
+} cudaDeviceDisablePeerAccess_v4000_params;
+
+typedef struct cudaGraphicsUnregisterResource_v3020_params_st {
+    cudaGraphicsResource_t resource;
+} cudaGraphicsUnregisterResource_v3020_params;
+
+typedef struct cudaGraphicsResourceSetMapFlags_v3020_params_st {
+    cudaGraphicsResource_t resource;
+    unsigned int flags;
+} cudaGraphicsResourceSetMapFlags_v3020_params;
+
+typedef struct cudaGraphicsMapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsMapResources_v3020_params;
+
+typedef struct cudaGraphicsUnmapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsUnmapResources_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedPointer_v3020_params_st {
+    void **devPtr;
+    size_t *size;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedPointer_v3020_params;
+
+typedef struct cudaGraphicsSubResourceGetMappedArray_v3020_params_st {
+    cudaArray_t *array;
+    cudaGraphicsResource_t resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cudaGraphicsSubResourceGetMappedArray_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedMipmappedArray_v5000_params;
+
+typedef struct cudaBindTexture_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t size;
+} cudaBindTexture_v3020_params;
+
+typedef struct cudaBindTexture2D_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    size_t pitch;
+} cudaBindTexture2D_v3020_params;
+
+typedef struct cudaBindTextureToArray_v3020_params_st {
+    const struct textureReference *texref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToArray_v3020_params;
+
+typedef struct cudaBindTextureToMipmappedArray_v5000_params_st {
+    const struct textureReference *texref;
+    cudaMipmappedArray_const_t mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToMipmappedArray_v5000_params;
+
+typedef struct cudaUnbindTexture_v3020_params_st {
+    const struct textureReference *texref;
+} cudaUnbindTexture_v3020_params;
+
+typedef struct cudaGetTextureAlignmentOffset_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+} cudaGetTextureAlignmentOffset_v3020_params;
+
+typedef struct cudaGetTextureReference_v3020_params_st {
+    const struct textureReference **texref;
+    const void *symbol;
+} cudaGetTextureReference_v3020_params;
+
+typedef struct cudaBindSurfaceToArray_v3020_params_st {
+    const struct surfaceReference *surfref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindSurfaceToArray_v3020_params;
+
+typedef struct cudaGetSurfaceReference_v3020_params_st {
+    const struct surfaceReference **surfref;
+    const void *symbol;
+} cudaGetSurfaceReference_v3020_params;
+
+typedef struct cudaGetChannelDesc_v3020_params_st {
+    struct cudaChannelFormatDesc *desc;
+    cudaArray_const_t array;
+} cudaGetChannelDesc_v3020_params;
+
+typedef struct cudaCreateChannelDesc_v3020_params_st {
+    int x;
+    int y;
+    int z;
+    int w;
+    enum cudaChannelFormatKind f;
+} cudaCreateChannelDesc_v3020_params;
+
+typedef struct cudaCreateTextureObject_v5000_params_st {
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v5000_params;
+
+typedef struct cudaDestroyTextureObject_v5000_params_st {
+    cudaTextureObject_t texObject;
+} cudaDestroyTextureObject_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v5000_params_st {
+    struct cudaTextureDesc *pTexDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceViewDesc_v5000_params_st {
+    struct cudaResourceViewDesc *pResViewDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceViewDesc_v5000_params;
+
+typedef struct cudaCreateSurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t *pSurfObject;
+    const struct cudaResourceDesc *pResDesc;
+} cudaCreateSurfaceObject_v5000_params;
+
+typedef struct cudaDestroySurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t surfObject;
+} cudaDestroySurfaceObject_v5000_params;
+
+typedef struct cudaGetSurfaceObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaSurfaceObject_t surfObject;
+} cudaGetSurfaceObjectResourceDesc_v5000_params;
+
+typedef struct cudaDriverGetVersion_v3020_params_st {
+    int *driverVersion;
+} cudaDriverGetVersion_v3020_params;
+
+typedef struct cudaRuntimeGetVersion_v3020_params_st {
+    int *runtimeVersion;
+} cudaRuntimeGetVersion_v3020_params;
+
+typedef struct cudaGraphCreate_v10000_params_st {
+    cudaGraph_t *pGraph;
+    unsigned int flags;
+} cudaGraphCreate_v10000_params;
+
+typedef struct cudaGraphAddKernelNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphAddKernelNode_v10000_params;
+
+typedef struct cudaGraphKernelNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeGetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeSetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeCopyAttributes_v11000_params_st {
+    cudaGraphNode_t hSrc;
+    cudaGraphNode_t hDst;
+} cudaGraphKernelNodeCopyAttributes_v11000_params;
+
+typedef struct cudaGraphKernelNodeGetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    cudaKernelNodeAttrValue *value_out;
+} cudaGraphKernelNodeGetAttribute_v11000_params;
+
+typedef struct cudaGraphKernelNodeSetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    const cudaKernelNodeAttrValue *value;
+} cudaGraphKernelNodeSetAttribute_v11000_params;
+
+typedef struct cudaGraphAddMemcpyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemcpy3DParms *pCopyParams;
+} cudaGraphAddMemcpyNode_v10000_params;
+
+typedef struct cudaGraphAddMemcpyNodeToSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeToSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNodeFromSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeFromSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNode1D_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNode1D_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeSetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphAddMemsetNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemsetParams *pMemsetParams;
+} cudaGraphAddMemsetNode_v10000_params;
+
+typedef struct cudaGraphMemsetNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemsetNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddHostNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphAddHostNode_v10000_params;
+
+typedef struct cudaGraphHostNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeGetParams_v10000_params;
+
+typedef struct cudaGraphHostNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddChildGraphNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaGraph_t childGraph;
+} cudaGraphAddChildGraphNode_v10000_params;
+
+typedef struct cudaGraphChildGraphNodeGetGraph_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraph_t *pGraph;
+} cudaGraphChildGraphNodeGetGraph_v10000_params;
+
+typedef struct cudaGraphAddEmptyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+} cudaGraphAddEmptyNode_v10000_params;
+
+typedef struct cudaGraphAddEventRecordNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventRecordNode_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventRecordNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddEventWaitNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventWaitNode_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventWaitNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddExternalSemaphoresSignalNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresSignalNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreSignalNodeParams *params_out;
+} cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddExternalSemaphoresWaitNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresWaitNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreWaitNodeParams *params_out;
+} cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddMemAllocNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaMemAllocNodeParams *nodeParams;
+} cudaGraphAddMemAllocNode_v11040_params;
+
+typedef struct cudaGraphMemAllocNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemAllocNodeParams *params_out;
+} cudaGraphMemAllocNodeGetParams_v11040_params;
+
+typedef struct cudaGraphAddMemFreeNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dptr;
+} cudaGraphAddMemFreeNode_v11040_params;
+
+typedef struct cudaGraphMemFreeNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    void *dptr_out;
+} cudaGraphMemFreeNodeGetParams_v11040_params;
+
+typedef struct cudaDeviceGraphMemTrim_v11040_params_st {
+    int device;
+} cudaDeviceGraphMemTrim_v11040_params;
+
+typedef struct cudaDeviceGetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceGetGraphMemAttribute_v11040_params;
+
+typedef struct cudaDeviceSetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceSetGraphMemAttribute_v11040_params;
+
+typedef struct cudaGraphClone_v10000_params_st {
+    cudaGraph_t *pGraphClone;
+    cudaGraph_t originalGraph;
+} cudaGraphClone_v10000_params;
+
+typedef struct cudaGraphNodeFindInClone_v10000_params_st {
+    cudaGraphNode_t *pNode;
+    cudaGraphNode_t originalNode;
+    cudaGraph_t clonedGraph;
+} cudaGraphNodeFindInClone_v10000_params;
+
+typedef struct cudaGraphNodeGetType_v10000_params_st {
+    cudaGraphNode_t node;
+    enum cudaGraphNodeType *pType;
+} cudaGraphNodeGetType_v10000_params;
+
+typedef struct cudaGraphGetNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *nodes;
+    size_t *numNodes;
+} cudaGraphGetNodes_v10000_params;
+
+typedef struct cudaGraphGetRootNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *pRootNodes;
+    size_t *pNumRootNodes;
+} cudaGraphGetRootNodes_v10000_params;
+
+typedef struct cudaGraphGetEdges_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    size_t *numEdges;
+} cudaGraphGetEdges_v10000_params;
+
+typedef struct cudaGraphNodeGetDependencies_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v10000_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v10000_params;
+
+typedef struct cudaGraphAddDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v10000_params;
+
+typedef struct cudaGraphRemoveDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v10000_params;
+
+typedef struct cudaGraphDestroyNode_v10000_params_st {
+    cudaGraphNode_t node;
+} cudaGraphDestroyNode_v10000_params;
+
+typedef struct cudaGraphInstantiate_v10000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphNode_t *pErrorNode;
+    char *pLogBuffer;
+    size_t bufferSize;
+} cudaGraphInstantiate_v10000_params;
+
+typedef struct cudaGraphInstantiateWithFlags_v11040_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiateWithFlags_v11040_params;
+
+typedef struct cudaGraphExecKernelNodeSetParams_v10010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphExecKernelNodeSetParams_v10010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphExecMemcpyNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphExecMemsetNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphExecMemsetNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecHostNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphExecHostNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecChildGraphNodeSetParams_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    cudaGraph_t childGraph;
+} cudaGraphExecChildGraphNodeSetParams_v11010_params;
+
+typedef struct cudaGraphExecEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphNodeSetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int isEnabled;
+} cudaGraphNodeSetEnabled_v11060_params;
+
+typedef struct cudaGraphNodeGetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int *isEnabled;
+} cudaGraphNodeGetEnabled_v11060_params;
+
+typedef struct cudaGraphExecUpdate_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraph_t hGraph;
+    cudaGraphNode_t *hErrorNode_out;
+    enum cudaGraphExecUpdateResult *updateResult_out;
+} cudaGraphExecUpdate_v10020_params;
+
+typedef struct cudaGraphUpload_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_ptsz_v10000_params;
+
+typedef struct cudaGraphLaunch_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_ptsz_v10000_params;
+
+typedef struct cudaGraphExecDestroy_v10000_params_st {
+    cudaGraphExec_t graphExec;
+} cudaGraphExecDestroy_v10000_params;
+
+typedef struct cudaGraphDestroy_v10000_params_st {
+    cudaGraph_t graph;
+} cudaGraphDestroy_v10000_params;
+
+typedef struct cudaGraphDebugDotPrint_v11030_params_st {
+    cudaGraph_t graph;
+    const char *path;
+    unsigned int flags;
+} cudaGraphDebugDotPrint_v11030_params;
+
+typedef struct cudaUserObjectCreate_v11030_params_st {
+    cudaUserObject_t *object_out;
+    void *ptr;
+    cudaHostFn_t destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cudaUserObjectCreate_v11030_params;
+
+typedef struct cudaUserObjectRetain_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRetain_v11030_params;
+
+typedef struct cudaUserObjectRelease_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRelease_v11030_params;
+
+typedef struct cudaGraphRetainUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+    unsigned int flags;
+} cudaGraphRetainUserObject_v11030_params;
+
+typedef struct cudaGraphReleaseUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaGraphReleaseUserObject_v11030_params;
+
+typedef struct cudaGetDriverEntryPoint_ptsz_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+} cudaGetDriverEntryPoint_ptsz_v11030_params;
+
+typedef struct cudaGetFuncBySymbol_v11000_params_st {
+    cudaFunction_t *functionPtr;
+    const void *symbolPtr;
+} cudaGetFuncBySymbol_v11000_params;
+
+typedef struct cudaMemcpy_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_v3020_params;
+
+typedef struct cudaMemcpyToSymbol_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_v3020_params;
+
+typedef struct cudaMemcpyFromSymbol_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_v3020_params;
+
+typedef struct cudaMemcpy2D_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_v3020_params;
+
+typedef struct cudaMemcpyToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_v3020_params;
+
+typedef struct cudaMemcpy2DToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_v3020_params;
+
+typedef struct cudaMemcpyFromArray_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_v3020_params;
+
+typedef struct cudaMemcpy2DFromArray_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_v3020_params;
+
+typedef struct cudaMemcpyArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy2DArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy3D_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_v3020_params;
+
+typedef struct cudaMemcpy3DPeer_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_v4000_params;
+
+typedef struct cudaMemset_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_v3020_params;
+
+typedef struct cudaMemset2D_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_v3020_params;
+
+typedef struct cudaMemset3D_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_v3020_params;
+
+typedef struct cudaMemcpyAsync_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_v3020_params;
+
+typedef struct cudaMemcpyToSymbolAsync_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpy2DAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_v3020_params;
+
+typedef struct cudaMemcpyToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpyFromArrayAsync_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy3DAsync_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_v3020_params;
+
+typedef struct cudaMemcpy3DPeerAsync_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_v4000_params;
+
+typedef struct cudaMemsetAsync_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_v3020_params;
+
+typedef struct cudaMemset2DAsync_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_v3020_params;
+
+typedef struct cudaMemset3DAsync_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_v3020_params;
+
+typedef struct cudaStreamQuery_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_v3020_params;
+
+typedef struct cudaStreamGetFlags_v5050_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_v5050_params;
+
+typedef struct cudaStreamGetPriority_v5050_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_v5050_params;
+
+typedef struct cudaEventRecord_v3020_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_v3020_params;
+
+typedef struct cudaEventRecordWithFlags_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_v11010_params;
+
+typedef struct cudaStreamWaitEvent_v3020_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_v3020_params;
+
+typedef struct cudaStreamAddCallback_v5000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_v5000_params;
+
+typedef struct cudaStreamAttachMemAsync_v6000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_v6000_params;
+
+typedef struct cudaStreamSynchronize_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_v3020_params;
+
+typedef struct cudaLaunchKernel_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_v7000_params;
+
+typedef struct cudaLaunchKernelExC_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_v9000_params;
+
+typedef struct cudaLaunchHostFunc_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_v10000_params;
+
+typedef struct cudaMemPrefetchAsync_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v8000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaGraphUpload_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_v10000_params;
+
+typedef struct cudaGraphLaunch_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_v10000_params;
+
+typedef struct cudaStreamBeginCapture_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_v10000_params;
+
+typedef struct cudaStreamEndCapture_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_v10000_params;
+
+typedef struct cudaStreamIsCapturing_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_ptsz_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_ptsz_v11030_params;
+
+typedef struct cudaStreamCopyAttributes_v11000_params_st {
+    cudaStream_t dstStream;
+    cudaStream_t srcStream;
+} cudaStreamCopyAttributes_v11000_params;
+
+typedef struct cudaStreamGetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value;
+} cudaStreamGetAttribute_v11000_params;
+
+typedef struct cudaStreamSetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *param;
+} cudaStreamSetAttribute_v11000_params;
+
+typedef struct cudaMallocAsync_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_v11020_params;
+
+typedef struct cudaFreeAsync_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_v11020_params;
+
+typedef struct cudaGetDriverEntryPoint_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+} cudaGetDriverEntryPoint_v11030_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..88e79d1957925c4bbacd381e9461d5072de88f24
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h
@@ -0,0 +1,38 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_vdpau_interop.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaVDPAUGetDevice_v3020_params_st {
+    int *device;  // pointer to int (the Set variant below takes a plain int); presumably an output - TODO confirm
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUGetDevice_v3020_params;  // holds the parameters of cudaVDPAUGetDevice (per the type name and the section banner)
+
+typedef struct cudaVDPAUSetVDPAUDevice_v3020_params_st {
+    int device;  // passed by value here (the Get variant above uses int *)
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUSetVDPAUDevice_v3020_params;  // holds the parameters of cudaVDPAUSetVDPAUDevice (per the type name and the section banner)
+
+typedef struct cudaGraphicsVDPAURegisterVideoSurface_v3020_params_st {
+    struct cudaGraphicsResource **resource;  // pointer to a resource pointer; presumably receives the registered resource - TODO confirm
+    VdpVideoSurface vdpSurface;  // video surface handle (the OutputSurface variant uses VdpOutputSurface)
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterVideoSurface_v3020_params;  // holds the parameters of cudaGraphicsVDPAURegisterVideoSurface (per the type name)
+
+typedef struct cudaGraphicsVDPAURegisterOutputSurface_v3020_params_st {
+    struct cudaGraphicsResource **resource;  // pointer to a resource pointer; presumably receives the registered resource - TODO confirm
+    VdpOutputSurface vdpSurface;  // output surface handle (the VideoSurface variant uses VdpVideoSurface)
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterOutputSurface_v3020_params;  // holds the parameters of cudaGraphicsVDPAURegisterOutputSurface (per the type name)
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e24cfcbba4d93b57f15a4bd60fbe60a99b493f66
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer.h
@@ -0,0 +1,571 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_cnn_infer : cuDNN's basic definitions and inference CNN functions.
+ */
+
+#if !defined(CUDNN_CNN_INFER_H_)
+#define CUDNN_CNN_INFER_H_
+
+#pragma once
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops_infer.h"
+
+/* Version of this header (autogenerated, do not edit manually); checked against CUDNN_MAJOR/MINOR/PATCHLEVEL below. */
+#define CUDNN_CNN_INFER_MAJOR 8
+#define CUDNN_CNN_INFER_MINOR 7
+#define CUDNN_CNN_INFER_PATCH 0
+
+#if (CUDNN_CNN_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_INFER_MINOR != CUDNN_MINOR) || \
+    (CUDNN_CNN_INFER_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN CNN INFER!!!
+#endif /* any differing component stops compilation with the #error above */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t; /* pointer to a struct that this header never defines (used as an opaque handle) */
+
+/*
+ *  convolution mode: passed as the `mode` argument of the convolution descriptor setters/getters below
+ */
+typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
+
+/*
+ * CUDNN Reorder: value type used by cudnnSet/GetConvolutionReorderType and cudnnReorderFilterAndBias below
+ */
+typedef enum {
+    CUDNN_DEFAULT_REORDER = 0,
+    CUDNN_NO_REORDER      = 1,
+} cudnnReorderType_t;
+
+typedef struct cudnnConvolutionFwdAlgoPerfStruct {
+    cudnnConvolutionFwdAlgo_t algo;
+    cudnnStatus_t status;
+    float time;    /* units are not stated in this header - TODO confirm against the cuDNN docs */
+    size_t memory; /* presumably a workspace size in bytes - TODO confirm */
+    cudnnDeterminism_t determinism;
+    cudnnMathType_t mathType;
+    int reserved[3];
+} cudnnConvolutionFwdAlgoPerf_t; /* element type of the perfResults arrays in the forward Get/Find algorithm calls below */
+
+/* Create an instance of convolution descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc);
+
+/* Destroy an instance of convolution descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); /* math type passed by value */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); /* math type passed by pointer (presumably an output) */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); /* group count passed by value */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); /* group count passed by pointer (presumably an output) */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); /* reorder type passed by value */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); /* reorder type passed by pointer (presumably an output) */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int pad_h,      /* zero-padding height */
+                                int pad_w,      /* zero-padding width */
+                                int u,          /* vertical filter stride */
+                                int v,          /* horizontal filter stride */
+                                int dilation_h, /* filter dilation in the vertical dimension */
+                                int dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int *pad_h,      /* zero-padding height */
+                                int *pad_w,      /* zero-padding width */
+                                int *u,          /* vertical filter stride */
+                                int *v,          /* horizontal filter stride */
+                                int *dilation_h, /* filter dilation in the vertical dimension */
+                                int *dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLength, /* nbDims-2 size */
+                                const int padA[],
+                                const int filterStrideA[],
+                                const int dilationA[],
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType); /* convolution data type */
+
+/* Counterpart of cudnnSetConvolutionNdDescriptor: same fields as non-const pointers/arrays (presumably filled in by the call) */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLengthRequested,
+                                int *arrayLength,
+                                int padA[],
+                                int strideA[],
+                                int dilationA[],
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType); /* convolution data type */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int *n,
+                                      int *c,
+                                      int *h,
+                                      int *w); /* n, c, h, w: non-const int pointers, presumably the 2d output dimensions - TODO confirm */
+
+/* Helper function to return the dimensions of the output tensor given a convolution descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int nbDims,
+                                      int tensorOuputDimA[]); /* NOTE(review): parameter name is spelled "Ouput" in this declaration */
+
+/* Helper functions to provide the convolution forward algo that best fits the requirement */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t srcDesc,
+                                       const cudnnFilterDescriptor_t filterDesc,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t destDesc,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults); /* descriptors only; no data or workspace pointers */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
+                                     const cudnnTensorDescriptor_t xDesc,
+                                     const cudnnFilterDescriptor_t wDesc,
+                                     const cudnnConvolutionDescriptor_t convDesc,
+                                     const cudnnTensorDescriptor_t yDesc,
+                                     const int requestedAlgoCount,
+                                     int *returnedAlgoCount,
+                                     cudnnConvolutionFwdAlgoPerf_t *perfResults); /* descriptors only; no data or workspace pointers */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t xDesc,
+                                       const void *x,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t yDesc,
+                                       void *y,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults,
+                                       void *workSpace,
+                                       size_t workSpaceSizeInBytes); /* Ex variant: additionally takes x/w/y data pointers and a caller-supplied workspace */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnIm2Col(cudnnHandle_t handle,
+            const cudnnTensorDescriptor_t xDesc,
+            const void *x,
+            const cudnnFilterDescriptor_t wDesc,
+            const cudnnConvolutionDescriptor_t convDesc,
+            void *colBuffer); /* colBuffer is the only non-const data pointer (presumably the output) */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnReorderFilterAndBias(cudnnHandle_t handle,
+                          const cudnnFilterDescriptor_t filterDesc,
+                          cudnnReorderType_t reorderType,
+                          const void *filterData,
+                          void *reorderedFilterData,
+                          int reorderBias, /* int flag; presumably non-zero enables bias reordering - TODO confirm */
+                          const void *biasData,
+                          void *reorderedBiasData);
+
+/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const cudnnConvolutionDescriptor_t convDesc,
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        cudnnConvolutionFwdAlgo_t algo,
+                                        size_t *sizeInBytes); /* size is passed by pointer (size_t *) */
+
+/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform the forward pass for batch convolution */
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionForward(cudnnHandle_t handle,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnConvolutionDescriptor_t convDesc,
+                        cudnnConvolutionFwdAlgo_t algo,
+                        void *workSpace,
+                        size_t workSpaceSizeInBytes,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t yDesc,
+                        void *y); /* y and workSpace are the only non-const data pointers */
+
+/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
+                                      const void *alpha1,
+                                      const cudnnTensorDescriptor_t xDesc,
+                                      const void *x,
+                                      const cudnnFilterDescriptor_t wDesc,
+                                      const void *w,
+                                      const cudnnConvolutionDescriptor_t convDesc,
+                                      cudnnConvolutionFwdAlgo_t algo,
+                                      void *workSpace,
+                                      size_t workSpaceSizeInBytes,
+                                      const void *alpha2,
+                                      const cudnnTensorDescriptor_t zDesc,
+                                      const void *z,
+                                      const cudnnTensorDescriptor_t biasDesc,
+                                      const void *bias,
+                                      const cudnnActivationDescriptor_t activationDesc,
+                                      const cudnnTensorDescriptor_t yDesc,
+                                      void *y); /* y and workSpace are the only non-const data pointers */
+
+/* Helper functions to provide the convolution backward data algo that best fits the requirement */
+
+typedef struct cudnnConvolutionBwdDataAlgoPerfStruct {
+    cudnnConvolutionBwdDataAlgo_t algo;
+    cudnnStatus_t status;
+    float time;    /* units are not stated in this header - TODO confirm against the cuDNN docs */
+    size_t memory; /* presumably a workspace size in bytes - TODO confirm */
+    cudnnDeterminism_t determinism;
+    cudnnMathType_t mathType;
+    int reserved[3];
+} cudnnConvolutionBwdDataAlgoPerf_t; /* same member list as cudnnConvolutionFwdAlgoPerf_t except for the type of algo */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
+                                          const cudnnFilterDescriptor_t wDesc,
+                                          const cudnnTensorDescriptor_t dyDesc,
+                                          const cudnnConvolutionDescriptor_t convDesc,
+                                          const cudnnTensorDescriptor_t dxDesc,
+                                          const int requestedAlgoCount,
+                                          int *returnedAlgoCount,
+                                          cudnnConvolutionBwdDataAlgoPerf_t *perfResults); /* descriptors only; no data or workspace pointers */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t wDesc,
+                                            const void *w,
+                                            const cudnnTensorDescriptor_t dyDesc,
+                                            const void *dy,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t dxDesc,
+                                            void *dx,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
+                                            void *workSpace,
+                                            size_t workSpaceSizeInBytes); /* Ex variant: additionally takes w/dy/dx data pointers and a caller-supplied workspace */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t filterDesc,
+                                            const cudnnTensorDescriptor_t diffDesc,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t gradDesc,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults); /* descriptors only; no data or workspace pointers */
+
+/*
+ *  convolution algorithm (which requires potentially some workspace)
+ */
+
+/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
+                                             const cudnnFilterDescriptor_t wDesc,
+                                             const cudnnTensorDescriptor_t dyDesc,
+                                             const cudnnConvolutionDescriptor_t convDesc,
+                                             const cudnnTensorDescriptor_t dxDesc,
+                                             cudnnConvolutionBwdDataAlgo_t algo,
+                                             size_t *sizeInBytes); /* size is passed by pointer (size_t *) */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardData(cudnnHandle_t handle,
+                             const void *alpha,
+                             const cudnnFilterDescriptor_t wDesc,
+                             const void *w,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnConvolutionDescriptor_t convDesc,
+                             cudnnConvolutionBwdDataAlgo_t algo,
+                             void *workSpace,
+                             size_t workSpaceSizeInBytes,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx); /* dx and workSpace are the only non-const data pointers */
+
+/* Helper function to calculate folding descriptors for dgrad */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle,
+                                          const cudnnFilterDescriptor_t filterDesc,
+                                          const cudnnTensorDescriptor_t diffDesc,
+                                          const cudnnConvolutionDescriptor_t convDesc,
+                                          const cudnnTensorDescriptor_t gradDesc,
+                                          const cudnnTensorFormat_t transformFormat,
+                                          cudnnFilterDescriptor_t foldedFilterDesc, /* from here on the descriptors are non-const; presumably the ones this call fills in - TODO confirm */
+                                          cudnnTensorDescriptor_t paddedDiffDesc,
+                                          cudnnConvolutionDescriptor_t foldedConvDesc,
+                                          cudnnTensorDescriptor_t foldedGradDesc,
+                                          cudnnTensorTransformDescriptor_t filterFoldTransDesc,
+                                          cudnnTensorTransformDescriptor_t diffPadTransDesc,
+                                          cudnnTensorTransformDescriptor_t gradFoldTransDesc,
+                                          cudnnTensorTransformDescriptor_t gradUnfoldTransDesc);
+
+/* cudnnFusedOps...: forward-declared structs and pointer typedefs; no struct body appears in this header */
+struct cudnnFusedOpsConstParamStruct;
+typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t;
+
+struct cudnnFusedOpsVariantParamStruct;
+typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t;
+
+struct cudnnFusedOpsPlanStruct;
+typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t;
+
+typedef enum {
+    /* each op in [ ] can be disabled by passing NULL ptr */
+    /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */
+    CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
+    /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */
+    CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
+    /* utility for BN training in BN-conv fusion */
+    /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */
+    /* optionally update running stats and generate saved stats */
+    CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
+    /* utility for BN inference in BN-conv fusion */
+    /* computes the equivalent scale and bias from learned running stats and learned scale, bias */
+    CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
+    /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */
+    CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
+    /* reserved for future use: [per channel scale], [per channel bias], [residual add],  activation, bitmask */
+    CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
+    /* reserved for future use */
+    CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
+} cudnnFusedOps_t; /* fused-op selector, values 0-6; entries 4-6 are marked "reserved for future use" above */
+
+typedef enum {
+    /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get XDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_XDESC = 0,
+    /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
+    /* set/get BN_MODE: pass cudnnBatchNormMode_t* */
+    CUDNN_PARAM_BN_MODE = 2,
+    /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
+    /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
+    /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
+    /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */
+    /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */
+    CUDNN_PARAM_ACTIVATION_DESC = 6,
+    /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */
+    /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */
+    CUDNN_PARAM_CONV_DESC = 7,
+    /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */
+    /* get WDESC: pass previously created cudnnFilterDescriptor_t */
+    CUDNN_PARAM_WDESC = 8,
+    /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
+    /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */
+    /* get DWDESC: pass previously created cudnnFilterDescriptor_t */
+    CUDNN_PARAM_DWDESC = 10,
+    /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
+    /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get YDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_YDESC = 12,
+    /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
+    /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get DYDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_DYDESC = 14,
+    /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
+    /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_YSTATS_DESC = 16,
+    /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
+    /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
+    /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
+    /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
+    /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
+    /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
+    /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
+    /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
+    /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
+
+    /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get ZDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_ZDESC = 26,
+    /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
+    /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
+    /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
+    /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
+
+    /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
+    /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
+
+    /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get DXDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_DXDESC = 33,
+    /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
+    /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */
+    /* get DZDESC: pass previously created cudnnTensorDescriptor_t */
+    CUDNN_PARAM_DZDESC = 35,
+    /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
+    /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
+    /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
+    CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
+} cudnnFusedOpsConstParamLabel_t; /* labels 0-38; each entry's comment names the argument type to pass for set/get */
+
+typedef enum {
+    CUDNN_PTR_NULL         = 0, /* presumably: the pointer will be NULL - TODO confirm */
+    CUDNN_PTR_ELEM_ALIGNED = 1, /* presumably: pointer aligned to the element size - TODO confirm */
+    CUDNN_PTR_16B_ALIGNED  = 2, /* presumably: pointer aligned to 16 bytes - TODO confirm */
+} cudnnFusedOpsPointerPlaceHolder_t; /* the type passed (by pointer) for the *_PLACEHOLDER labels of cudnnFusedOpsConstParamLabel_t */
+
+typedef enum {
+    /* set: pass void* pointing to dev memory */
+    /* get: pass void** pointing to host memory */
+    CUDNN_PTR_XDATA              = 0,
+    CUDNN_PTR_BN_EQSCALE         = 1,
+    CUDNN_PTR_BN_EQBIAS          = 2,
+    CUDNN_PTR_WDATA              = 3,
+    CUDNN_PTR_DWDATA             = 4,
+    CUDNN_PTR_YDATA              = 5,
+    CUDNN_PTR_DYDATA             = 6,
+    CUDNN_PTR_YSUM               = 7,
+    CUDNN_PTR_YSQSUM             = 8,
+    CUDNN_PTR_WORKSPACE          = 9,
+    CUDNN_PTR_BN_SCALE           = 10,
+    CUDNN_PTR_BN_BIAS            = 11,
+    CUDNN_PTR_BN_SAVED_MEAN      = 12,
+    CUDNN_PTR_BN_SAVED_INVSTD    = 13,
+    CUDNN_PTR_BN_RUNNING_MEAN    = 14,
+    CUDNN_PTR_BN_RUNNING_VAR     = 15,
+    CUDNN_PTR_ZDATA              = 16,
+    CUDNN_PTR_BN_Z_EQSCALE       = 17,
+    CUDNN_PTR_BN_Z_EQBIAS        = 18,
+    CUDNN_PTR_ACTIVATION_BITMASK = 19,
+    CUDNN_PTR_DXDATA             = 20,
+    CUDNN_PTR_DZDATA             = 21,
+    CUDNN_PTR_BN_DSCALE          = 22,
+    CUDNN_PTR_BN_DBIAS           = 23,
+
+    /* set/get: pass size_t* pointing to host memory */
+    CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
+    /* set/get: pass int64_t* pointing to host memory */
+    CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
+    /* set/get: pass double* pointing to host memory */
+    CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
+    /* set/get: pass double* pointing to host memory */
+    CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
+} cudnnFusedOpsVariantParamLabel_t; /* 0-23: CUDNN_PTR_* labels (the leading set/get note presumably covers all of them); 100-103: host scalars */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCnnInferVersionCheck(void); /* takes no arguments; result is reported as a cudnnStatus_t */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_CNN_INFER_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..274be325726f20b1c7a306c6ef7184475a28a07b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f12b4e1ea68c5a186d73b5d943d2cba0218312f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h
@@ -0,0 +1,454 @@
+
+ /* Copyright 2005-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cufftw.h
+* \brief Public header file for the NVIDIA CUDA FFTW library (CUFFTW)
+*/
+
+#ifndef _CUFFTW_H_
+#define _CUFFTW_H_
+
+
+#include <stdio.h>
+#include "cufft.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// transform direction
+#define FFTW_FORWARD -1 // presumably passed as the `sign` argument of the plan functions below -- confirm
+#define FFTW_INVERSE  1 // same value as FFTW_BACKWARD
+#define FFTW_BACKWARD 1 // same value as FFTW_INVERSE
+
+// Planner flags
+
+#define FFTW_ESTIMATE           0x01 // the planner values here run consecutively from 0x01 to 0x05
+#define FFTW_MEASURE            0x02
+#define FFTW_PATIENT            0x03 // numerically equal to FFTW_ESTIMATE | FFTW_MEASURE, so not an independent bit
+#define FFTW_EXHAUSTIVE         0x04
+#define FFTW_WISDOM_ONLY        0x05 // numerically equal to FFTW_ESTIMATE | FFTW_EXHAUSTIVE
+
+//Algorithm restriction flags
+
+#define FFTW_DESTROY_INPUT      0x08
+#define FFTW_PRESERVE_INPUT     0x0C // 0x08 | 0x04: shares bit 0x08 with FFTW_DESTROY_INPUT and bit 0x04 with FFTW_EXHAUSTIVE
+#define FFTW_UNALIGNED          0x10
+    
+// CUFFTW defines and supports the following data types
+
+// note if complex.h has been included we use the C99 complex types
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined (complex) // C99 path; define FFTW_NO_Complex to opt out
+  typedef double _Complex fftw_complex;  // double-precision element type
+  typedef float _Complex fftwf_complex;  // single-precision element type
+#else // fallback: each complex value is a two-element array
+  typedef double fftw_complex[2];  // presumably [0] = real, [1] = imaginary, as in FFTW -- TODO confirm
+  typedef float fftwf_complex[2];  // single-precision counterpart of the array form
+#endif
+
+typedef void *fftw_plan;
+
+typedef void *fftwf_plan;
+
+typedef struct { // per-dimension descriptor taken (as `dims` / `batch_dims`) by the fftw_plan_guru_* functions
+    int n;  // presumably the extent of the dimension, as in FFTW's guru interface -- TODO confirm
+    int is; // presumably the input stride -- TODO confirm
+    int os; // presumably the output stride -- TODO confirm
+} fftw_iodim;
+
+typedef fftw_iodim fftwf_iodim;
+    
+typedef struct { // ptrdiff_t-field counterpart of fftw_iodim, taken by the fftw_plan_guru64_* functions
+    ptrdiff_t n;  // presumably the extent of the dimension -- TODO confirm
+    ptrdiff_t is; // presumably the input stride -- TODO confirm
+    ptrdiff_t os; // presumably the output stride -- TODO confirm
+} fftw_iodim64;
+
+typedef fftw_iodim64 fftwf_iodim64;
+    
+
+// CUFFTW defines and supports the following double precision APIs
+
+
+fftw_plan CUFFTAPI fftw_plan_dft_1d(int n, 
+                                    fftw_complex *in,
+                                    fftw_complex *out, 
+                                    int sign, 
+                                    unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_2d(int n0,
+                                    int n1, 
+                                    fftw_complex *in,
+                                    fftw_complex *out, 
+                                    int sign, 
+                                    unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_3d(int n0,
+                                    int n1,
+                                    int n2, 
+                                    fftw_complex *in,
+                                    fftw_complex *out, 
+                                    int sign, 
+                                    unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft(int rank,
+                                 const int *n,
+                                 fftw_complex *in,
+                                 fftw_complex *out, 
+                                 int sign, 
+                                 unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c_1d(int n, 
+                                        double *in,
+                                        fftw_complex *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c_2d(int n0,
+                                        int n1, 
+                                        double *in,
+                                        fftw_complex *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c_3d(int n0,
+                                        int n1,
+                                        int n2, 
+                                        double *in,
+                                        fftw_complex *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c(int rank,
+                                     const int *n,
+                                     double *in,
+                                     fftw_complex *out, 
+                                     unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r_1d(int n, 
+                                        fftw_complex *in,
+                                        double *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r_2d(int n0,
+                                        int n1, 
+                                        fftw_complex *in,
+                                        double *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r_3d(int n0,
+                                        int n1,
+                                        int n2, 
+                                        fftw_complex *in,
+                                        double *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r(int rank,
+                                     const int *n,
+                                     fftw_complex *in,
+                                     double *out, 
+                                     unsigned flags);
+
+
+fftw_plan CUFFTAPI fftw_plan_many_dft(int rank,
+                                      const int *n,
+                                      int batch,
+                                      fftw_complex *in,
+                                      const int *inembed, int istride, int idist,
+                                      fftw_complex *out,
+                                      const int *onembed, int ostride, int odist,
+                                      int sign, unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_many_dft_r2c(int rank,
+                                          const int *n,
+                                          int batch,
+                                          double *in,
+                                          const int *inembed, int istride, int idist,
+                                          fftw_complex *out,
+                                          const int *onembed, int ostride, int odist,
+                                          unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_many_dft_c2r(int rank,
+                                          const int *n,
+                                          int batch,
+                                          fftw_complex *in,
+                                          const int *inembed, int istride, int idist,
+                                          double *out,
+                                          const int *onembed, int ostride, int odist,
+                                          unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru_dft(int rank, const fftw_iodim *dims,
+                                      int batch_rank, const fftw_iodim *batch_dims,
+                                      fftw_complex *in, fftw_complex *out,
+                                      int sign, unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru_dft_r2c(int rank, const fftw_iodim *dims,
+                                          int batch_rank, const fftw_iodim *batch_dims,
+                                          double *in, fftw_complex *out, 
+                                          unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru_dft_c2r(int rank, const fftw_iodim *dims,
+                                          int batch_rank, const fftw_iodim *batch_dims,
+                                          fftw_complex *in, double *out, 
+                                          unsigned flags);
+
+void CUFFTAPI fftw_execute(const fftw_plan plan);
+
+void CUFFTAPI fftw_execute_dft(const fftw_plan plan, 
+                               fftw_complex *idata,
+                               fftw_complex *odata);
+
+void CUFFTAPI fftw_execute_dft_r2c(const fftw_plan plan, 
+                                   double *idata,
+                                   fftw_complex *odata);
+
+void CUFFTAPI fftw_execute_dft_c2r(const fftw_plan plan, 
+                                   fftw_complex *idata,
+                                   double *odata);
+                                   
+                                   
+// CUFFTW defines and supports the following single precision APIs
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_1d(int n, 
+                                      fftwf_complex *in,
+                                      fftwf_complex *out, 
+                                      int sign, 
+                                      unsigned flags);
+                                   
+fftwf_plan CUFFTAPI fftwf_plan_dft_2d(int n0,
+                                      int n1, 
+                                      fftwf_complex *in,
+                                      fftwf_complex *out, 
+                                      int sign, 
+                                      unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_3d(int n0,
+                                      int n1,
+                                      int n2, 
+                                      fftwf_complex *in,
+                                      fftwf_complex *out, 
+                                      int sign, 
+                                      unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft(int rank,
+                                   const int *n,
+                                   fftwf_complex *in,
+                                   fftwf_complex *out, 
+                                   int sign, 
+                                   unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_1d(int n, 
+                                          float *in,
+                                          fftwf_complex *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_2d(int n0,
+                                          int n1, 
+                                          float *in,
+                                          fftwf_complex *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_3d(int n0,
+                                          int n1,
+                                          int n2, 
+                                          float *in,
+                                          fftwf_complex *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c(int rank,
+                                       const int *n,
+                                       float *in,
+                                       fftwf_complex *out, 
+                                       unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_1d(int n, 
+                                          fftwf_complex *in,
+                                          float *out, 
+                                          unsigned flags);
+                                      
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_2d(int n0,
+                                          int n1, 
+                                          fftwf_complex *in,
+                                          float *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_3d(int n0,
+                                        int n1,
+                                        int n2, 
+                                        fftwf_complex *in,
+                                        float *out, 
+                                        unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r(int rank,
+                                       const int *n,
+                                       fftwf_complex *in,
+                                       float *out, 
+                                       unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_many_dft(int rank,
+                                        const int *n,
+                                        int batch,
+                                        fftwf_complex *in,
+                                        const int *inembed, int istride, int idist,
+                                        fftwf_complex *out,
+                                        const int *onembed, int ostride, int odist,
+                                        int sign, unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_many_dft_r2c(int rank,
+                                            const int *n,
+                                            int batch,
+                                            float *in,
+                                            const int *inembed, int istride, int idist,
+                                            fftwf_complex *out,
+                                            const int *onembed, int ostride, int odist,
+                                            unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_many_dft_c2r(int rank,
+                                            const int *n,
+                                            int batch,
+                                            fftwf_complex *in,
+                                            const int *inembed, int istride, int idist,
+                                            float *out,
+                                            const int *onembed, int ostride, int odist,
+                                            unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru_dft(int rank, const fftwf_iodim *dims,
+                                        int batch_rank, const fftwf_iodim *batch_dims,
+                                        fftwf_complex *in, fftwf_complex *out,
+                                        int sign, unsigned flags);
+                                        
+fftwf_plan CUFFTAPI fftwf_plan_guru_dft_r2c(int rank, const fftwf_iodim *dims,
+                                            int batch_rank, const fftwf_iodim *batch_dims,
+                                            float *in, fftwf_complex *out, 
+                                            unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru_dft_c2r(int rank, const fftwf_iodim *dims,
+                                            int batch_rank, const fftwf_iodim *batch_dims,
+                                            fftwf_complex *in, float *out, 
+                                            unsigned flags);
+
+void CUFFTAPI fftwf_execute(const fftwf_plan plan); // single-precision handle type, matching the fftwf_execute_* siblings (both plan typedefs are void *)
+
+void CUFFTAPI fftwf_execute_dft(const fftwf_plan plan, 
+                                fftwf_complex *idata,
+                                fftwf_complex *odata);
+
+void CUFFTAPI fftwf_execute_dft_r2c(const fftwf_plan plan, 
+                                    float *idata,
+                                    fftwf_complex *odata);
+
+void CUFFTAPI fftwf_execute_dft_c2r(const fftwf_plan plan, 
+                                    fftwf_complex *idata,
+                                    float *odata);
+
+/// CUFFTW 64-bit Guru Interface
+/// dp
+fftw_plan CUFFTAPI fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, fftw_complex* out, int sign, unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru64_dft_r2c(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, double* in, fftw_complex* out, unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru64_dft_c2r(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, double* out, unsigned flags);
+
+/// sp
+fftwf_plan CUFFTAPI fftwf_plan_guru64_dft(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, fftwf_complex* out, int sign, unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_r2c(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, float* in, fftwf_complex* out, unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_c2r(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, float* out, unsigned flags);
+
+#ifdef _WIN32 // _CUFFTAPI(T) spells the leading part of a declaration returning T; only the position of CUFFTAPI differs
+#define _CUFFTAPI(T) T CUFFTAPI // _WIN32: CUFFTAPI follows the return type
+#else
+#define _CUFFTAPI(T) CUFFTAPI T // elsewhere: CUFFTAPI precedes the return type
+#endif
+
+// CUFFTW defines and supports the following support APIs
+_CUFFTAPI(void *) fftw_malloc(size_t n);
+
+_CUFFTAPI(void *) fftwf_malloc(size_t n);
+
+void CUFFTAPI fftw_free(void *pointer);
+
+void CUFFTAPI fftwf_free(void *pointer);
+
+void CUFFTAPI fftw_export_wisdom_to_file(FILE * output_file); 
+
+void CUFFTAPI fftwf_export_wisdom_to_file(FILE * output_file); 
+
+void CUFFTAPI fftw_import_wisdom_from_file(FILE * input_file); 
+
+void CUFFTAPI fftwf_import_wisdom_from_file(FILE * input_file); 
+
+void CUFFTAPI fftw_print_plan(const fftw_plan plan);                                 
+
+void CUFFTAPI fftwf_print_plan(const fftwf_plan plan);
+
+void CUFFTAPI fftw_set_timelimit(double seconds);
+
+void CUFFTAPI fftwf_set_timelimit(double seconds);
+
+double CUFFTAPI fftw_cost(const fftw_plan plan);
+                               
+double CUFFTAPI fftwf_cost(const fftwf_plan plan); // single-precision handle type, consistent with fftwf_print_plan / fftwf_destroy_plan
+
+void CUFFTAPI fftw_flops(const fftw_plan plan, double *add, double *mul, double *fma);
+
+void CUFFTAPI fftwf_flops(const fftwf_plan plan, double *add, double *mul, double *fma); // single-precision handle type; the three counters are written through the double pointers
+
+void CUFFTAPI fftw_destroy_plan(fftw_plan plan);
+
+void CUFFTAPI fftwf_destroy_plan(fftwf_plan plan);
+
+void CUFFTAPI fftw_cleanup(void);
+
+void CUFFTAPI fftwf_cleanup(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CUFFTW_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..299cf7e68206fd59da47e38d1f11dca84e3ed274
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d68a69db6781ab705c49a7a45ced26275869457
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e0c01fe1a0e5738b15b6952c63eebb273a28c12beefd13f01594da265a1b156
+size 101565
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/LICENSE b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..e466b0dfda14f3a7c8ece512937eb99c8b7b6d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Please also refer to the file .github/CONTRIBUTING.md, which clarifies licensing of
+external contributions to this project including patches, pull requests, etc.
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/RECORD
new file mode 100644
index 0000000000000000000000000000000000000000..d8caf213c456fbebc99e75e8743e41cd50a6832f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/RECORD
@@ -0,0 +1,65 @@
+../../../bin/pybind11-config,sha256=KwKhJwrv86OeAvCUq7sBopc-kDZzCJdnh_4RZIF8T-c,265
+pybind11-2.13.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+pybind11-2.13.6.dist-info/LICENSE,sha256=g5ZbhDuY9nDTqFvQQe1LNyyOxQ17SlmVqDrGl7pnXcs,1684
+pybind11-2.13.6.dist-info/METADATA,sha256=Gg_aZ0f3aFFDF3bQvgzR9kwVT_jogjVEc74kDVldlq0,9513
+pybind11-2.13.6.dist-info/RECORD,,
+pybind11-2.13.6.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pybind11-2.13.6.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+pybind11-2.13.6.dist-info/entry_points.txt,sha256=Q_kAwEJBDz8wHD0V50hY3AvchDk3Pfyeox2YHrAcWZ0,105
+pybind11-2.13.6.dist-info/top_level.txt,sha256=d1mqwSpUlmlZhXDQ9Y57eNlXc3dVDM1toKmfC1kJbvU,9
+pybind11/__init__.py,sha256=9vt06pvuwvdKW0YwYQKOTxBEgmQ0kb5ZUOJrgtGhdKs,459
+pybind11/__main__.py,sha256=p8vZ4btnkb_TaF03R1ac7qHmp-Eut86gCSUcVP8F3i4,2526
+pybind11/__pycache__/__init__.cpython-311.pyc,,
+pybind11/__pycache__/__main__.cpython-311.pyc,,
+pybind11/__pycache__/_version.cpython-311.pyc,,
+pybind11/__pycache__/commands.cpython-311.pyc,,
+pybind11/__pycache__/setup_helpers.cpython-311.pyc,,
+pybind11/_version.py,sha256=XUUceDIbc3kdRixyEVMy5v0LcGF36QUxMG9rJHlT6P4,232
+pybind11/commands.py,sha256=V43hKb7VE_abYZvaO-TpJLOU65n6W3ZrdYHGF3G3qUs,1243
+pybind11/include/pybind11/attr.h,sha256=QPjH7BfhL8QFwHHkrDak8gNOLMlb1itAO5fobjdoLp8,24334
+pybind11/include/pybind11/buffer_info.h,sha256=_FcQisqdpphfWXKeCGNv3Gq5ivy1z-qF3d1Noeteaok,7778
+pybind11/include/pybind11/cast.h,sha256=8gJ4Y4nc83dyq12CuU7ircAvAV1HoEZEVr0UyfeLQNA,71696
+pybind11/include/pybind11/chrono.h,sha256=A23naeloqn-1NKVAABOsJtHU9Vz8lfvrAICuLk-7qBM,8458
+pybind11/include/pybind11/common.h,sha256=ATg9Bt1pwF8qnNuI086fprM4CUTdrZdk_g2HXE1Sf6A,120
+pybind11/include/pybind11/complex.h,sha256=AaDZ-rEmK4tFaue-K9P5y3TxxnaQF6JwZ_6LAzkdLQI,2096
+pybind11/include/pybind11/detail/class.h,sha256=Bjk3K6xAMgwxPNTKfik7SC5Y24wgKs8Oz5VjvFdy0kA,29026
+pybind11/include/pybind11/detail/common.h,sha256=uxFMVYKW87YPbUz8Mo70xoVrpK2D1NzhKSwlDpwrJxo,54708
+pybind11/include/pybind11/detail/cpp_conduit.h,sha256=Bbx5728XzvyCL2gfW7kG6vgDltS5-V5gtkNQFPFevXg,2589
+pybind11/include/pybind11/detail/descr.h,sha256=D63pIHsF3luO_g51CjbJU8Wl9VOihciEXQhXvfRg-Rk,6035
+pybind11/include/pybind11/detail/exception_translation.h,sha256=fM1J19z00AuDlozHt0srpCJr-1uWW4kj_fLdSJDbdY8,2600
+pybind11/include/pybind11/detail/init.h,sha256=Sb1UkPecC5l9xj5naYLdUM7qIRLVpe614H9Frvyg8xg,17983
+pybind11/include/pybind11/detail/internals.h,sha256=xs-I7JdJACxx7gJf12HBLjL007jRXcAffPDsd0oTrq4,31985
+pybind11/include/pybind11/detail/type_caster_base.h,sha256=mdgZ-FIkxdSShMPPe69EXxjvd1eQDDBVX835B7XqCNo,48938
+pybind11/include/pybind11/detail/typeid.h,sha256=jw5pr9m72vkDsloT8vxl9wj17VJGcEdXDyziBlt89Js,1625
+pybind11/include/pybind11/detail/value_and_holder.h,sha256=hwNYlqxjUhlUqihwMjr6s3LhhKlZiTLaWREtQrgOAkQ,2814
+pybind11/include/pybind11/eigen.h,sha256=-HmSA1kgwCQ-GHUt7PHtTEc-vxqw9xARpF8PHWJip28,316
+pybind11/include/pybind11/eigen/common.h,sha256=dIeqmK7IzW5K4k2larPnA1A863rDp38U9YbNIwiIyYk,378
+pybind11/include/pybind11/eigen/matrix.h,sha256=VjCfx8M2AcD3m8THUbIEYidJyIClaNw9jMbd_Fzfo1s,32142
+pybind11/include/pybind11/eigen/tensor.h,sha256=csE3_N9yy-9k0SWQPJuAxmv8Jp_-lFrrPdVOyMV8-gc,18384
+pybind11/include/pybind11/embed.h,sha256=F3JQiOWnLGSuZ0NuEyBWFhHyVdczD8D_67kriU4QfsY,13362
+pybind11/include/pybind11/eval.h,sha256=7re-O2Eor1yD0Q_KgFkHIjKD17ejzII687Yszl9_KfE,4731
+pybind11/include/pybind11/functional.h,sha256=iOyYuNmbI-K3zgc1IMDwe4iHEOO3F8vwZbVSvbgxFQ4,5267
+pybind11/include/pybind11/gil.h,sha256=hsJj6z1iXqlo5c7fPCgEvK_-eeDoKZm7PKPwPNCdVVo,7702
+pybind11/include/pybind11/gil_safe_call_once.h,sha256=KKcy9Wgc_MJY-U5WpCZeNyzW7oVmC-d6yXkgephZ7zs,3993
+pybind11/include/pybind11/iostream.h,sha256=K5rPXoCYN325r1PptcJCIhPhgtRtTJQjMr7bvUIOwxk,8862
+pybind11/include/pybind11/numpy.h,sha256=xREhfycUTCOPF8CF-UWRdoLX0B23V6YWRiBqeRRElZg,84442
+pybind11/include/pybind11/operators.h,sha256=224RoAXcv1la4NNY9rQ3aD_AeC8S9ZKx3HVK1O8B4MU,9103
+pybind11/include/pybind11/options.h,sha256=qXvmnj--9fZSp56NYefnB3W5V17ppHlY1Srgo3DNBpw,2734
+pybind11/include/pybind11/pybind11.h,sha256=hbzXHRCBIW7dwtwaKjXKPC0Nl1MGHZ5-BjGsMlE3LuU,129898
+pybind11/include/pybind11/pytypes.h,sha256=BF8x4S5fsAzWf-d9pu83UsqjwRRo0ragHPy9sDOpUvk,99894
+pybind11/include/pybind11/stl.h,sha256=aMi1OCCw2Zb-IRLSlAtQEJJHtWsRJiLT9dKDMHST1Ic,15532
+pybind11/include/pybind11/stl/filesystem.h,sha256=lcYRCwNA8Xf4e4FRbeYh36SAwQjxKgyTXXdrguR4gM4,4559
+pybind11/include/pybind11/stl_bind.h,sha256=B5t8E0A4Zdgm2sF0J8Q_UI2U5uqEBQ9TsJCelsJ4q0E,28495
+pybind11/include/pybind11/type_caster_pyobject_ptr.h,sha256=H7pKBYTvUlibiJQEcKmeAkygSQwoCkuIyukNSDmVq-U,1929
+pybind11/include/pybind11/typing.h,sha256=PIjZFNNzY_KsrkHQPlg0Vt24jlTi6kThdOldEJjchtY,7000
+pybind11/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pybind11/setup_helpers.py,sha256=AwD_CjfVzX653nW4_i0U4bkFMCG4ZILoMZixyL8CZ4o,17490
+pybind11/share/cmake/pybind11/FindPythonLibsNew.cmake,sha256=_ZVzgVp6GQSEEv-b2iuauqTgoi1k2jHiNJlpl25MN-4,12187
+pybind11/share/cmake/pybind11/pybind11Common.cmake,sha256=lvJJ518cN7SjKDgjpXw0XU0eKW358wEloIcKCyCNPB0,16164
+pybind11/share/cmake/pybind11/pybind11Config.cmake,sha256=I96KX_zIZvLHbedHknVBj2YKhMt_QjM5LhCbzVNTvD8,7959
+pybind11/share/cmake/pybind11/pybind11ConfigVersion.cmake,sha256=vDsLSBg7-Nop8Ar9wRe0xKgGUV4LRzWE4XE0kE5B6fE,1403
+pybind11/share/cmake/pybind11/pybind11GuessPythonExtSuffix.cmake,sha256=WvhK2E-vWi9ArY0WJZXEK4kEFHpDQjl-au963hqH0r0,3321
+pybind11/share/cmake/pybind11/pybind11NewTools.cmake,sha256=zGLNjL28gzi8tvwiabudLsye7id_sZI5ooYfiBBllvM,12169
+pybind11/share/cmake/pybind11/pybind11Targets.cmake,sha256=tIjPtIpfb5m9POtu484cjGgNyWc5E4bbKzESLrcOLA0,4271
+pybind11/share/cmake/pybind11/pybind11Tools.cmake,sha256=5K6EahoS7wIaQIhjrDS4p4jTpYr0b_MronXKee8zCAc,8565
+pybind11/share/pkgconfig/pybind11.pc,sha256=M17R2NbpW6o7ujxioMP5M6WgVGrmJ_1vu_-E-H_rbes,171
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/REQUESTED b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/REQUESTED
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/entry_points.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8de5a647622cd4cbe433550be7a6b91be72e304a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/entry_points.txt
@@ -0,0 +1,5 @@
+[console_scripts]
+pybind11-config = pybind11.__main__:main
+
+[pipx.run]
+pybind11 = pybind11.__main__:main
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/cpp_conduit.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/cpp_conduit.h
new file mode 100644
index 0000000000000000000000000000000000000000..b66c2d39c0019dcfc8a5fd137bbf37d2515a97b2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/cpp_conduit.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2024 The pybind Community.
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+#include "internals.h"
+
+#include <typeinfo>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Forward declaration needed here: Refactoring opportunity.
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *);
+
+inline bool type_is_managed_by_our_internals(PyTypeObject *type_obj) {
+#if !defined(PYPY_VERSION)
+    // True when the type's tp_new slot is pybind11's own pybind11_object_new.
+    return type_obj->tp_new == pybind11_object_new;
+#else
+    auto &py_types = get_internals().registered_types_py; // PyPy: look the type up instead
+    return py_types.find(type_obj) != py_types.end();
+#endif
+}
+
+inline bool is_instance_method_of_type(PyTypeObject *type_obj, PyObject *attr_name) {
+    PyObject *found = _PyType_Lookup(type_obj, attr_name); // nullptr when the lookup finds nothing
+    return found == nullptr ? false : bool(PyInstanceMethod_Check(found));
+}
+
+inline object try_get_cpp_conduit_method(PyObject *obj) {
+    // Type objects are rejected up front; only non-type objects are inspected.
+    if (PyType_Check(obj)) {
+        return object();
+    }
+    PyTypeObject *const obj_type = Py_TYPE(obj);
+    str conduit_name("_pybind11_conduit_v1_");
+    // For types managed by our internals the attribute must be an instance method of the
+    // type; when it is, the callable check further down is skipped.
+    const bool trusted = type_is_managed_by_our_internals(obj_type);
+    if (trusted && !is_instance_method_of_type(obj_type, conduit_name.ptr())) {
+        return object();
+    }
+    PyObject *raw = PyObject_GetAttr(obj, conduit_name.ptr());
+    if (raw == nullptr) {
+        PyErr_Clear(); // a failed lookup yields an empty object, not a pending error
+        return object();
+    }
+    if (trusted || PyCallable_Check(raw) != 0) {
+        return reinterpret_steal<object>(raw);
+    }
+    Py_DECREF(raw);
+    return object();
+}
+
+inline void *try_raw_pointer_ephemeral_from_cpp_conduit(handle src,
+                                                        const std::type_info *cpp_type_info) {
+    object conduit_method = try_get_cpp_conduit_method(src.ptr());
+    if (!conduit_method) {
+        return nullptr; // src offers no usable conduit method
+    }
+    // Hand the requested C++ type over as a capsule named after typeid(std::type_info).
+    auto *erased_type_info = const_cast<void *>(static_cast<const void *>(cpp_type_info));
+    capsule type_info_capsule(erased_type_info, typeid(std::type_info).name());
+    object reply = conduit_method(
+        bytes(PYBIND11_PLATFORM_ABI_ID), type_info_capsule, bytes("raw_pointer_ephemeral"));
+    // Any reply that is not a capsule yields nullptr.
+    const bool is_capsule = isinstance<capsule>(reply);
+    return is_capsule ? reinterpret_borrow<capsule>(reply).get_pointer() : nullptr;
+}
+
+#define PYBIND11_HAS_CPP_CONDUIT 1
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/gil.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/gil.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b0edaee4e5521569fd985f273e3892fdcefe17f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/gil.h
@@ -0,0 +1,219 @@
+/*
+    pybind11/gil.h: RAII helpers for managing the GIL
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+
+#include <cassert>
+
+#if !defined(PYBIND11_SIMPLE_GIL_MANAGEMENT)
+#    include "detail/internals.h"
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// forward declarations
+PyThreadState *get_thread_state_unchecked();
+
+PYBIND11_NAMESPACE_END(detail)
+
+#if !defined(PYBIND11_SIMPLE_GIL_MANAGEMENT)
+
+/* The functions below essentially reproduce the PyGILState_* API using a RAII
+ * pattern, but there are a few important differences:
+ *
+ * 1. When acquiring the GIL from a non-main thread during the finalization
+ *    phase, the GILState API blindly terminates the calling thread, which
+ *    is often not what is wanted. This API does not do this.
+ *
+ * 2. The gil_scoped_release function can optionally cut the relationship
+ *    of a PyThreadState and its associated thread, which allows moving it to
+ *    another thread (this is a fairly rare/advanced use case).
+ *
+ * 3. The reference count of an acquired thread state can be controlled. This
+ *    can be handy to prevent cases where callbacks issued from an external
+ *    thread would otherwise constantly construct and destroy thread state data
+ *    structures.
+ *
+ * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an
+ * example which uses features 2 and 3 to migrate the Python thread of
+ * execution to another thread (to run the event loop on the original thread,
+ * in this case).
+ */
+
+class gil_scoped_acquire { // Scope guard: takes the GIL on construction, undoes that on destruction
+public:
+    PYBIND11_NOINLINE gil_scoped_acquire() {
+        auto &internals = detail::get_internals();
+        tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate);
+
+        if (!tstate) {
+            /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if
+               calling from a Python thread). Since we use a different key, this ensures
+               we don't create a new thread state and deadlock in PyEval_AcquireThread
+               below. Note we don't save this state with internals.tstate, since we don't
+               create it we would fail to clear it (its reference count should be > 0). */
+            tstate = PyGILState_GetThisThreadState();
+        }
+
+        if (!tstate) {
+            tstate = PyThreadState_New(internals.istate);
+            // Checked in every build configuration: the assignment below dereferences tstate.
+            if (!tstate) {
+                pybind11_fail("scoped_acquire: could not create thread state!");
+            }
+            // A state created here starts at zero; inc_ref() below raises it to one.
+            tstate->gilstate_counter = 0;
+            PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate);
+        } else {
+            release = detail::get_thread_state_unchecked() != tstate;
+        }
+
+        if (release) {
+            PyEval_AcquireThread(tstate);
+        }
+
+        inc_ref();
+    }
+
+    gil_scoped_acquire(const gil_scoped_acquire &) = delete;
+    gil_scoped_acquire &operator=(const gil_scoped_acquire &) = delete;
+
+    void inc_ref() { ++tstate->gilstate_counter; }
+
+    PYBIND11_NOINLINE void dec_ref() {
+        --tstate->gilstate_counter;
+#    if defined(PYBIND11_DETAILED_ERROR_MESSAGES)
+        if (detail::get_thread_state_unchecked() != tstate) {
+            pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!");
+        }
+        if (tstate->gilstate_counter < 0) {
+            pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!");
+        }
+#    endif
+        if (tstate->gilstate_counter == 0) { // last reference: tear the thread state down
+#    if defined(PYBIND11_DETAILED_ERROR_MESSAGES)
+            if (!release) {
+                pybind11_fail("scoped_acquire::dec_ref(): internal error!");
+            }
+#    endif
+            PyThreadState_Clear(tstate);
+            if (active) { // skipped after disarm()
+                PyThreadState_DeleteCurrent();
+            }
+            PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate);
+            release = false; // keeps the destructor from calling PyEval_SaveThread()
+        }
+    }
+
+    /// This method disables the PyThreadState_DeleteCurrent call made by dec_ref()
+    /// when the counter reaches zero. This method should be used if the interpreter
+    /// could be shutting down when this is called, as thread deletion is not
+    /// allowed during shutdown. Check _Py_IsFinalizing() on Python 3.7+, and
+    /// protect subsequent code.
+    PYBIND11_NOINLINE void disarm() { active = false; }
+
+    PYBIND11_NOINLINE ~gil_scoped_acquire() {
+        dec_ref();
+        if (release) {
+            PyEval_SaveThread();
+        }
+    }
+
+private:
+    PyThreadState *tstate = nullptr; // thread state found or created by the constructor
+    bool release = true;             // whether the constructor acquired / the destructor releases
+    bool active = true;              // cleared by disarm()
+};
+
+class gil_scoped_release { // Scope guard: PyEval_SaveThread() now, PyEval_RestoreThread() on exit
+public:
+    // PRECONDITION: The GIL must be held when this constructor is called.
+    explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
+        assert(PyGILState_Check());
+        // `get_internals()` must be called here unconditionally in order to initialize
+        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
+        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
+        auto &internals = detail::get_internals();
+        // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+        tstate = PyEval_SaveThread(); // remembered so the destructor can restore it
+        if (disassoc) {
+            // Python >= 3.7 can remove this, it's an int before 3.7
+            // NOLINTNEXTLINE(readability-qualified-auto)
+            auto key = internals.tstate;
+            PYBIND11_TLS_DELETE_VALUE(key); // the destructor stores tstate under this key again
+        }
+    }
+
+    gil_scoped_release(const gil_scoped_release &) = delete;
+    gil_scoped_release &operator=(const gil_scoped_release &) = delete;
+
+    /// This method disables the PyEval_RestoreThread call in the destructor, so
+    /// the GIL won't be re-acquired there. This method should be used if the
+    /// interpreter could be shutting down when this is called (the destructor
+    /// must not call PyEval_RestoreThread while the runtime is finalizing).
+    /// Check _Py_IsFinalizing() on Python 3.7+, and protect subsequent code.
+    PYBIND11_NOINLINE void disarm() { active = false; }
+
+    ~gil_scoped_release() {
+        if (!tstate) { // nothing was saved, so there is nothing to restore
+            return;
+        }
+        // `PyEval_RestoreThread()` should not be called if runtime is finalizing
+        if (active) {
+            PyEval_RestoreThread(tstate);
+        }
+        if (disassoc) {
+            // Python >= 3.7 can remove this, it's an int before 3.7
+            // NOLINTNEXTLINE(readability-qualified-auto)
+            auto key = detail::get_internals().tstate;
+            PYBIND11_TLS_REPLACE_VALUE(key, tstate); // runs even after disarm()
+        }
+    }
+
+private:
+    PyThreadState *tstate; // value returned by PyEval_SaveThread() in the constructor
+    bool disassoc;         // constructor argument: also clear/restore the internals.tstate slot
+    bool active = true;    // cleared by disarm()
+};
+
+#else // PYBIND11_SIMPLE_GIL_MANAGEMENT
+
+class gil_scoped_acquire { // PyGILState_Ensure()/PyGILState_Release() pair tied to a scope
+public:
+    gil_scoped_acquire() : state(PyGILState_Ensure()) {}
+    ~gil_scoped_acquire() { PyGILState_Release(state); }
+    gil_scoped_acquire(const gil_scoped_acquire &) = delete;
+    gil_scoped_acquire &operator=(const gil_scoped_acquire &) = delete;
+    void disarm() {} // does nothing in this variant
+
+private:
+    PyGILState_STATE state;
+};
+
+class gil_scoped_release { // PyEval_SaveThread()/PyEval_RestoreThread() pair tied to a scope
+public:
+    // PRECONDITION: the caller holds the GIL when constructing this object.
+    gil_scoped_release() {
+        assert(PyGILState_Check());
+        state = PyEval_SaveThread();
+    }
+    ~gil_scoped_release() { PyEval_RestoreThread(state); }
+    gil_scoped_release(const gil_scoped_release &) = delete;
+    gil_scoped_release &operator=(const gil_scoped_release &) = delete;
+    void disarm() {} // does nothing in this variant
+
+private:
+    PyThreadState *state;
+};
+
+#endif // PYBIND11_SIMPLE_GIL_MANAGEMENT
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/iostream.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/iostream.h
new file mode 100644
index 0000000000000000000000000000000000000000..1878089e3171bcfbd575ff4d7f925f981489316c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/iostream.h
@@ -0,0 +1,265 @@
+/*
+    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+
+    WARNING: The implementation in this file is NOT thread safe. Multiple
+    threads writing to a redirected ostream concurrently cause data races
+    and potentially buffer overflows. Therefore it is currently a requirement
+    that all (possibly) concurrent redirected ostream writes are protected by
+    a mutex.
+    #HelpAppreciated: Work on iostream.h thread safety.
+    For more background see the discussions under
+    https://github.com/pybind/pybind11/pull/2982 and
+    https://github.com/pybind/pybind11/pull/2995.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <ostream>
+#include <streambuf>
+#include <string>
+#include <utility>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Buffer that writes to Python instead of C++
+class pythonbuf : public std::streambuf { // forwards buffered characters to a Python stream
+private:
+    using traits_type = std::streambuf::traits_type;
+    // buf_size bytes are allocated; the constructor exposes all but the last one as put area.
+    const size_t buf_size;
+    std::unique_ptr<char[]> d_buffer;
+    object pywrite; // the `write` attribute of the Python stream passed to the constructor
+    object pyflush; // the `flush` attribute of the same stream
+    // Stores c at pptr() unless it is eof, then syncs; eof is returned only if sync() != 0.
+    int overflow(int c) override {
+        if (!traits_type::eq_int_type(c, traits_type::eof())) {
+            *pptr() = traits_type::to_char_type(c);
+            pbump(1);
+        }
+        return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
+    }
+
+    // Computes how many bytes at the end of the buffer are part of an
+    // incomplete sequence of UTF-8 bytes (at most the last three bytes are examined).
+    // Precondition: pbase() < pptr()
+    size_t utf8_remainder() const {
+        const auto rbase = std::reverse_iterator<char *>(pbase());
+        const auto rpptr = std::reverse_iterator<char *>(pptr());
+        auto is_ascii = [](char c) { return (static_cast<unsigned char>(c) & 0x80) == 0x00; };
+        auto is_leading = [](char c) { return (static_cast<unsigned char>(c) & 0xC0) == 0xC0; };
+        auto is_leading_2b = [](char c) { return static_cast<unsigned char>(c) <= 0xDF; };
+        auto is_leading_3b = [](char c) { return static_cast<unsigned char>(c) <= 0xEF; };
+        // If the last character is ASCII, there are no incomplete code points
+        if (is_ascii(*rpptr)) {
+            return 0;
+        }
+        // Otherwise, work back from the end of the buffer (at most three bytes)
+        // and find the first UTF-8 leading byte
+        const auto rpend = rbase - rpptr >= 3 ? rpptr + 3 : rbase;
+        const auto leading = std::find_if(rpptr, rpend, is_leading);
+        if (leading == rbase) {
+            return 0;
+        }
+        const auto dist = static_cast<size_t>(leading - rpptr);
+        size_t remainder = 0;
+        // dist counts the bytes that follow the leading byte found above.
+        if (dist == 0) {
+            remainder = 1; // 1-byte code point is impossible
+        } else if (dist == 1) {
+            remainder = is_leading_2b(*leading) ? 0 : dist + 1;
+        } else if (dist == 2) {
+            remainder = is_leading_3b(*leading) ? 0 : dist + 1;
+        }
+        // else if (dist >= 3), at least 4 bytes before encountering an UTF-8
+        // leading byte, either no remainder or invalid UTF-8.
+        // Invalid UTF-8 will cause an exception later when converting
+        // to a Python string, so that's not handled here.
+        return remainder;
+    }
+
+    // This function must be non-virtual to be called in a destructor. Always returns 0.
+    int _sync() {
+        if (pbase() != pptr()) { // If buffer is not empty
+            gil_scoped_acquire tmp;
+            // This subtraction cannot be negative, so dropping the sign.
+            auto size = static_cast<size_t>(pptr() - pbase());
+            size_t remainder = utf8_remainder();
+            // Only the bytes before the incomplete tail are handed to Python.
+            if (size > remainder) {
+                str line(pbase(), size - remainder);
+                pywrite(std::move(line));
+                pyflush();
+            }
+
+            // Copy the remainder at the end of the buffer to the beginning:
+            if (remainder > 0) {
+                std::memmove(pbase(), pptr() - remainder, remainder);
+            }
+            setp(pbase(), epptr());
+            pbump(static_cast<int>(remainder));
+        }
+        return 0;
+    }
+
+    int sync() override { return _sync(); }
+
+public:
+    explicit pythonbuf(const object &pyostream, size_t buffer_size = 1024)
+        : buf_size(buffer_size), d_buffer(new char[buf_size]), pywrite(pyostream.attr("write")),
+          pyflush(pyostream.attr("flush")) {
+        setp(d_buffer.get(), d_buffer.get() + buf_size - 1); // last byte is left for overflow()
+    }
+
+    pythonbuf(pythonbuf &&) = default;
+    // NOTE(review): _sync() calls into Python; confirm nothing can throw out of this destructor.
+    /// Sync before destroy
+    ~pythonbuf() override { _sync(); }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/** \rst
+    This is a move-only guard that redirects output.
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        {
+            py::scoped_ostream_redirect output;
+            std::cout << "Hello, World!"; // Python stdout
+        } // <-- return std::cout to normal
+
+    You can explicitly pass the c++ stream and the python object,
+    for example to guard stderr instead.
+
+    .. code-block:: cpp
+
+        {
+            py::scoped_ostream_redirect output{
+                std::cerr, py::module::import("sys").attr("stderr")};
+            std::cout << "Hello, World!";
+        }
+ \endrst */
+class scoped_ostream_redirect { // swaps a C++ stream's buffer for a pythonbuf while alive
+protected:
+    std::streambuf *old;      // buffer the stream had before the constructor replaced it
+    std::ostream &costream;   // the C++ stream being redirected
+    detail::pythonbuf buffer; // replacement buffer built from the Python stream object
+
+public:
+    explicit scoped_ostream_redirect(std::ostream &costream = std::cout,
+                                     const object &pyostream
+                                     = module_::import("sys").attr("stdout"))
+        : costream(costream), buffer(pyostream) {
+        old = costream.rdbuf(&buffer); // rdbuf() returns the buffer that was installed before
+    }
+    // Puts the saved buffer back on the stream.
+    ~scoped_ostream_redirect() { costream.rdbuf(old); }
+    // NOTE(review): the defaulted move below does not call rdbuf() again — confirm intended use.
+    scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
+    scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
+};
+
+/** \rst
+    Like `scoped_ostream_redirect`, but redirects cerr by default. This class
+    is provided primarily to make ``py::call_guard`` easier to make.
+
+    .. code-block:: cpp
+
+     m.def("noisy_func", &noisy_func,
+           py::call_guard<scoped_ostream_redirect,
+                          scoped_estream_redirect>());
+
+\endrst */
+class scoped_estream_redirect : public scoped_ostream_redirect { // only the defaults differ
+public:
+    explicit scoped_estream_redirect(std::ostream &costream = std::cerr,
+                                     const object &pyostream
+                                     = module_::import("sys").attr("stderr"))
+        : scoped_ostream_redirect(costream, pyostream) {} // all work is done by the base class
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Class to redirect output as a context manager. C++ backend.
+class OstreamRedirect {
+public:
+    explicit OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+        : capture_stdout_(do_stdout), capture_stderr_(do_stderr) {}
+
+    void enter() { // installs a fresh guard for each stream selected at construction
+        if (capture_stdout_) {
+            stdout_guard_.reset(new scoped_ostream_redirect());
+        }
+        if (capture_stderr_) {
+            stderr_guard_.reset(new scoped_estream_redirect());
+        }
+    }
+
+    void exit() { // drops both guards, stdout first
+        stdout_guard_.reset();
+        stderr_guard_.reset();
+    }
+
+private:
+    bool capture_stdout_, capture_stderr_;
+    std::unique_ptr<scoped_ostream_redirect> stdout_guard_;
+    std::unique_ptr<scoped_estream_redirect> stderr_guard_;
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+/** \rst
+    This is a helper function to add a C++ redirect context manager to Python
+    instead of using a C++ guard. To use it, add the following to your binding code:
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        py::add_ostream_redirect(m, "ostream_redirect");
+
+    You now have a Python context manager that redirects your output:
+
+    .. code-block:: python
+
+        with m.ostream_redirect():
+            m.print_to_cout_function()
+
+    This manager can optionally be told which streams to operate on:
+
+    .. code-block:: python
+
+        with m.ostream_redirect(stdout=True, stderr=True):
+            m.noisy_function_with_error_printing()
+
+ \endrst */
+inline class_<detail::OstreamRedirect>
+add_ostream_redirect(module_ m, const std::string &name = "ostream_redirect") {
+    class_<detail::OstreamRedirect> cls(std::move(m), name.c_str(), module_local());
+    cls.def(init<bool, bool>(), arg("stdout") = true, arg("stderr") = true);
+    cls.def("__enter__", &detail::OstreamRedirect::enter);
+    return cls.def("__exit__", [](detail::OstreamRedirect &self_, const args &) { self_.exit(); });
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_C.cpython-311-x86_64-linux-gnu.so b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_C.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..acb9f6e3e8dab7a9f988196266a61b1478c977e0
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_C.cpython-311-x86_64-linux-gnu.so differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.pyi b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..8f6e61bf678ae19145799f44bb6beceac9bf24c9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.pyi
@@ -0,0 +1,25648 @@
+# @generated from torch/_C/_VariableFunctions.pyi.in
+# mypy: disable-error-code="type-arg"
+
+import builtins
+from typing import (
+    Any,
+    Callable,
+    ContextManager,
+    Iterator,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    overload,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+)
+
+import torch
+from torch import contiguous_format, Generator, inf, memory_format, strided, SymInt, Tensor
+from torch.types import (
+    _bool,
+    _complex,
+    _device,
+    _dtype,
+    _float,
+    _int,
+    _layout,
+    _qscheme,
+    _size,
+    Device,
+    Number,
+)
+
+from torch._prims_common import DeviceLikeType
+
+@overload
+def __and__(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def __and__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ...
+@overload
+def __lshift__(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def __lshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ...
+@overload
+def __or__(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def __or__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ...
+@overload
+def __rshift__(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def __rshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ...
+@overload
+def __xor__(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def __xor__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ...
+def _adaptive_avg_pool2d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ...
+def _adaptive_avg_pool3d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ...
+def _add_batch_dim(input: Tensor, batch_dim: _int, level: _int) -> Tensor: ...
+@overload
+def _add_relu(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def _add_relu(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ...
+@overload
+def _add_relu_(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ...
+@overload
+def _add_relu_(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ...
+def _addmm_activation(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, use_gelu: _bool = False, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def _aminmax(input: Tensor) -> Tuple[Tensor, Tensor]: ...
+@overload
+def _aminmax(input: Tensor, dim: _int, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: ...
+def _amp_foreach_non_finite_check_and_unscale_(self: Union[Tuple[Tensor, ...], List[Tensor]], found_inf: Tensor, inv_scale: Tensor) -> None: ...
+def _amp_update_scale_(input: Tensor, growth_tracker: Tensor, found_inf: Tensor, scale_growth_factor: _float, scale_backoff_factor: _float, growth_interval: _int) -> Tensor: ...
+@overload
+def _assert_async(input: Tensor) -> None:  # the docstring calls this first parameter "tensor"; the stub names it `input`
+    r"""
+    _assert_async(tensor) -> None
+    
+    Asynchronously assert that the contents of tensor are nonzero.  For CPU tensors,
+    this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for
+    CUDA tensors, we DO NOT synchronize and you may only find out the assertion
+    failed at a later CUDA kernel launch.  Asynchronous assertion can be helpful for
+    testing invariants in CUDA tensors without giving up performance.  This function
+    is NOT intended to be used for regular error checking, as it will trash your CUDA
+    context if the assert fails (forcing you to restart your PyTorch process.)
+    
+    Args:
+        tensor (Tensor): a one element tensor to test to see if it is nonzero.  Zero
+            elements (including False for boolean tensors) cause an assertion failure
+            to be raised.
+    """
+    ...
+@overload
+def _assert_async(input: Tensor, assert_msg: str) -> None:  # `assert_msg` has no entry in the Args section below
+    r"""
+    _assert_async(tensor, assert_msg) -> None
+    
+    Asynchronously assert that the contents of tensor are nonzero.  For CPU tensors,
+    this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for
+    CUDA tensors, we DO NOT synchronize and you may only find out the assertion
+    failed at a later CUDA kernel launch.  Asynchronous assertion can be helpful for
+    testing invariants in CUDA tensors without giving up performance.  This function
+    is NOT intended to be used for regular error checking, as it will trash your CUDA
+    context if the assert fails (forcing you to restart your PyTorch process.)
+    
+    Args:
+        tensor (Tensor): a one element tensor to test to see if it is nonzero.  Zero
+            elements (including False for boolean tensors) cause an assertion failure
+            to be raised.
+    """
+    ...
+def _assert_scalar(self: Union[Number, _complex], assert_msg: str) -> None: ...
+def _assert_tensor_metadata(a: Tensor, size: Optional[Sequence[Union[_int, SymInt]]] = None, stride: Optional[Sequence[Union[_int, SymInt]]] = None, dtype: Optional[_dtype] = None) -> None: ...
+def _batch_norm_impl_index(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, _int]: ...
+def _cast_Byte(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Char(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Double(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Float(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Half(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Int(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Long(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _cast_Short(input: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _choose_qparams_per_tensor(input: Tensor, reduce_range: _bool = False) -> Tuple[_float, _int]: ...
+def _chunk_cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int, num_chunks: _int, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _coalesce(input: Tensor) -> Tensor: ...
+def _compute_linear_combination(input: Tensor, coefficients: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _conj(input: Tensor) -> Tensor: ...
+def _conj_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _conj_physical(input: Tensor) -> Tensor: ...
+def _convert_indices_from_coo_to_csr(input: Tensor, size: _int, *, out_int32: _bool = False, out: Optional[Tensor] = None) -> Tensor: ...
+def _convert_indices_from_csr_to_coo(crow_indices: Tensor, col_indices: Tensor, *, out_int32: _bool = False, transpose: _bool = False, out: Optional[Tensor] = None) -> Tensor: ...
+def _convert_weight_to_int4pack(input: Tensor, innerKTiles: _int) -> Tensor: ...
+@overload  # variant without allow_tf32; output_padding is typed _size here but Sequence[Union[_int, SymInt]] in the other overload
+def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: _size, groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool) -> Tensor: ...
+@overload  # variant with the extra trailing allow_tf32 flag
+def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool, allow_tf32: _bool) -> Tensor: ...
+def _convolution_mode(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: str, dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def _copy_from(input: Tensor, dst: Tensor, non_blocking: _bool = False) -> Tensor: ...
+def _copy_from_and_resize(input: Tensor, dst: Tensor) -> Tensor: ...
+def _cslt_compress(input: Tensor) -> Tensor: ...
+def _cslt_sparse_mm(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False, alg_id: _int = 0) -> Tensor: ...
+def _cslt_sparse_mm_search(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False) -> _int: ...
+@overload
+def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ...
+@overload
+def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ...
+@overload
+def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ...
+@overload
+def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ...
+def _cudnn_init_dropout_state(dropout: _float, train: _bool, dropout_seed: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+def _cudnn_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, weight_buf: Optional[Tensor], hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: Sequence[Union[_int, SymInt]], dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ...
+def _cudnn_rnn_flatten_weight(weight_arr: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, input_size: Union[_int, SymInt], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, bidirectional: _bool) -> Tensor: ...
+def _cufft_clear_plan_cache(device_index: _int) -> None: ...
+def _cufft_get_plan_cache_max_size(device_index: _int) -> _int: ...
+def _cufft_get_plan_cache_size(device_index: _int) -> _int: ...
+def _cufft_set_plan_cache_max_size(device_index: _int, max_size: _int) -> None: ...
+def _cummax_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ...
+def _cummin_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ...
+def _debug_has_internal_overlap(input: Tensor) -> _int: ...
+def _dim_arange(like: Tensor, dim: _int) -> Tensor: ...
+def _dirichlet_grad(x: Tensor, alpha: Tensor, total: Tensor) -> Tensor: ...
+def _disable_functionalization(): ...  # NOTE(review): generated stub has no return annotation — presumably returns None; confirm against the binding
+@overload
+def _efficientzerotensor(size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+@overload
+def _efficientzerotensor(*size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+def _embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+def _embedding_bag_forward_only(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+@overload
+def _empty_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+@overload
+def _empty_affine_quantized(*size: _int, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+@overload
+def _empty_per_channel_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+@overload
+def _empty_per_channel_affine_quantized(*size: _int, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+def _enable_functionalization(*, reapply_views: _bool = False): ...  # NOTE(review): generated stub has no return annotation — presumably returns None; confirm against the binding
+def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: ...
+def _fake_quantize_learnable_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ...
+def _fake_quantize_learnable_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ...
+def _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(input: Tensor, scale: Tensor, zero_point: Tensor, fake_quant_enabled: Tensor, quant_min: _int, quant_max: _int) -> torch.return_types._fake_quantize_per_tensor_affine_cachemask_tensor_qparams: ...
+def _fft_c2c(input: Tensor, dim: Sequence[Union[_int, SymInt]], normalization: _int, forward: _bool, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _fft_c2r(input: Tensor, dim: _size, normalization: _int, last_dim_size: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: ...
+def _fft_r2c(input: Tensor, dim: _size, normalization: _int, onesided: _bool, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _fill_mem_eff_dropout_mask_(input: Tensor, dropout_p: _float, seed: _int, offset: _int) -> Tensor: ...
+def _foobar(input: Tensor, arg1: _bool = True, arg2: _bool = True, *, arg3: _bool = True) -> Tensor: ...
+def _foreach_abs(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_abs(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.abs` to each Tensor of the input list.
+    """
+    ...
+def _foreach_abs_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_abs_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.abs` to each Tensor of the input list.
+    """
+    ...
+def _foreach_acos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_acos(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.acos` to each Tensor of the input list.
+    """
+    ...
+def _foreach_acos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_acos_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.acos` to each Tensor of the input list.
+    """
+    ...
+@overload
+def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ...
+@overload
+def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> None: ...
+@overload
+def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ...
+@overload
+def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ...
+@overload
+def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ...
+@overload
+def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ...
+def _foreach_asin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_asin(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.asin` to each Tensor of the input list.
+    """
+    ...
+def _foreach_asin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_asin_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.asin` to each Tensor of the input list.
+    """
+    ...
+def _foreach_atan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_atan(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.atan` to each Tensor of the input list.
+    """
+    ...
+def _foreach_atan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_atan_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.atan` to each Tensor of the input list.
+    """
+    ...
+def _foreach_ceil(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_ceil(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.ceil` to each Tensor of the input list.
+    """
+    ...
+def _foreach_ceil_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_ceil_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.ceil` to each Tensor of the input list.
+    """
+    ...
+@overload
+def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+@overload
+def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+def _foreach_copy_(self: Union[Tuple[Tensor, ...], List[Tensor]], src: Union[Tuple[Tensor, ...], List[Tensor]], non_blocking: _bool = False) -> None: ...
+def _foreach_cos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_cos(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.cos` to each Tensor of the input list.
+    """
+    ...
+def _foreach_cos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_cos_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.cos` to each Tensor of the input list.
+    """
+    ...
+def _foreach_cosh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_cosh(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.cosh` to each Tensor of the input list.
+    """
+    ...
+def _foreach_cosh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_cosh_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.cosh` to each Tensor of the input list.
+    """
+    ...
+@overload
+def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ...
+@overload
+def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+def _foreach_erf(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_erf(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.erf` to each Tensor of the input list.
+    """
+    ...
+def _foreach_erf_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_erf_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.erf` to each Tensor of the input list.
+    """
+    ...
+def _foreach_erfc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_erfc(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.erfc` to each Tensor of the input list.
+    """
+    ...
+def _foreach_erfc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_erfc_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.erfc` to each Tensor of the input list.
+    """
+    ...
+def _foreach_exp(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_exp(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.exp` to each Tensor of the input list.
+    """
+    ...
+def _foreach_exp_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_exp_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.exp` to each Tensor of the input list.
+    """
+    ...
+def _foreach_expm1(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_expm1(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.expm1` to each Tensor of the input list.
+    """
+    ...
+def _foreach_expm1_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_expm1_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.expm1` to each Tensor of the input list.
+    """
+    ...
+def _foreach_floor(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_floor(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.floor` to each Tensor of the input list.
+    """
+    ...
+def _foreach_floor_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_floor_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.floor` to each Tensor of the input list.
+    """
+    ...
+def _foreach_frac(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_frac(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.frac` to each Tensor of the input list.
+    """
+    ...
+def _foreach_frac_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_frac_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.frac` to each Tensor of the input list.
+    """
+    ...
+@overload
+def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+def _foreach_lgamma(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_lgamma(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.lgamma` to each Tensor of the input list.
+    """
+    ...
+def _foreach_lgamma_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_lgamma_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.lgamma` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_log(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.log` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log10(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_log10(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.log10` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log10_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_log10_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.log10` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log1p(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_log1p(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.log1p` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log1p_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_log1p_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.log1p` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log2(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_log2(self: List[Tensor]) -> List[Tensor]
+    
+    Apply :func:`torch.log2` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log2_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_log2_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.log2` to each Tensor of the input list.
+    """
+    ...
+def _foreach_log_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_log_(self: List[Tensor]) -> None
+    
+    Apply :func:`torch.log` to each Tensor of the input list.
+    """
+    ...
+@overload
+def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+@overload
+def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+@overload
+def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ...
+@overload
+def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+def _foreach_neg(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_neg(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.neg` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_neg_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_neg_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.neg` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_norm(self: Union[Tuple[Tensor, ...], List[Tensor]], ord: Union[Number, _complex] = 2) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_pow(self: Union[Number, _complex], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> None: ...
+@overload
+def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+def _foreach_reciprocal(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_reciprocal(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.reciprocal` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_reciprocal_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_reciprocal_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.reciprocal` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_round(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_round(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.round` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_round_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_round_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.round` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_sigmoid(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_sigmoid(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.sigmoid` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_sigmoid_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_sigmoid_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.sigmoid` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_sign(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+def _foreach_sign_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ...
+def _foreach_sin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_sin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.sin` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_sin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_sin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.sin` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_sinh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_sinh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.sinh` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_sinh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_sinh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.sinh` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_sqrt(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_sqrt(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.sqrt` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_sqrt_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_sqrt_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.sqrt` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+@overload
+def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ...
+@overload
+def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ...
+@overload
+def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ...
+@overload
+def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ...
+def _foreach_tan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_tan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.tan` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_tan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_tan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.tan` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_tanh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_tanh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.tanh` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_tanh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_tanh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.tanh` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_trunc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    _foreach_trunc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]
+    
+    Apply :func:`torch.trunc` to each Tensor of the input list or tuple and return the results as a tuple.
+    """
+    ...
+def _foreach_trunc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_trunc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :func:`torch.trunc` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _foreach_zero_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: 
+    r"""
+    _foreach_zero_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None
+    
+    Apply :meth:`torch.Tensor.zero_` in place to each Tensor of the input list or tuple; returns ``None``.
+    """
+    ...
+def _from_functional_tensor(t: Tensor) -> Tensor: ...
+def _functional_assert_async(input: Tensor, assert_msg: str, dep_token: Tensor) -> Tensor: ...
+def _functional_assert_scalar(self: Union[Number, _complex], assert_msg: str, dep_token: Tensor) -> Tensor: ...
+def _functional_sym_constrain_range(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ...
+def _functional_sym_constrain_range_for_size(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ...
+def _functionalize_are_all_mutations_hidden_from_autograd(t: Tensor) -> _bool: ...
+def _functionalize_are_all_mutations_under_no_grad_or_inference_mode(t: Tensor) -> _bool: ...
+def _functionalize_commit_update(t: Tensor) -> None: ...
+def _functionalize_mark_mutation_hidden_from_autograd(t: Tensor) -> None: ...
+def _functionalize_replace(self_: Tensor, other: Tensor) -> None: ...
+def _functionalize_sync(t: Tensor) -> None: ...
+@overload
+def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ...
+@overload
+def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ...
+@overload
+def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ...
+@overload
+def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ...
+def _fused_dropout(input: Tensor, p: _float, generator: Optional[Generator] = None) -> Tuple[Tensor, Tensor]: ...
+def _fused_moving_avg_obs_fq_helper(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> torch.return_types._fused_moving_avg_obs_fq_helper: ...
+def _fused_sdp_choice(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> _int: ...
+@overload
+def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: Tensor, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ...
+@overload
+def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: _float, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ...
+def _fw_primal_copy(input: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _grid_sampler_2d_cpu_fallback(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ...
+def _has_compatible_shallow_copy_type(input: Tensor, from_: Tensor) -> _bool: ...
+def _histogramdd_bin_edges(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tuple[Tensor, ...]: ...
+def _histogramdd_from_bin_cts(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ...
+def _histogramdd_from_bin_tensors(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], *, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ...
+def _index_put_impl_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False, unsafe: _bool = False) -> Tensor: ...
+def _indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _int_mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _is_all_true(input: Tensor) -> Tensor: ...
+def _is_any_true(input: Tensor) -> Tensor: ...
+def _is_functional_tensor(t: Tensor) -> _bool: ...
+def _is_zerotensor(input: Tensor) -> _bool: ...
+def _lazy_clone(input: Tensor) -> Tensor: ...
+def _linalg_check_errors(info: Tensor, api_name: str, *, is_matrix: _bool) -> None: ...
+def _linalg_det(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_det: ...
+def _linalg_eigh(A: Tensor, UPLO: str = "L", compute_v: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_eigh: ...
+def _linalg_slogdet(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_slogdet: ...
+def _linalg_solve_ex(A: Tensor, B: Tensor, *, left: _bool = True, check_errors: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_solve_ex: ...
+def _linalg_svd(A: Tensor, full_matrices: _bool = False, compute_uv: _bool = True, *, driver: Optional[str] = None, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_svd: ...
+def _log_softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _lstm_mps(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: ...
+def _lu_with_info(input: Tensor, pivot: _bool = True, check_errors: _bool = True) -> torch.return_types._lu_with_info: ...
+def _make_dep_token(*, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[DeviceLikeType] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+def _make_dual(primal: Tensor, tangent: Tensor, level: _int) -> Tensor: ...
+def _make_dual_copy(primal: Tensor, tangent: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _make_per_channel_quantized_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int) -> Tensor: ...
+def _make_per_tensor_quantized_tensor(input: Tensor, scale: _float, zero_point: _int) -> Tensor: ...
+def _masked_scale(input: Tensor, mask: Tensor, scale: _float) -> Tensor: ...
+def _masked_softmax(input: Tensor, mask: Tensor, dim: Optional[_int] = None, mask_type: Optional[_int] = None) -> Tensor: ...
+def _mixed_dtypes_linear(input: Tensor, weight: Tensor, scale: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None) -> Tensor: ...
+def _mkldnn_reshape(input: Tensor, shape: _size) -> Tensor: ...
+def _mkldnn_transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ...
+def _mkldnn_transpose_(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ...
+def _mps_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def _mps_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+@overload
+def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ...
+@overload
+def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ...
+def _native_batch_norm_legit_no_training(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, momentum: _float, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ...
+def _native_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None, need_weights: _bool = True, average_attn_weights: _bool = True, mask_type: Optional[_int] = None) -> Tuple[Tensor, Tensor]: ...
+def _neg_view(input: Tensor) -> Tensor: ...
+def _neg_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _nested_from_padded(padded: Tensor, cpu_nested_shape_example: Tensor, fuse_transform_0213: _bool = False) -> Tensor: ...
+def _nested_from_padded_and_nested_example(padded: Tensor, nt_example: Tensor) -> Tensor: ...
+def _nested_get_jagged_dummy(any: Tensor) -> Tensor: ...
+def _nested_get_lengths(input: Tensor) -> Tensor: ...
+def _nested_get_offsets(input: Tensor) -> Tensor: ...
+def _nested_get_ragged_idx(input: Tensor) -> _int: ...
+def _nested_get_values(input: Tensor) -> Tensor: ...
+def _nested_get_values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _nested_tensor_from_mask(t: Tensor, mask: Tensor, mask_check: _bool = True) -> Tensor: ...
+def _nested_tensor_from_mask_left_aligned(t: Tensor, mask: Tensor) -> _bool: ...
+def _nested_tensor_from_tensor_list(list: Union[Tuple[Tensor, ...], List[Tensor]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[DeviceLikeType] = None, pin_memory: Optional[_bool] = None) -> Tensor: ...
+def _nested_tensor_softmax_with_shape(input: Tensor, query: Tensor) -> Tensor: ...
+def _nested_view_from_buffer(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor) -> Tensor: ...
+def _nested_view_from_buffer_copy(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _nested_view_from_jagged(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1) -> Tensor: ...
+def _nested_view_from_jagged_copy(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _nnpack_available() -> _bool: ...
+def _nnpack_spatial_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Union[_int, SymInt, Sequence[Union[_int, SymInt]]], stride: Union[_int, SymInt, Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ...
+def _pack_padded_sequence(input: Tensor, lengths: Tensor, batch_first: _bool) -> Tuple[Tensor, Tensor]: ...
+def _pad_packed_sequence(data: Tensor, batch_sizes: Tensor, batch_first: _bool, padding_value: Union[Number, _complex], total_length: _int) -> Tuple[Tensor, Tensor]: ...
+def _pin_memory(input: Tensor, device: Optional[DeviceLikeType] = None) -> Tensor: ...
+def _prelu_kernel(input: Tensor, weight: Tensor) -> Tensor: ...
+def _print(s: str) -> None: ...
+def _propagate_xla_data(input: Tensor, output: Tensor) -> None: ...
+def _remove_batch_dim(input: Tensor, level: _int, batch_size: _int, out_dim: _int) -> Tensor: ...
+def _reshape_alias_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: ...
+def _reshape_from_tensor(input: Tensor, shape: Tensor) -> Tensor: ...
+def _resize_output_(input: Tensor, size: Sequence[Union[_int, SymInt]], device: Optional[DeviceLikeType]) -> Tensor: ...
+def _rowwise_prune(weight: Tensor, mask: Tensor, compressed_indices_dtype: _dtype) -> Tuple[Tensor, Tensor]: ...
+def _sample_dirichlet(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ...
+def _saturate_weight_to_fp16(weight: Tensor) -> Tensor: ...
+def _scaled_dot_product_attention_math(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, dropout_mask: Optional[Tensor] = None, *, scale: Optional[_float] = None) -> Tuple[Tensor, Tensor]: ...
+def _scaled_dot_product_cudnn_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_cudnn_attention: ...
+def _scaled_dot_product_efficient_attention(query: Tensor, key: Tensor, value: Tensor, attn_bias: Optional[Tensor], compute_log_sumexp: _bool, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_efficient_attention: ...
+def _scaled_dot_product_flash_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention: ...
+def _scaled_dot_product_flash_attention_for_cpu(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, *, attn_mask: Optional[Tensor] = None, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention_for_cpu: ...
+def _scaled_mm(input: Tensor, mat2: Tensor, *, bias: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, scale_a: Optional[Tensor] = None, scale_b: Optional[Tensor] = None, scale_result: Optional[Tensor] = None, use_fast_accum: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor]: ...
+def _shape_as_tensor(input: Tensor) -> Tensor: ...
+def _sobol_engine_draw(quasi: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int, dtype: Optional[_dtype]) -> Tuple[Tensor, Tensor]: ...
+def _sobol_engine_ff_(input: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int) -> Tensor: ...
+def _sobol_engine_initialize_state_(input: Tensor, dimension: _int) -> Tensor: ...
+def _sobol_engine_scramble_(input: Tensor, ltm: Tensor, dimension: _int) -> Tensor: ...
+def _softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, grad_input: Optional[Tensor] = None) -> Tensor: ...
+def _sparse_broadcast_to(input: Tensor, size: _size) -> Tensor: ...
+def _sparse_broadcast_to_copy(input: Tensor, size: _size, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _sparse_csr_prod(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ...
+def _sparse_csr_sum(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ...
+def _sparse_log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ...
+def _sparse_semi_structured_linear(input: Tensor, weight: Tensor, meta: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None, out_dtype: Optional[_dtype] = None) -> Tensor: ...
+def _sparse_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ...
+def _sparse_sparse_matmul(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def _sparse_sum(input: Tensor) -> Tensor: ...
+@overload
+def _sparse_sum(input: Tensor, *, dtype: _dtype) -> Tensor: ...
+@overload
+def _sparse_sum(input: Tensor, dim: Union[_int, _size]) -> Tensor: ...
+@overload
+def _sparse_sum(input: Tensor, dim: Union[_int, _size], *, dtype: _dtype) -> Tensor: ...
+def _stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _standard_gamma(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ...
+def _standard_gamma_grad(input: Tensor, output: Tensor) -> Tensor: ...
+def _sync(t: Tensor) -> None: ...
+@overload
+def _test_autograd_multiple_dispatch(input: Tensor) -> Tensor: ...
+@overload
+def _test_autograd_multiple_dispatch(input: Tensor, b: _bool) -> Tensor: ...
+def _test_autograd_multiple_dispatch_view(input: Tensor) -> Tensor: ...
+def _test_autograd_multiple_dispatch_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _test_check_tensor(input: Tensor) -> Tensor: ...
+def _test_functorch_fallback(input: Tensor, other: Tensor) -> Tensor: ...
+def _test_parallel_materialize(input: Tensor, num_parallel: _int, skip_first: _bool = False) -> Tensor: ...
+def _test_serialization_subcmul(input: Tensor, other: Tensor, alpha: Union[Number, _complex] = 1) -> Tensor: ...
+def _to_cpu(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ...
+def _to_functional_tensor(t: Tensor) -> Tensor: ...
+def _to_sparse_semi_structured(dense: Tensor) -> Tuple[Tensor, Tensor]: ...
+def _transform_bias_rescale_qkv(qkv: Tensor, qkv_bias: Tensor, num_heads: _int) -> Tuple[Tensor, Tensor, Tensor]: ...
+def _transformer_encoder_layer_fwd(src: Tensor, embed_dim: _int, num_heads: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, use_gelu: _bool, norm_first: _bool, eps: _float, norm_weight_1: Tensor, norm_bias_1: Tensor, norm_weight_2: Tensor, norm_bias_2: Tensor, ffn_weight_1: Tensor, ffn_bias_1: Tensor, ffn_weight_2: Tensor, ffn_bias_2: Tensor, mask: Optional[Tensor] = None, mask_type: Optional[_int] = None) -> Tensor: ...
+def _trilinear(i1: Tensor, i2: Tensor, i3: Tensor, expand1: _size, expand2: _size, expand3: _size, sumdim: _size, unroll_dim: _int = 1) -> Tensor: ...
+def _triton_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None) -> Tensor: ...
+def _triton_scaled_dot_attention(q: Tensor, k: Tensor, v: Tensor, dropout_p: _float = 0.0) -> Tensor: ...
+def _unique(input: Tensor, sorted: _bool = True, return_inverse: _bool = False) -> Tuple[Tensor, Tensor]: ...
+def _unique2(input: Tensor, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ...
+def _unpack_dual(dual: Tensor, level: _int) -> torch.return_types._unpack_dual: ...
+def _unsafe_index(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]]) -> Tensor: ...
+def _unsafe_index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ...
+@overload
+def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int) -> _bool: ...
+@overload
+def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int) -> _bool: ...
+def _use_cudnn_rnn_flatten_weight() -> _bool: ...
+def _validate_compressed_sparse_indices(is_crow: _bool, compressed_idx: Tensor, plain_idx: Tensor, cdim: _int, dim: _int, nnz: _int) -> None: ...
+def _validate_sparse_bsc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ...
+def _validate_sparse_bsr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ...
+def _validate_sparse_compressed_tensor_args(compressed_indices: Tensor, plain_indices: Tensor, values: Tensor, size: _size, layout: _layout) -> None: ...
+def _validate_sparse_coo_tensor_args(indices: Tensor, values: Tensor, size: _size, is_coalesced: Optional[_bool] = None) -> None: ...
+def _validate_sparse_csc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ...
+def _validate_sparse_csr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ...
+def _values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def _weight_int4pack_mm(input: Tensor, mat2: Tensor, qGroupSize: _int, qScaleAndZeros: Tensor) -> Tensor: ...
+def _weight_int8pack_mm(input: Tensor, mat2: Tensor, scales: Tensor) -> Tensor: ...
+def _weight_norm(v: Tensor, g: Tensor, dim: _int = 0) -> Tensor: ...
+def _weight_norm_interface(v: Tensor, g: Tensor, dim: _int = 0) -> Tuple[Tensor, Tensor]: ...
+def abs(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    abs(input, *, out=None) -> Tensor
+    
+    Computes the absolute value of each element in :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = |\text{input}_{i}|
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.abs(torch.tensor([-1, -2, 3]))
+        tensor([1, 2, 3])
+    """
+    ...
+def abs_(input: Tensor) -> Tensor: ...
+def absolute(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    absolute(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.abs`
+    """
+    ...
+def acos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    acos(input, *, out=None) -> Tensor
+    
+    Computes the inverse cosine of each element in :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \cos^{-1}(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.3348, -0.5889,  0.2005, -0.1584])
+        >>> torch.acos(a)
+        tensor([ 1.2294,  2.2004,  1.3690,  1.7298])
+    """
+    ...
+def acos_(input: Tensor) -> Tensor: ...
+def acosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    acosh(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \cosh^{-1}(\text{input}_{i})
+    
+    Note:
+        The domain of the inverse hyperbolic cosine is `[1, inf)` and values outside this range
+        will be mapped to ``NaN``, except for `+ INF` for which the output is mapped to `+ INF`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4).uniform_(1, 2)
+        >>> a
+        tensor([ 1.3192, 1.9915, 1.9674, 1.7151 ])
+        >>> torch.acosh(a)
+        tensor([ 0.7791, 1.3120, 1.2979, 1.1341 ])
+    """
+    ...
+def acosh_(input: Tensor) -> Tensor: ...
+def adaptive_avg_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tensor: ...
+def adaptive_max_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ...
+@overload
+def add(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    add(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{alpha} \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to add to :attr:`input`.
+    
+    Keyword arguments:
+        alpha (Number): the multiplier for :attr:`other`.
+        out (Tensor, optional): the output tensor.
+    
+    Examples::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.0202,  1.0985,  1.3506, -0.6056])
+        >>> torch.add(a, 20)
+        tensor([ 20.0202,  21.0985,  21.3506,  19.3944])
+    
+        >>> b = torch.randn(4)
+        >>> b
+        tensor([-0.9732, -0.3497,  0.6245,  0.4022])
+        >>> c = torch.randn(4, 1)
+        >>> c
+        tensor([[ 0.3743],
+                [-1.7724],
+                [-0.5811],
+                [-0.8017]])
+        >>> torch.add(b, c, alpha=10)
+        tensor([[  2.7695,   3.3930,   4.3672,   4.1450],
+                [-18.6971, -18.0736, -17.0994, -17.3216],
+                [ -6.7845,  -6.1610,  -5.1868,  -5.4090],
+                [ -8.9902,  -8.3667,  -7.3925,  -7.6147]])
+    """
+    ...
+@overload
+def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    add(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{alpha} \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to add to :attr:`input`.
+    
+    Keyword arguments:
+        alpha (Number): the multiplier for :attr:`other`.
+        out (Tensor, optional): the output tensor.
+    
+    Examples::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.0202,  1.0985,  1.3506, -0.6056])
+        >>> torch.add(a, 20)
+        tensor([ 20.0202,  21.0985,  21.3506,  19.3944])
+    
+        >>> b = torch.randn(4)
+        >>> b
+        tensor([-0.9732, -0.3497,  0.6245,  0.4022])
+        >>> c = torch.randn(4, 1)
+        >>> c
+        tensor([[ 0.3743],
+                [-1.7724],
+                [-0.5811],
+                [-0.8017]])
+        >>> torch.add(b, c, alpha=10)
+        tensor([[  2.7695,   3.3930,   4.3672,   4.1450],
+                [-18.6971, -18.0736, -17.0994, -17.3216],
+                [ -6.7845,  -6.1610,  -5.1868,  -5.4090],
+                [ -8.9902,  -8.3667,  -7.3925,  -7.6147]])
+    """
+    ...
+@overload
+def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    add(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{alpha} \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to add to :attr:`input`.
+    
+    Keyword arguments:
+        alpha (Number): the multiplier for :attr:`other`.
+        out (Tensor, optional): the output tensor.
+    
+    Examples::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.0202,  1.0985,  1.3506, -0.6056])
+        >>> torch.add(a, 20)
+        tensor([ 20.0202,  21.0985,  21.3506,  19.3944])
+    
+        >>> b = torch.randn(4)
+        >>> b
+        tensor([-0.9732, -0.3497,  0.6245,  0.4022])
+        >>> c = torch.randn(4, 1)
+        >>> c
+        tensor([[ 0.3743],
+                [-1.7724],
+                [-0.5811],
+                [-0.8017]])
+        >>> torch.add(b, c, alpha=10)
+        tensor([[  2.7695,   3.3930,   4.3672,   4.1450],
+                [-18.6971, -18.0736, -17.0994, -17.3216],
+                [ -6.7845,  -6.1610,  -5.1868,  -5.4090],
+                [ -8.9902,  -8.3667,  -7.3925,  -7.6147]])
+    """
+    ...
+@overload
+def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: 
+    r"""
+    addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices stored
+    in :attr:`batch1` and :attr:`batch2`,
+    with a reduced add step (all matrix multiplications get accumulated
+    along the first dimension).
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the
+    same number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    .. math::
+        out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha`
+    must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.addbmm(M, batch1, batch2)
+        tensor([[  6.6311,   0.0503,   6.9768, -12.0362,  -2.1653],
+                [ -4.8185,  -1.4255,  -6.6760,   8.9453,   2.5743],
+                [ -3.8202,   4.3691,   1.0943,  -1.1109,   5.4730]])
+    """
+    ...
+@overload
+def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices stored
+    in :attr:`batch1` and :attr:`batch2`,
+    with a reduced add step (all matrix multiplications get accumulated
+    along the first dimension).
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the
+    same number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    .. math::
+        out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha`
+    must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.addbmm(M, batch1, batch2)
+        tensor([[  6.6311,   0.0503,   6.9768, -12.0362,  -2.1653],
+                [ -4.8185,  -1.4255,  -6.6760,   8.9453,   2.5743],
+                [ -3.8202,   4.3691,   1.0943,  -1.1109,   5.4730]])
+    """
+    ...
+@overload
+def addbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices stored
+    in :attr:`batch1` and :attr:`batch2`,
+    with a reduced add step (all matrix multiplications get accumulated
+    along the first dimension).
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the
+    same number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    .. math::
+        out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha`
+    must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.addbmm(M, batch1, batch2)
+        tensor([[  6.6311,   0.0503,   6.9768, -12.0362,  -2.1653],
+                [ -4.8185,  -1.4255,  -6.6760,   8.9453,   2.5743],
+                [ -3.8202,   4.3691,   1.0943,  -1.1109,   5.4730]])
+    """
+    ...
+@overload
+def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: 
+    r"""
+    addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices stored
+    in :attr:`batch1` and :attr:`batch2`,
+    with a reduced add step (all matrix multiplications get accumulated
+    along the first dimension).
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the
+    same number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    .. math::
+        out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha`
+    must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.addbmm(M, batch1, batch2)
+        tensor([[  6.6311,   0.0503,   6.9768, -12.0362,  -2.1653],
+                [ -4.8185,  -1.4255,  -6.6760,   8.9453,   2.5743],
+                [ -3.8202,   4.3691,   1.0943,  -1.1109,   5.4730]])
+    """
+    ...
+@overload
+def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices stored
+    in :attr:`batch1` and :attr:`batch2`,
+    with a reduced add step (all matrix multiplications get accumulated
+    along the first dimension).
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the
+    same number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    .. math::
+        out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha`
+    must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.addbmm(M, batch1, batch2)
+        tensor([[  6.6311,   0.0503,   6.9768, -12.0362,  -2.1653],
+                [ -4.8185,  -1.4255,  -6.6760,   8.9453,   2.5743],
+                [ -3.8202,   4.3691,   1.0943,  -1.1109,   5.4730]])
+    """
+    ...
+@overload
+def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: 
+    r"""
+    addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
+    
+    Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`,
+    multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`.
+    
+    .. warning::
+        Integer division with addcdiv is no longer supported, and in a future
+        release addcdiv will perform a true division of tensor1 and tensor2.
+        The historic addcdiv behavior can be implemented as
+        (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype)
+        for integer inputs and as (input + value * tensor1 / tensor2) for float inputs.
+        The future addcdiv behavior is just the latter implementation:
+        (input + value * tensor1 / tensor2), for all dtypes.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i}
+    
+    
+    The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be
+    a real number, otherwise an integer.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        tensor1 (Tensor): the numerator tensor
+        tensor2 (Tensor): the denominator tensor
+    
+    Keyword args:
+        value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}`
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(1, 3)
+        >>> t1 = torch.randn(3, 1)
+        >>> t2 = torch.randn(1, 3)
+        >>> torch.addcdiv(t, t1, t2, value=0.1)
+        tensor([[-0.2312, -3.6496,  0.1312],
+                [-1.0428,  3.4292, -0.1030],
+                [-0.5369, -0.9829,  0.0430]])
+    """
+    ...
+@overload
+def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
+    
+    Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`,
+    multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`.
+    
+    .. warning::
+        Integer division with addcdiv is no longer supported, and in a future
+        release addcdiv will perform a true division of tensor1 and tensor2.
+        The historic addcdiv behavior can be implemented as
+        (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype)
+        for integer inputs and as (input + value * tensor1 / tensor2) for float inputs.
+        The future addcdiv behavior is just the latter implementation:
+        (input + value * tensor1 / tensor2), for all dtypes.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i}
+    
+    
+    The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be
+    a real number, otherwise an integer.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        tensor1 (Tensor): the numerator tensor
+        tensor2 (Tensor): the denominator tensor
+    
+    Keyword args:
+        value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}`
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(1, 3)
+        >>> t1 = torch.randn(3, 1)
+        >>> t2 = torch.randn(1, 3)
+        >>> torch.addcdiv(t, t1, t2, value=0.1)
+        tensor([[-0.2312, -3.6496,  0.1312],
+                [-1.0428,  3.4292, -0.1030],
+                [-0.5369, -0.9829,  0.0430]])
+    """
+    ...
+@overload
+def addcdiv(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
+    
+    Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`,
+    multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`.
+    
+    .. warning::
+        Integer division with addcdiv is no longer supported, and in a future
+        release addcdiv will perform a true division of tensor1 and tensor2.
+        The historic addcdiv behavior can be implemented as
+        (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype)
+        for integer inputs and as (input + value * tensor1 / tensor2) for float inputs.
+        The future addcdiv behavior is just the latter implementation:
+        (input + value * tensor1 / tensor2), for all dtypes.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i}
+    
+    
+    The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be
+    a real number, otherwise an integer.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        tensor1 (Tensor): the numerator tensor
+        tensor2 (Tensor): the denominator tensor
+    
+    Keyword args:
+        value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}`
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(1, 3)
+        >>> t1 = torch.randn(3, 1)
+        >>> t2 = torch.randn(1, 3)
+        >>> torch.addcdiv(t, t1, t2, value=0.1)
+        tensor([[-0.2312, -3.6496,  0.1312],
+                [-1.0428,  3.4292, -0.1030],
+                [-0.5369, -0.9829,  0.0430]])
+    """
+    ...
+@overload
+def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: 
+    r"""
+    addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
+    
+    Performs the element-wise multiplication of :attr:`tensor1`
+    by :attr:`tensor2`, multiplies the result by the scalar :attr:`value`
+    and adds it to :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i
+    
+    The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be
+    a real number, otherwise an integer.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        tensor1 (Tensor): the tensor to be multiplied
+        tensor2 (Tensor): the tensor to be multiplied
+    
+    Keyword args:
+        value (Number, optional): multiplier for :math:`tensor1 .* tensor2`
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(1, 3)
+        >>> t1 = torch.randn(3, 1)
+        >>> t2 = torch.randn(1, 3)
+        >>> torch.addcmul(t, t1, t2, value=0.1)
+        tensor([[-0.8635, -0.6391,  1.6174],
+                [-0.7617, -0.5879,  1.7388],
+                [-0.8353, -0.6249,  1.6511]])
+    """
+    ...
+@overload
+def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
+    
+    Performs the element-wise multiplication of :attr:`tensor1`
+    by :attr:`tensor2`, multiplies the result by the scalar :attr:`value`
+    and adds it to :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i
+    
+    The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be
+    a real number, otherwise an integer.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        tensor1 (Tensor): the tensor to be multiplied
+        tensor2 (Tensor): the tensor to be multiplied
+    
+    Keyword args:
+        value (Number, optional): multiplier for :math:`tensor1 .* tensor2`
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(1, 3)
+        >>> t1 = torch.randn(3, 1)
+        >>> t2 = torch.randn(1, 3)
+        >>> torch.addcmul(t, t1, t2, value=0.1)
+        tensor([[-0.8635, -0.6391,  1.6174],
+                [-0.7617, -0.5879,  1.7388],
+                [-0.8353, -0.6249,  1.6511]])
+    """
+    ...
+@overload
+def addcmul(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
+    
+    Performs the element-wise multiplication of :attr:`tensor1`
+    by :attr:`tensor2`, multiplies the result by the scalar :attr:`value`
+    and adds it to :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i
+    
+    The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be
+    a real number, otherwise an integer.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        tensor1 (Tensor): the tensor to be multiplied
+        tensor2 (Tensor): the tensor to be multiplied
+    
+    Keyword args:
+        value (Number, optional): multiplier for :math:`tensor1 .* tensor2`
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(1, 3)
+        >>> t1 = torch.randn(3, 1)
+        >>> t2 = torch.randn(1, 3)
+        >>> torch.addcmul(t, t1, t2, value=0.1)
+        tensor([[-0.8635, -0.6391,  1.6174],
+                [-0.7617, -0.5879,  1.7388],
+                [-0.8353, -0.6249,  1.6511]])
+    """
+    ...
+@overload
+def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: 
+    r"""
+    addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+    
+    If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
+    :math:`(m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on the matrix-matrix product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1} \mathbin{@} \text{mat2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`. If
+    :attr:`input` is sparse the result will have the same layout and if :attr:`out`
+    is provided it must have the same layout as :attr:`input`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        mat1 (Tensor): the first matrix to be matrix multiplied
+        mat2 (Tensor): the second matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2, 3)
+        >>> mat1 = torch.randn(2, 3)
+        >>> mat2 = torch.randn(3, 3)
+        >>> torch.addmm(M, mat1, mat2)
+        tensor([[-4.8716,  1.4671, -1.3746],
+                [ 0.7573, -3.9555, -2.8681]])
+    """
+    ...
+@overload
+def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+    
+    If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
+    :math:`(m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on the matrix-matrix product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1} \mathbin{@} \text{mat2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`. If
+    :attr:`input` is sparse the result will have the same layout and if :attr:`out`
+    is provided it must have the same layout as :attr:`input`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        mat1 (Tensor): the first matrix to be matrix multiplied
+        mat2 (Tensor): the second matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2, 3)
+        >>> mat1 = torch.randn(2, 3)
+        >>> mat2 = torch.randn(3, 3)
+        >>> torch.addmm(M, mat1, mat2)
+        tensor([[-4.8716,  1.4671, -1.3746],
+                [ 0.7573, -3.9555, -2.8681]])
+    """
+    ...
+@overload
+def addmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+    
+    If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
+    :math:`(m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on the matrix-matrix product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1} \mathbin{@} \text{mat2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`. If
+    :attr:`input` is sparse the result will have the same layout and if :attr:`out`
+    is provided it must have the same layout as :attr:`input`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        mat1 (Tensor): the first matrix to be matrix multiplied
+        mat2 (Tensor): the second matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2, 3)
+        >>> mat1 = torch.randn(2, 3)
+        >>> mat2 = torch.randn(3, 3)
+        >>> torch.addmm(M, mat1, mat2)
+        tensor([[-4.8716,  1.4671, -1.3746],
+                [ 0.7573, -3.9555, -2.8681]])
+    """
+    ...
+@overload
+def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: 
+    r"""
+    addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+    
+    If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
+    :math:`(m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on the matrix-matrix product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1} \mathbin{@} \text{mat2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`. If
+    :attr:`input` is sparse the result will have the same layout and if :attr:`out`
+    is provided it must have the same layout as :attr:`input`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        mat1 (Tensor): the first matrix to be matrix multiplied
+        mat2 (Tensor): the second matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2, 3)
+        >>> mat1 = torch.randn(2, 3)
+        >>> mat2 = torch.randn(3, 3)
+        >>> torch.addmm(M, mat1, mat2)
+        tensor([[-4.8716,  1.4671, -1.3746],
+                [ 0.7573, -3.9555, -2.8681]])
+    """
+    ...
+@overload
+def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`.
+    The matrix :attr:`input` is added to the final result.
+    
+    If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
+    :math:`(m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a :math:`(n \times p)` tensor
+    and :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on the matrix-matrix product between
+    :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1} \mathbin{@} \text{mat2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`. If
+    :attr:`input` is sparse the result will have the same layout and if :attr:`out`
+    is provided it must have the same layout as :attr:`input`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): matrix to be added
+        mat1 (Tensor): the first matrix to be matrix multiplied
+        mat2 (Tensor): the second matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2, 3)
+        >>> mat1 = torch.randn(2, 3)
+        >>> mat2 = torch.randn(3, 3)
+        >>> torch.addmm(M, mat1, mat2)
+        tensor([[-4.8716,  1.4671, -1.3746],
+                [ 0.7573, -3.9555, -2.8681]])
+    """
+    ...
+@overload
+def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: 
+    r"""
+    addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix-vector product of the matrix :attr:`mat` and
+    the vector :attr:`vec`.
+    The vector :attr:`input` is added to the final result.
+    
+    If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
+    size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a 1-D tensor of size `n` and
+    :attr:`out` will be a 1-D tensor of size `n`.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    Args:
+        input (Tensor): vector to be added
+        mat (Tensor): matrix to be matrix multiplied
+        vec (Tensor): vector to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2)
+        >>> mat = torch.randn(2, 3)
+        >>> vec = torch.randn(3)
+        >>> torch.addmv(M, mat, vec)
+        tensor([-0.3768, -5.5565])
+    """
+    ...
+@overload
+def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix-vector product of the matrix :attr:`mat` and
+    the vector :attr:`vec`.
+    The vector :attr:`input` is added to the final result.
+    
+    If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
+    size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a 1-D tensor of size `n` and
+    :attr:`out` will be a 1-D tensor of size `n`.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    Args:
+        input (Tensor): vector to be added
+        mat (Tensor): matrix to be matrix multiplied
+        vec (Tensor): vector to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2)
+        >>> mat = torch.randn(2, 3)
+        >>> vec = torch.randn(3)
+        >>> torch.addmv(M, mat, vec)
+        tensor([-0.3768, -5.5565])
+    """
+    ...
+@overload
+def addmv(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix-vector product of the matrix :attr:`mat` and
+    the vector :attr:`vec`.
+    The vector :attr:`input` is added to the final result.
+    
+    If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
+    size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a 1-D tensor of size `n` and
+    :attr:`out` will be a 1-D tensor of size `n`.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    Args:
+        input (Tensor): vector to be added
+        mat (Tensor): matrix to be matrix multiplied
+        vec (Tensor): vector to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2)
+        >>> mat = torch.randn(2, 3)
+        >>> vec = torch.randn(3)
+        >>> torch.addmv(M, mat, vec)
+        tensor([-0.3768, -5.5565])
+    """
+    ...
+@overload
+def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: 
+    r"""
+    addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix-vector product of the matrix :attr:`mat` and
+    the vector :attr:`vec`.
+    The vector :attr:`input` is added to the final result.
+    
+    If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
+    size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a 1-D tensor of size `n` and
+    :attr:`out` will be a 1-D tensor of size `n`.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    Args:
+        input (Tensor): vector to be added
+        mat (Tensor): matrix to be matrix multiplied
+        vec (Tensor): vector to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2)
+        >>> mat = torch.randn(2, 3)
+        >>> vec = torch.randn(3)
+        >>> torch.addmv(M, mat, vec)
+        tensor([-0.3768, -5.5565])
+    """
+    ...
+@overload
+def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a matrix-vector product of the matrix :attr:`mat` and
+    the vector :attr:`vec`.
+    The vector :attr:`input` is added to the final result.
+    
+    If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
+    size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a 1-D tensor of size `n` and
+    :attr:`out` will be a 1-D tensor of size `n`.
+    
+    :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between
+    :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    Args:
+        input (Tensor): vector to be added
+        mat (Tensor): matrix to be matrix multiplied
+        vec (Tensor): vector to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(2)
+        >>> mat = torch.randn(2, 3)
+        >>> vec = torch.randn(3)
+        >>> torch.addmv(M, mat, vec)
+        tensor([-0.3768, -5.5565])
+    """
+    ...
+@overload
+def addmv_(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: ...
+@overload
+def addmv_(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: ...
+@overload
+def addmv_(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: ...
+@overload
+def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor) -> Tensor: 
+    r"""
+    addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2`
+    and adds it to the matrix :attr:`input`.
+    
+    Optional values :attr:`alpha` and :attr:`beta` are scaling factors on the
+    outer product between :attr:`vec1` and :attr:`vec2` and the added matrix
+    :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector
+    of size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a matrix of size
+    :math:`(n \times m)` and :attr:`out` will be a matrix of size
+    :math:`(n \times m)`.
+    
+    Args:
+        input (Tensor): matrix to be added
+        vec1 (Tensor): the first vector of the outer product
+        vec2 (Tensor): the second vector of the outer product
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> vec1 = torch.arange(1., 4.)
+        >>> vec2 = torch.arange(1., 3.)
+        >>> M = torch.zeros(3, 2)
+        >>> torch.addr(M, vec1, vec2)
+        tensor([[ 1.,  2.],
+                [ 2.,  4.],
+                [ 3.,  6.]])
+    """
+    ...
+@overload
+def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2`
+    and adds it to the matrix :attr:`input`.
+    
+    Optional values :attr:`alpha` and :attr:`beta` are scaling factors on the
+    outer product between :attr:`vec1` and :attr:`vec2` and the added matrix
+    :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector
+    of size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a matrix of size
+    :math:`(n \times m)` and :attr:`out` will be a matrix of size
+    :math:`(n \times m)`.
+    
+    Args:
+        input (Tensor): matrix to be added
+        vec1 (Tensor): the first vector of the outer product
+        vec2 (Tensor): the second vector of the outer product
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> vec1 = torch.arange(1., 4.)
+        >>> vec2 = torch.arange(1., 3.)
+        >>> M = torch.zeros(3, 2)
+        >>> torch.addr(M, vec1, vec2)
+        tensor([[ 1.,  2.],
+                [ 2.,  4.],
+                [ 3.,  6.]])
+    """
+    ...
+@overload
+def addr(input: Tensor, vec1: Tensor, vec2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2`
+    and adds it to the matrix :attr:`input`.
+    
+    Optional values :attr:`alpha` and :attr:`beta` are scaling factors on the
+    outer product between :attr:`vec1` and :attr:`vec2` and the added matrix
+    :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector
+    of size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a matrix of size
+    :math:`(n \times m)` and :attr:`out` will be a matrix of size
+    :math:`(n \times m)`.
+    
+    Args:
+        input (Tensor): matrix to be added
+        vec1 (Tensor): the first vector of the outer product
+        vec2 (Tensor): the second vector of the outer product
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> vec1 = torch.arange(1., 4.)
+        >>> vec2 = torch.arange(1., 3.)
+        >>> M = torch.zeros(3, 2)
+        >>> torch.addr(M, vec1, vec2)
+        tensor([[ 1.,  2.],
+                [ 2.,  4.],
+                [ 3.,  6.]])
+    """
+    ...
+@overload
+def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor) -> Tensor: 
+    r"""
+    addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2`
+    and adds it to the matrix :attr:`input`.
+    
+    Optional values :attr:`alpha` and :attr:`beta` are scaling factors on the
+    outer product between :attr:`vec1` and :attr:`vec2` and the added matrix
+    :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector
+    of size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a matrix of size
+    :math:`(n \times m)` and :attr:`out` will be a matrix of size
+    :math:`(n \times m)`.
+    
+    Args:
+        input (Tensor): matrix to be added
+        vec1 (Tensor): the first vector of the outer product
+        vec2 (Tensor): the second vector of the outer product
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> vec1 = torch.arange(1., 4.)
+        >>> vec2 = torch.arange(1., 3.)
+        >>> M = torch.zeros(3, 2)
+        >>> torch.addr(M, vec1, vec2)
+        tensor([[ 1.,  2.],
+                [ 2.,  4.],
+                [ 3.,  6.]])
+    """
+    ...
+@overload
+def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2`
+    and adds it to the matrix :attr:`input`.
+    
+    Optional values :attr:`alpha` and :attr:`beta` are scaling factors on the
+    outer product between :attr:`vec1` and :attr:`vec2` and the added matrix
+    :attr:`input` respectively.
+    
+    .. math::
+        \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2})
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector
+    of size `m`, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a matrix of size
+    :math:`(n \times m)` and :attr:`out` will be a matrix of size
+    :math:`(n \times m)`.
+    
+    Args:
+        input (Tensor): matrix to be added
+        vec1 (Tensor): the first vector of the outer product
+        vec2 (Tensor): the second vector of the outer product
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> vec1 = torch.arange(1., 4.)
+        >>> vec2 = torch.arange(1., 3.)
+        >>> M = torch.zeros(3, 2)
+        >>> torch.addr(M, vec1, vec2)
+        tensor([[ 1.,  2.],
+                [ 2.,  4.],
+                [ 3.,  6.]])
+    """
+    ...
+def adjoint(input: Tensor) -> Tensor: 
+    r"""
+    adjoint(input) -> Tensor
+    Returns a view of the tensor conjugated and with the last two dimensions transposed.
+    
+    ``x.adjoint()`` is equivalent to ``x.transpose(-2, -1).conj()`` for complex tensors and
+    to ``x.transpose(-2, -1)`` for real tensors.
+    
+    Example::
+        >>> x = torch.arange(4, dtype=torch.float)
+        >>> A = torch.complex(x, x).reshape(2, 2)
+        >>> A
+        tensor([[0.+0.j, 1.+1.j],
+                [2.+2.j, 3.+3.j]])
+        >>> A.adjoint()
+        tensor([[0.-0.j, 2.-2.j],
+                [1.-1.j, 3.-3.j]])
+        >>> (A.adjoint() == A.mH).all()
+        tensor(True)
+    """
+    ...
+def affine_grid_generator(theta: Tensor, size: Sequence[Union[_int, SymInt]], align_corners: _bool) -> Tensor: ...
+def alias_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.alias`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def all(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    all(input) -> Tensor
+    
+    Tests if all elements in :attr:`input` evaluate to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.all(a)
+        tensor(False, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.all(a)
+        tensor(False)
+    
+    .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if all elements in the row evaluate to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(4, 2).bool()
+        >>> a
+        tensor([[True, True],
+                [True, False],
+                [True, True],
+                [True, True]], dtype=torch.bool)
+        >>> torch.all(a, dim=1)
+        tensor([ True, False,  True,  True], dtype=torch.bool)
+        >>> torch.all(a, dim=0)
+        tensor([ True, False], dtype=torch.bool)
+    """
+    ...
+@overload
+def all(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    all(input) -> Tensor
+    
+    Tests if all elements in :attr:`input` evaluate to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.all(a)
+        tensor(False, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.all(a)
+        tensor(False)
+    
+    .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if all elements in the row evaluate to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(4, 2).bool()
+        >>> a
+        tensor([[True, True],
+                [True, False],
+                [True, True],
+                [True, True]], dtype=torch.bool)
+        >>> torch.all(a, dim=1)
+        tensor([ True, False,  True,  True], dtype=torch.bool)
+        >>> torch.all(a, dim=0)
+        tensor([ True, False], dtype=torch.bool)
+    """
+    ...
+@overload
+def all(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    all(input) -> Tensor
+    
+    Tests if all elements in :attr:`input` evaluate to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.all(a)
+        tensor(False, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.all(a)
+        tensor(False)
+    
+    .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if all elements in the row evaluate to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(4, 2).bool()
+        >>> a
+        tensor([[True, True],
+                [True, False],
+                [True, True],
+                [True, True]], dtype=torch.bool)
+        >>> torch.all(a, dim=1)
+        tensor([ True, False,  True,  True], dtype=torch.bool)
+        >>> torch.all(a, dim=0)
+        tensor([ True, False], dtype=torch.bool)
+    """
+    ...
+@overload
+def all(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    all(input) -> Tensor
+    
+    Tests if all elements in :attr:`input` evaluate to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.all(a)
+        tensor(False, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.all(a)
+        tensor(False)
+    
+    .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if all elements in the row evaluate to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(4, 2).bool()
+        >>> a
+        tensor([[True, True],
+                [True, False],
+                [True, True],
+                [True, True]], dtype=torch.bool)
+        >>> torch.all(a, dim=1)
+        tensor([ True, False,  True,  True], dtype=torch.bool)
+        >>> torch.all(a, dim=0)
+        tensor([ True, False], dtype=torch.bool)
+    """
+    ...
+def allclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> _bool: 
+    r"""
+    allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool
+    
+    This function checks if :attr:`input` and :attr:`other` satisfy the condition:
+    
+    .. math::
+        \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert
+    
+    elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to
+    `numpy.allclose <https://numpy.org/doc/stable/reference/generated/numpy.allclose.html>`_.
+    
+    Args:
+        input (Tensor): first tensor to compare
+        other (Tensor): second tensor to compare
+        atol (float, optional): absolute tolerance. Default: 1e-08
+        rtol (float, optional): relative tolerance. Default: 1e-05
+        equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. Default: ``False``
+    
+    Example::
+    
+        >>> torch.allclose(torch.tensor([10000., 1e-07]), torch.tensor([10000.1, 1e-08]))
+        False
+        >>> torch.allclose(torch.tensor([10000., 1e-08]), torch.tensor([10000.1, 1e-09]))
+        True
+        >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')]))
+        False
+        >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')]), equal_nan=True)
+        True
+    """
+    ...
+def alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def amax(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    amax(input, dim, keepdim=False, *, out=None) -> Tensor
+    
+    Returns the maximum value of each slice of the :attr:`input` tensor in the given
+    dimension(s) :attr:`dim`.
+    
+    .. note::
+        The difference between ``max``/``min`` and ``amax``/``amin`` is:
+            - ``amax``/``amin`` supports reducing on multiple dimensions,
+            - ``amax``/``amin`` does not return indices,
+            - ``amax``/``amin`` evenly distributes gradient between equal values,
+              while ``max(dim)``/``min(dim)`` propagates gradient only to a single
+              index in the source tensor.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+      out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.8177,  1.4878, -0.2491,  0.9130],
+                [-0.7158,  1.1775,  2.0992,  0.4817],
+                [-0.0053,  0.0164, -1.3738, -0.0507],
+                [ 1.9700,  1.1106, -1.0318, -1.0816]])
+        >>> torch.amax(a, 1)
+        tensor([1.4878, 2.0992, 0.0164, 1.9700])
+    """
+    ...
+def amin(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    amin(input, dim, keepdim=False, *, out=None) -> Tensor
+    
+    Returns the minimum value of each slice of the :attr:`input` tensor in the given
+    dimension(s) :attr:`dim`.
+    
+    .. note::
+        The difference between ``max``/``min`` and ``amax``/``amin`` is:
+            - ``amax``/``amin`` supports reducing on multiple dimensions,
+            - ``amax``/``amin`` does not return indices,
+            - ``amax``/``amin`` evenly distributes gradient between equal values,
+              while ``max(dim)``/``min(dim)`` propagates gradient only to a single
+              index in the source tensor.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+      out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.6451, -0.4866,  0.2987, -1.3312],
+                [-0.5744,  1.2980,  1.8397, -0.2713],
+                [ 0.9128,  0.9214, -1.7268, -0.2995],
+                [ 0.9023,  0.4853,  0.9075, -1.6165]])
+        >>> torch.amin(a, 1)
+        tensor([-1.3312, -0.5744, -1.7268, -1.6165])
+    """
+    ...
+def aminmax(input: Tensor, *, dim: Optional[_int] = None, keepdim: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.aminmax: 
+    r"""
+    aminmax(input, *, dim=None, keepdim=False, out=None) -> (Tensor min, Tensor max)
+    
+    Computes the minimum and maximum values of the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor):
+            The input tensor
+    
+    Keyword Args:
+        dim (Optional[int]):
+            The dimension along which to compute the values. If `None`,
+            computes the values over the entire :attr:`input` tensor.
+            Default is `None`.
+        keepdim (bool):
+            If `True`, the reduced dimensions will be kept in the output
+            tensor as dimensions with size 1 for broadcasting, otherwise
+            they will be removed, as if calling (:func:`torch.squeeze`).
+            Default is `False`.
+        out (Optional[Tuple[Tensor, Tensor]]):
+            Optional tensors on which to write the result. Must have the same
+            shape and dtype as the expected output.
+            Default is `None`.
+    
+    Returns:
+        A named tuple `(min, max)` containing the minimum and maximum values.
+    
+    Raises:
+        RuntimeError
+            If any of the dimensions to compute the values over has size 0.
+    
+    .. note::
+        NaN values are propagated to the output if at least one value is NaN.
+    
+    .. seealso::
+        :func:`torch.amin` computes just the minimum value
+        :func:`torch.amax` computes just the maximum value
+    
+    Example::
+    
+        >>> torch.aminmax(torch.tensor([1, -3, 5]))
+        torch.return_types.aminmax(
+        min=tensor(-3),
+        max=tensor(5))
+    
+        >>> # aminmax propagates NaNs
+        >>> torch.aminmax(torch.tensor([1, -3, 5, torch.nan]))
+        torch.return_types.aminmax(
+        min=tensor(nan),
+        max=tensor(nan))
+    
+        >>> t = torch.arange(10).view(2, 5)
+        >>> t
+        tensor([[0, 1, 2, 3, 4],
+                [5, 6, 7, 8, 9]])
+        >>> t.aminmax(dim=0, keepdim=True)
+        torch.return_types.aminmax(
+        min=tensor([[0, 1, 2, 3, 4]]),
+        max=tensor([[5, 6, 7, 8, 9]]))
+    """
+    ...
+def angle(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    angle(input, *, out=None) -> Tensor
+    
+    Computes the element-wise angle (in radians) of the given :attr:`input` tensor.
+    
+    .. math::
+        \text{out}_{i} = angle(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    .. note:: Starting in PyTorch 1.8, angle returns pi for negative real numbers,
+              zero for non-negative real numbers, and propagates NaNs. Previously
+              the function would return zero for all real numbers and not propagate
+              floating-point NaNs.
+    
+    Example::
+    
+        >>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159
+        tensor([ 135.,  135.,  -45.])
+    """
+    ...
+@overload
+def any(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    any(input) -> Tensor
+    
+    Tests if any element in :attr:`input` evaluates to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.any(a)
+        tensor(True, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.any(a)
+        tensor(True)
+    
+    .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if any element in the row evaluates to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2) < 0
+        >>> a
+        tensor([[ True,  True],
+                [False,  True],
+                [ True,  True],
+                [False, False]])
+        >>> torch.any(a, 1)
+        tensor([ True,  True,  True, False])
+        >>> torch.any(a, 0)
+        tensor([True, True])
+    """
+    ...
+@overload
+def any(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    any(input) -> Tensor
+    
+    Tests if any element in :attr:`input` evaluates to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.any(a)
+        tensor(True, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.any(a)
+        tensor(True)
+    
+    .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if any element in the row evaluates to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2) < 0
+        >>> a
+        tensor([[ True,  True],
+                [False,  True],
+                [ True,  True],
+                [False, False]])
+        >>> torch.any(a, 1)
+        tensor([ True,  True,  True, False])
+        >>> torch.any(a, 0)
+        tensor([True, True])
+    """
+    ...
+@overload
+def any(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    any(input) -> Tensor
+    
+    Tests if any element in :attr:`input` evaluates to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.any(a)
+        tensor(True, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.any(a)
+        tensor(True)
+    
+    .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if any element in the row evaluates to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2) < 0
+        >>> a
+        tensor([[ True,  True],
+                [False,  True],
+                [ True,  True],
+                [False, False]])
+        >>> torch.any(a, 1)
+        tensor([ True,  True,  True, False])
+        >>> torch.any(a, 0)
+        tensor([True, True])
+    """
+    ...
+@overload
+def any(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    any(input) -> Tensor
+    
+    Tests if any element in :attr:`input` evaluates to `True`.
+    
+    .. note:: This function matches the behaviour of NumPy in returning
+              output of dtype `bool` for all supported dtypes except `uint8`.
+              For `uint8` the dtype of output is `uint8` itself.
+    
+    Example::
+    
+        >>> a = torch.rand(1, 2).bool()
+        >>> a
+        tensor([[False, True]], dtype=torch.bool)
+        >>> torch.any(a)
+        tensor(True, dtype=torch.bool)
+        >>> a = torch.arange(0, 3)
+        >>> a
+        tensor([0, 1, 2])
+        >>> torch.any(a)
+        tensor(True)
+    
+    .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor
+       :noindex:
+    
+    For each row of :attr:`input` in the given dimension :attr:`dim`,
+    returns `True` if any element in the row evaluates to `True` and `False` otherwise.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2) < 0
+        >>> a
+        tensor([[ True,  True],
+                [False,  True],
+                [ True,  True],
+                [False, False]])
+        >>> torch.any(a, 1)
+        tensor([ True,  True,  True, False])
+        >>> torch.any(a, 0)
+        tensor([True, True])
+    """
+    ...
+@overload
+def arange(start: Number, end: Number, step: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil`
+    with values from the interval ``[start, end)`` taken with common difference
+    :attr:`step` beginning from `start`.
+    
+    Note that non-integer :attr:`step` is subject to floating point rounding errors when
+    comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end`
+    in such cases.
+    
+    .. math::
+        \text{out}_{{i+1}} = \text{out}_{i} + \text{step}
+    
+    Args:
+        start (Number): the starting value for the set of points. Default: ``0``.
+        end (Number): the ending value for the set of points
+        step (Number): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.arange(5)
+        tensor([ 0,  1,  2,  3,  4])
+        >>> torch.arange(1, 4)
+        tensor([ 1,  2,  3])
+        >>> torch.arange(1, 2.5, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000])
+    """
+    ...
+@overload
+def arange(start: Number, end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil`
+    with values from the interval ``[start, end)`` taken with common difference
+    :attr:`step` beginning from `start`.
+    
+    Note that non-integer :attr:`step` is subject to floating point rounding errors when
+    comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end`
+    in such cases.
+    
+    .. math::
+        \text{out}_{{i+1}} = \text{out}_{i} + \text{step}
+    
+    Args:
+        start (Number): the starting value for the set of points. Default: ``0``.
+        end (Number): the ending value for the set of points
+        step (Number): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.arange(5)
+        tensor([ 0,  1,  2,  3,  4])
+        >>> torch.arange(1, 4)
+        tensor([ 1,  2,  3])
+        >>> torch.arange(1, 2.5, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000])
+    """
+    ...
+@overload
+def arange(end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil`
+    with values from the interval ``[start, end)`` taken with common difference
+    :attr:`step` beginning from `start`.
+    
+    Note that non-integer :attr:`step` is subject to floating point rounding errors when
+    comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end`
+    in such cases.
+    
+    .. math::
+        \text{out}_{{i+1}} = \text{out}_{i} + \text{step}
+    
+    Args:
+        start (Number): the starting value for the set of points. Default: ``0``.
+        end (Number): the ending value for the set of points
+        step (Number): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.arange(5)
+        tensor([ 0,  1,  2,  3,  4])
+        >>> torch.arange(1, 4)
+        tensor([ 1,  2,  3])
+        >>> torch.arange(1, 2.5, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000])
+    """
+    ...
+@overload
+def arange(end: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil`
+    with values from the interval ``[start, end)`` taken with common difference
+    :attr:`step` beginning from `start`.
+    
+    Note that non-integer :attr:`step` is subject to floating point rounding errors when
+    comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end`
+    in such cases.
+    
+    .. math::
+        \text{out}_{{i+1}} = \text{out}_{i} + \text{step}
+    
+    Args:
+        start (Number): the starting value for the set of points. Default: ``0``.
+        end (Number): the ending value for the set of points
+        step (Number): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.arange(5)
+        tensor([ 0,  1,  2,  3,  4])
+        >>> torch.arange(1, 4)
+        tensor([ 1,  2,  3])
+        >>> torch.arange(1, 2.5, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000])
+    """
+    ...
+@overload
+def arange(start: Union[Number, _complex], end: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil`
+    with values from the interval ``[start, end)`` taken with common difference
+    :attr:`step` beginning from `start`.
+    
+    Note that non-integer :attr:`step` is subject to floating point rounding errors when
+    comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end`
+    in such cases.
+    
+    .. math::
+        \text{out}_{{i+1}} = \text{out}_{i} + \text{step}
+    
+    Args:
+        start (Number): the starting value for the set of points. Default: ``0``.
+        end (Number): the ending value for the set of points
+        step (Number): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.arange(5)
+        tensor([ 0,  1,  2,  3,  4])
+        >>> torch.arange(1, 4)
+        tensor([ 1,  2,  3])
+        >>> torch.arange(1, 2.5, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000])
+    """
+    ...
+@overload
+def arange(start: Union[Number, _complex], end: Union[Number, _complex], step: Union[Number, _complex] = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil`
+    with values from the interval ``[start, end)`` taken with common difference
+    :attr:`step` beginning from `start`.
+    
+    Note that non-integer :attr:`step` is subject to floating point rounding errors when
+    comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end`
+    in such cases.
+    
+    .. math::
+        \text{out}_{{i+1}} = \text{out}_{i} + \text{step}
+    
+    Args:
+        start (Number): the starting value for the set of points. Default: ``0``.
+        end (Number): the ending value for the set of points
+        step (Number): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.arange(5)
+        tensor([ 0,  1,  2,  3,  4])
+        >>> torch.arange(1, 4)
+        tensor([ 1,  2,  3])
+        >>> torch.arange(1, 2.5, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000])
+    """
+    ...
+def arccos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arccos(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.acos`.
+    """
+    ...
+def arccos_(input: Tensor) -> Tensor: ...
+def arccosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arccosh(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.acosh`.
+    """
+    ...
+def arccosh_(input: Tensor) -> Tensor: ...
+def arcsin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arcsin(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.asin`.
+    """
+    ...
+def arcsin_(input: Tensor) -> Tensor: ...
+def arcsinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arcsinh(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.asinh`.
+    """
+    ...
+def arcsinh_(input: Tensor) -> Tensor: ...
+def arctan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arctan(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.atan`.
+    """
+    ...
+def arctan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arctan2(input, other, *, out=None) -> Tensor
+    Alias for :func:`torch.atan2`.
+    """
+    ...
+def arctan_(input: Tensor) -> Tensor: ...
+def arctanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    arctanh(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.atanh`.
+    """
+    ...
+def arctanh_(input: Tensor) -> Tensor: ...
+def argmax(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    argmax(input) -> LongTensor
+    
+    Returns the indices of the maximum value of all elements in the :attr:`input` tensor.
+    
+    This is the second value returned by :meth:`torch.max`. See its
+    documentation for the exact semantics of this method.
+    
+    .. note:: If there are multiple maximal values then the indices of the first maximal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 1.3398,  0.2663, -0.2686,  0.2450],
+                [-0.7401, -0.8805, -0.3402, -1.1936],
+                [ 0.4907, -1.3948, -1.0691, -0.3132],
+                [-1.6092,  0.5419, -0.2993,  0.3195]])
+        >>> torch.argmax(a)
+        tensor(0)
+    
+    .. function:: argmax(input, dim, keepdim=False) -> LongTensor
+       :noindex:
+    
+    Returns the indices of the maximum values of a tensor across a dimension.
+    
+    This is the second value returned by :meth:`torch.max`. See its
+    documentation for the exact semantics of this method.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce. If ``None``, the argmax of the flattened input is returned.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 1.3398,  0.2663, -0.2686,  0.2450],
+                [-0.7401, -0.8805, -0.3402, -1.1936],
+                [ 0.4907, -1.3948, -1.0691, -0.3132],
+                [-1.6092,  0.5419, -0.2993,  0.3195]])
+        >>> torch.argmax(a, dim=1)
+        tensor([ 0,  2,  0,  1])
+    """
+    ...
+def argmin(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    argmin(input, dim=None, keepdim=False) -> LongTensor
+    
+    Returns the indices of the minimum value(s) of the flattened tensor or along a dimension.
+    
+    This is the second value returned by :meth:`torch.min`. See its
+    documentation for the exact semantics of this method.
+    
+    .. note:: If there are multiple minimal values then the indices of the first minimal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce. If ``None``, the argmin of the flattened input is returned.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.1139,  0.2254, -0.1381,  0.3687],
+                [ 1.0100, -1.1975, -0.0102, -0.4732],
+                [-0.9240,  0.1207, -0.7506, -1.0213],
+                [ 1.7809, -1.2960,  0.9384,  0.1438]])
+        >>> torch.argmin(a)
+        tensor(13)
+        >>> torch.argmin(a, dim=1)
+        tensor([ 2,  1,  3,  1])
+        >>> torch.argmin(a, dim=1, keepdim=True)
+        tensor([[2],
+                [1],
+                [3],
+                [1]])
+    """
+    ...
+@overload
+def argsort(input: Tensor, *, stable: _bool, dim: _int = -1, descending: _bool = False) -> Tensor: 
+    r"""
+    argsort(input, dim=-1, descending=False, stable=False) -> Tensor
+    
+    Returns the indices that sort a tensor along a given dimension in ascending
+    order by value.
+    
+    This is the second value returned by :meth:`torch.sort`.  See its documentation
+    for the exact semantics of this method.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements. If ``False``, the relative order of values
+    which compare equal is not guaranteed. ``True`` is slower.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+        stable (bool, optional): controls the relative order of equivalent elements
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.0785,  1.5267, -0.8521,  0.4065],
+                [ 0.1598,  0.0788, -0.0745, -1.2700],
+                [ 1.2208,  1.0722, -0.7064,  1.2564],
+                [ 0.0669, -0.2318, -0.8229, -0.9280]])
+    
+    
+        >>> torch.argsort(a, dim=1)
+        tensor([[2, 0, 3, 1],
+                [3, 2, 1, 0],
+                [2, 1, 0, 3],
+                [3, 2, 1, 0]])
+    """
+    ...
+@overload
+def argsort(input: Tensor, dim: _int = -1, descending: _bool = False) -> Tensor: 
+    r"""
+    argsort(input, dim=-1, descending=False, stable=False) -> Tensor
+    
+    Returns the indices that sort a tensor along a given dimension in ascending
+    order by value.
+    
+    This is the second value returned by :meth:`torch.sort`.  See its documentation
+    for the exact semantics of this method.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements. If ``False``, the relative order of values
+    which compare equal is not guaranteed. ``True`` is slower.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+        stable (bool, optional): controls the relative order of equivalent elements
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.0785,  1.5267, -0.8521,  0.4065],
+                [ 0.1598,  0.0788, -0.0745, -1.2700],
+                [ 1.2208,  1.0722, -0.7064,  1.2564],
+                [ 0.0669, -0.2318, -0.8229, -0.9280]])
+    
+    
+        >>> torch.argsort(a, dim=1)
+        tensor([[2, 0, 3, 1],
+                [3, 2, 1, 0],
+                [2, 1, 0, 3],
+                [3, 2, 1, 0]])
+    """
+    ...
+@overload
+def argsort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False) -> Tensor: 
+    r"""
+    argsort(input, dim=-1, descending=False, stable=False) -> Tensor
+    
+    Returns the indices that sort a tensor along a given dimension in ascending
+    order by value.
+    
+    This is the second value returned by :meth:`torch.sort`.  See its documentation
+    for the exact semantics of this method.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements. If ``False``, the relative order of values
+    which compare equal is not guaranteed. ``True`` is slower.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+        stable (bool, optional): controls the relative order of equivalent elements
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.0785,  1.5267, -0.8521,  0.4065],
+                [ 0.1598,  0.0788, -0.0745, -1.2700],
+                [ 1.2208,  1.0722, -0.7064,  1.2564],
+                [ 0.0669, -0.2318, -0.8229, -0.9280]])
+    
+    
+        >>> torch.argsort(a, dim=1)
+        tensor([[2, 0, 3, 1],
+                [3, 2, 1, 0],
+                [2, 1, 0, 3],
+                [3, 2, 1, 0]])
+    """
+    ...
+def argwhere(input: Tensor) -> Tensor: 
+    r"""
+    argwhere(input) -> Tensor
+    
+    Returns a tensor containing the indices of all non-zero elements of
+    :attr:`input`.  Each row in the result contains the indices of a non-zero
+    element in :attr:`input`. The result is sorted lexicographically, with
+    the last index changing the fastest (C-style).
+    
+    If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor
+    :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of
+    non-zero elements in the :attr:`input` tensor.
+    
+    .. note::
+        This function is similar to NumPy's `argwhere`.
+    
+        When :attr:`input` is on CUDA, this function causes host-device synchronization.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> t = torch.tensor([1, 0, 1])
+        >>> torch.argwhere(t)
+        tensor([[0],
+                [2]])
+        >>> t = torch.tensor([[1, 0, 1], [0, 1, 1]])
+        >>> torch.argwhere(t)
+        tensor([[0, 0],
+                [0, 2],
+                [1, 1],
+                [1, 2]])
+    """
+    ...
+def as_strided(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: 
+    r"""
+    as_strided(input, size, stride, storage_offset=None) -> Tensor
+    
+    Create a view of an existing `torch.Tensor` :attr:`input` with specified
+    :attr:`size`, :attr:`stride` and :attr:`storage_offset`.
+    
+    .. warning::
+        Prefer using other view functions, like :meth:`torch.Tensor.expand`,
+        to setting a view's strides manually with `as_strided`, as this
+        function's behavior depends on the implementation of a tensor's storage.
+        The constructed view of the storage must only refer to elements within
+        the storage or a runtime error will be thrown, and if the view is
+        "overlapped" (with multiple indices referring to the same element in
+        memory) its behavior is undefined.
+    
+    Args:
+        input (Tensor): the input tensor.
+        size (tuple or ints): the shape of the output tensor
+        stride (tuple or ints): the stride of the output tensor
+        storage_offset (int, optional): the offset in the underlying storage of the output tensor.
+            If ``None``, the storage_offset of the output tensor will match the input tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(3, 3)
+        >>> x
+        tensor([[ 0.9039,  0.6291,  1.0795],
+                [ 0.1586,  2.1939, -0.4900],
+                [-0.1909, -0.7503,  1.9355]])
+        >>> t = torch.as_strided(x, (2, 2), (1, 2))
+        >>> t
+        tensor([[0.9039, 1.0795],
+                [0.6291, 0.1586]])
+        >>> torch.as_strided(x, (2, 2), (1, 2), 1)
+        tensor([[0.6291, 0.1586],
+                [1.0795, 2.1939]])
+    """
+    ...
+def as_strided_(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: ...
+def as_strided_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.as_strided`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def as_strided_scatter(input: Tensor, src: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: 
+    r"""
+    as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor
+    
+    Embeds the values of the :attr:`src` tensor into :attr:`input` along
+    the elements corresponding to the result of calling
+    input.as_strided(size, stride, storage_offset).
+    
+    This function returns a tensor with fresh storage; it does not
+    return a view.
+    
+    Args:
+        input (Tensor): the input tensor.
+        size (tuple or ints): the shape of the output tensor
+        stride (tuple or ints): the stride of the output tensor
+        storage_offset (int, optional): the offset in the underlying storage of the output tensor
+    
+    .. note::
+    
+        :attr:`src` must be of the proper size in order to be embedded
+        into :attr:`input`. Specifically, it should have the same shape as
+        `torch.as_strided(input, size, stride, storage_offset)`
+    
+    Example::
+    
+        >>> a = torch.arange(4).reshape(2, 2) + 1
+        >>> a
+        tensor([[1, 2],
+                [3, 4]])
+        >>> b = torch.zeros(3, 3)
+        >>> b
+        tensor([[0., 0., 0.],
+                [0., 0., 0.],
+                [0., 0., 0.]])
+        >>> torch.as_strided_scatter(b, a, (2, 2), (1, 2))
+        tensor([[1., 3., 2.],
+                [4., 0., 0.],
+                [0., 0., 0.]])
+    """
+    ...
+def as_tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None) -> Tensor: 
+    r"""
+    as_tensor(data, dtype=None, device=None) -> Tensor
+    
+    Converts :attr:`data` into a tensor, sharing data and preserving autograd
+    history if possible.
+    
+    If :attr:`data` is already a tensor with the requested dtype and device
+    then :attr:`data` itself is returned, but if :attr:`data` is a
+    tensor with a different dtype or device then it's copied as if using
+    `data.to(dtype=dtype, device=device)`.
+    
+    If :attr:`data` is a NumPy array (an ndarray) with the same dtype and device then a
+    tensor is constructed using :func:`torch.from_numpy`.
+    
+    .. seealso::
+    
+        :func:`torch.tensor` never shares its data and creates a new "leaf tensor" (see :doc:`/notes/autograd`).
+    
+    
+    Args:
+        data (array_like): Initial data for the tensor. Can be a list, tuple,
+            NumPy ``ndarray``, scalar, and other types.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, infers data type from :attr:`data`.
+        device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor
+            then the device of data is used. If None and data is not a tensor then
+            the result tensor is constructed on the current device.
+    
+    
+    Example::
+    
+        >>> a = numpy.array([1, 2, 3])
+        >>> t = torch.as_tensor(a)
+        >>> t
+        tensor([ 1,  2,  3])
+        >>> t[0] = -1
+        >>> a
+        array([-1,  2,  3])
+    
+        >>> a = numpy.array([1, 2, 3])
+        >>> t = torch.as_tensor(a, device=torch.device('cuda'))
+        >>> t
+        tensor([ 1,  2,  3])
+        >>> t[0] = -1
+        >>> a
+        array([1,  2,  3])
+    """
+    ...
+def asarray(obj: Any, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, copy: Optional[_bool] = None, requires_grad: _bool = False) -> Tensor: 
+    r"""
+    asarray(obj, *, dtype=None, device=None, copy=None, requires_grad=False) -> Tensor
+    
+    Converts :attr:`obj` to a tensor.
+    
+    :attr:`obj` can be one of:
+    
+    1. a tensor
+    2. a NumPy array or a NumPy scalar
+    3. a DLPack capsule
+    4. an object that implements Python's buffer protocol
+    5. a scalar
+    6. a sequence of scalars
+    
+    When :attr:`obj` is a tensor, NumPy array, or DLPack capsule the returned tensor will,
+    by default, not require a gradient, have the same datatype as :attr:`obj`, be on the
+    same device, and share memory with it. These properties can be controlled with the
+    :attr:`dtype`, :attr:`device`, :attr:`copy`, and :attr:`requires_grad` keyword arguments.
+    If the returned tensor is of a different datatype, on a different device, or a copy is
+    requested then it will not share its memory with :attr:`obj`. If :attr:`requires_grad`
+    is ``True`` then the returned tensor will require a gradient, and if :attr:`obj` is
+    also a tensor with an autograd history then the returned tensor will have the same history.
+    
+    When :attr:`obj` is not a tensor, NumPy array, or DLPack capsule but implements Python's
+    buffer protocol then the buffer is interpreted as an array of bytes grouped according to
+    the size of the datatype passed to the :attr:`dtype` keyword argument. (If no datatype is
+    passed then the default floating point datatype is used, instead.) The returned tensor
+    will have the specified datatype (or default floating point datatype if none is specified)
+    and, by default, be on the CPU device and share memory with the buffer.
+    
+    When :attr:`obj` is a NumPy scalar, the returned tensor will be a 0-dimensional tensor on
+    the CPU that doesn't share its memory (i.e. ``copy=True``). By default, its datatype will
+    be the PyTorch datatype corresponding to the NumPy scalar's datatype.
+    
+    When :attr:`obj` is none of the above but a scalar, or a sequence of scalars then the
+    returned tensor will, by default, infer its datatype from the scalar values, be on the
+    current default device, and not share its memory.
+    
+    .. seealso::
+    
+        :func:`torch.tensor` creates a tensor that always copies the data from the input object.
+        :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays.
+        :func:`torch.frombuffer` creates a tensor that always shares memory from objects that
+        implement the buffer protocol.
+        :func:`torch.from_dlpack` creates a tensor that always shares memory from
+        DLPack capsules.
+    
+    Args:
+        obj (object): a tensor, NumPy array, DLPack Capsule, object that implements Python's
+               buffer protocol, scalar, or sequence of scalars.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the datatype of the returned tensor.
+               Default: ``None``, which causes the datatype of the returned tensor to be
+               inferred from :attr:`obj`.
+        copy (bool, optional): controls whether the returned tensor shares memory with :attr:`obj`.
+               Default: ``None``, which causes the returned tensor to share memory with :attr:`obj`
+               whenever possible. If ``True`` then the returned tensor does not share its memory.
+               If ``False`` then the returned tensor shares its memory with :attr:`obj` and an
+               error is thrown if it cannot.
+        device (:class:`torch.device`, optional): the device of the returned tensor.
+               Default: ``None``, which causes the device of :attr:`obj` to be used. Or, if
+               :attr:`obj` is a Python sequence, the current default device will be used.
+        requires_grad (bool, optional): whether the returned tensor requires grad.
+               Default: ``False``, which causes the returned tensor not to require a gradient.
+               If ``True``, then the returned tensor will require a gradient, and if :attr:`obj`
+               is also a tensor with an autograd history then the returned tensor will have
+               the same history.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, 2, 3])
+        >>> # Shares memory with tensor 'a'
+        >>> b = torch.asarray(a)
+        >>> a.data_ptr() == b.data_ptr()
+        True
+        >>> # Forces memory copy
+        >>> c = torch.asarray(a, copy=True)
+        >>> a.data_ptr() == c.data_ptr()
+        False
+    
+        >>> a = torch.tensor([1., 2., 3.], requires_grad=True)
+        >>> b = a + 2
+        >>> b
+        tensor([3., 4., 5.], grad_fn=<AddBackward0>)
+        >>> # Shares memory with tensor 'b', with no grad
+        >>> c = torch.asarray(b)
+        >>> c
+        tensor([3., 4., 5.])
+        >>> # Shares memory with tensor 'b', retaining autograd history
+        >>> d = torch.asarray(b, requires_grad=True)
+        >>> d
+        tensor([3., 4., 5.], grad_fn=<AddBackward0>)
+    
+        >>> array = numpy.array([1, 2, 3])
+        >>> # Shares memory with array 'array'
+        >>> t1 = torch.asarray(array)
+        >>> array.__array_interface__['data'][0] == t1.data_ptr()
+        True
+        >>> # Copies memory due to dtype mismatch
+        >>> t2 = torch.asarray(array, dtype=torch.float32)
+        >>> array.__array_interface__['data'][0] == t2.data_ptr()
+        False
+    
+        >>> scalar = numpy.float64(0.5)
+        >>> torch.asarray(scalar)
+        tensor(0.5000, dtype=torch.float64)
+    """
+    ...
+def asin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    asin(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the arcsine of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \sin^{-1}(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-0.5962,  1.4985, -0.4396,  1.4525])
+        >>> torch.asin(a)
+        tensor([-0.6387,     nan, -0.4552,     nan])
+    """
+    ...
+def asin_(input: Tensor) -> Tensor: ...
+def asinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    asinh(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \sinh^{-1}(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.1606, -1.4267, -1.0899, -1.0250 ])
+        >>> torch.asinh(a)
+        tensor([ 0.1599, -1.1534, -0.9435, -0.8990 ])
+    """
+    ...
+def asinh_(input: Tensor) -> Tensor: ...
+def atan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    atan(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the arctangent of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \tan^{-1}(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.2341,  0.2539, -0.6256, -0.6448])
+        >>> torch.atan(a)
+        tensor([ 0.2299,  0.2487, -0.5591, -0.5727])
+    """
+    ...
+def atan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    atan2(input, other, *, out=None) -> Tensor
+    
+    Element-wise arctangent of :math:`\text{input}_{i} / \text{other}_{i}`
+    with consideration of the quadrant. Returns a new tensor with the signed angles
+    in radians between vector :math:`(\text{other}_{i}, \text{input}_{i})`
+    and vector :math:`(1, 0)`. (Note that :math:`\text{other}_{i}`, the second
+    parameter, is the x-coordinate, while :math:`\text{input}_{i}`, the first
+    parameter, is the y-coordinate.)
+    
+    The shapes of ``input`` and ``other`` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the first input tensor
+        other (Tensor): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.9041,  0.0196, -0.3108, -2.4423])
+        >>> torch.atan2(a, torch.randn(4))
+        tensor([ 0.9833,  0.0811, -1.9743, -1.4151])
+    """
+    ...
+def atan_(input: Tensor) -> Tensor: ...
+def atanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    atanh(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`.
+    
+    Note:
+        The domain of the inverse hyperbolic tangent is `(-1, 1)` and values outside this range
+        will be mapped to ``NaN``, except for the values `1` and `-1` for which the output is
+        mapped to `+/-INF` respectively.
+    
+    .. math::
+        \text{out}_{i} = \tanh^{-1}(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4).uniform_(-1, 1)
+        >>> a
+        tensor([ -0.9385, 0.2968, -0.8591, -0.1871 ])
+        >>> torch.atanh(a)
+        tensor([ -1.7253, 0.3060, -1.2899, -0.1893 ])
+    """
+    ...
+def atanh_(input: Tensor) -> Tensor: ...
+def avg_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, ceil_mode: _bool = False, count_include_pad: _bool = True) -> Tensor: ...
+@overload
+def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: 
+    r"""
+    baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices in :attr:`batch1`
+    and :attr:`batch2`.
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same
+    number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a
+    :math:`(b \times n \times p)` tensor and :attr:`out` will be a
+    :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the
+    same as the scaling factors used in :meth:`torch.addbmm`.
+    
+    .. math::
+        \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(10, 3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.baddbmm(M, batch1, batch2).size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+@overload
+def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices in :attr:`batch1`
+    and :attr:`batch2`.
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same
+    number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a
+    :math:`(b \times n \times p)` tensor and :attr:`out` will be a
+    :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the
+    same as the scaling factors used in :meth:`torch.addbmm`.
+    
+    .. math::
+        \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(10, 3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.baddbmm(M, batch1, batch2).size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+@overload
+def baddbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices in :attr:`batch1`
+    and :attr:`batch2`.
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same
+    number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a
+    :math:`(b \times n \times p)` tensor and :attr:`out` will be a
+    :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the
+    same as the scaling factors used in :meth:`torch.addbmm`.
+    
+    .. math::
+        \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(10, 3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.baddbmm(M, batch1, batch2).size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+@overload
+def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: 
+    r"""
+    baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices in :attr:`batch1`
+    and :attr:`batch2`.
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same
+    number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a
+    :math:`(b \times n \times p)` tensor and :attr:`out` will be a
+    :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the
+    same as the scaling factors used in :meth:`torch.addbmm`.
+    
+    .. math::
+        \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(10, 3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.baddbmm(M, batch1, batch2).size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+@overload
+def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices in :attr:`batch1`
+    and :attr:`batch2`.
+    :attr:`input` is added to the final result.
+    
+    :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same
+    number of matrices.
+    
+    If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a
+    :math:`(b \times m \times p)` tensor, then :attr:`input` must be
+    :ref:`broadcastable <broadcasting-semantics>` with a
+    :math:`(b \times n \times p)` tensor and :attr:`out` will be a
+    :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the
+    same as the scaling factors used in :meth:`torch.addbmm`.
+    
+    .. math::
+        \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i)
+    
+    If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in
+    it will not be propagated.
+    
+    For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and
+    :attr:`alpha` must be real numbers, otherwise they should be integers.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): the tensor to be added
+        batch1 (Tensor): the first batch of matrices to be multiplied
+        batch2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> M = torch.randn(10, 3, 5)
+        >>> batch1 = torch.randn(10, 3, 4)
+        >>> batch2 = torch.randn(10, 4, 5)
+        >>> torch.baddbmm(M, batch1, batch2).size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+@overload
+def bartlett_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Bartlett window function.
+    
+    .. math::
+        w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases}
+            \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\
+            2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\
+        \end{cases},
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.bartlett_window(L, periodic=True)`` equal to
+    ``torch.bartlett_window(L + 1, periodic=False)[:-1])``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window
+    """
+    ...
+@overload
+def bartlett_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Bartlett window function.
+    
+    .. math::
+        w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases}
+            \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\
+            2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\
+        \end{cases},
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.bartlett_window(L, periodic=True)`` equal to
+    ``torch.bartlett_window(L + 1, periodic=False)[:-1])``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window
+    """
+    ...
+def batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ...
+def batch_norm_backward_elemt(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], sum_dy: Tensor, sum_dy_xmu: Tensor, count: Tensor) -> Tensor: ...
+def batch_norm_backward_reduce(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], input_g: _bool, weight_g: _bool, bias_g: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+def batch_norm_elemt(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, invstd: Tensor, eps: _float, *, out: Optional[Tensor] = None) -> Tensor: ...
+def batch_norm_gather_stats(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, count: _int) -> Tuple[Tensor, Tensor]: ...
+def batch_norm_gather_stats_with_counts(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, counts: Tensor) -> Tuple[Tensor, Tensor]: ...
+def batch_norm_stats(input: Tensor, eps: _float) -> Tuple[Tensor, Tensor]: ...
+def batch_norm_update_stats(input: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float) -> Tuple[Tensor, Tensor]: ...
+@overload
+def bernoulli(input: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bernoulli(input, *, generator=None, out=None) -> Tensor
+    
+    Draws binary random numbers (0 or 1) from a Bernoulli distribution.
+    
+    The :attr:`input` tensor should be a tensor containing probabilities
+    to be used for drawing the binary random number.
+    Hence, all values in :attr:`input` have to be in the range:
+    :math:`0 \leq \text{input}_i \leq 1`.
+    
+    The :math:`\text{i}^{th}` element of the output tensor will draw a
+    value :math:`1` according to the :math:`\text{i}^{th}` probability value given
+    in :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i})
+    
+    The returned :attr:`out` tensor only has values 0 or 1 and is of the same
+    shape as :attr:`input`.
+    
+    :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating
+    point ``dtype``.
+    
+    Args:
+        input (Tensor): the input tensor of probability values for the Bernoulli distribution
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.empty(3, 3).uniform_(0, 1)  # generate a uniform random matrix with range [0, 1]
+        >>> a
+        tensor([[ 0.1737,  0.0950,  0.3609],
+                [ 0.7148,  0.0289,  0.2676],
+                [ 0.9456,  0.8937,  0.7202]])
+        >>> torch.bernoulli(a)
+        tensor([[ 1.,  0.,  0.],
+                [ 0.,  0.,  0.],
+                [ 1.,  1.,  1.]])
+    
+        >>> a = torch.ones(3, 3) # probability of drawing "1" is 1
+        >>> torch.bernoulli(a)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+        >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0
+        >>> torch.bernoulli(a)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    """
+    ...
+@overload
+def bernoulli(input: Tensor, p: _float, *, generator: Optional[Generator] = None) -> Tensor: 
+    r"""
+    bernoulli(input, *, generator=None, out=None) -> Tensor
+    
+    Draws binary random numbers (0 or 1) from a Bernoulli distribution.
+    
+    The :attr:`input` tensor should be a tensor containing probabilities
+    to be used for drawing the binary random number.
+    Hence, all values in :attr:`input` have to be in the range:
+    :math:`0 \leq \text{input}_i \leq 1`.
+    
+    The :math:`\text{i}^{th}` element of the output tensor will draw a
+    value :math:`1` according to the :math:`\text{i}^{th}` probability value given
+    in :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i})
+    
+    The returned :attr:`out` tensor only has values 0 or 1 and is of the same
+    shape as :attr:`input`.
+    
+    :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating
+    point ``dtype``.
+    
+    Args:
+        input (Tensor): the input tensor of probability values for the Bernoulli distribution
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.empty(3, 3).uniform_(0, 1)  # generate a uniform random matrix with range [0, 1]
+        >>> a
+        tensor([[ 0.1737,  0.0950,  0.3609],
+                [ 0.7148,  0.0289,  0.2676],
+                [ 0.9456,  0.8937,  0.7202]])
+        >>> torch.bernoulli(a)
+        tensor([[ 1.,  0.,  0.],
+                [ 0.,  0.,  0.],
+                [ 1.,  1.,  1.]])
+    
+        >>> a = torch.ones(3, 3) # probability of drawing "1" is 1
+        >>> torch.bernoulli(a)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+        >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0
+        >>> torch.bernoulli(a)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    """
+    ...
+def bilinear(input1: Tensor, input2: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ...
+def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, weight: Optional[Tensor] = None, pos_weight: Optional[Tensor] = None, reduction: _int = 1) -> Tensor: ...
+def bincount(input: Tensor, weights: Optional[Tensor] = None, minlength: _int = 0) -> Tensor: 
+    r"""
+    bincount(input, weights=None, minlength=0) -> Tensor
+    
+    Count the frequency of each value in an array of non-negative ints.
+    
+    The number of bins (size 1) is one larger than the largest value in
+    :attr:`input` unless :attr:`input` is empty, in which case the result is a
+    tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least
+    :attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size
+    :attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``,
+    ``out[n] += weights[i]`` if :attr:`weights` is specified else
+    ``out[n] += 1``.
+    
+    Note:
+        This operation may produce nondeterministic gradients when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information.
+    
+    Arguments:
+        input (Tensor): 1-d int tensor
+        weights (Tensor): optional, weight for each value in the input tensor.
+            Should be of same size as input tensor.
+        minlength (int): optional, minimum number of bins. Should be non-negative.
+    
+    Returns:
+        output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if
+        :attr:`input` is non-empty, else ``Size(0)``
+    
+    Example::
+    
+        >>> input = torch.randint(0, 8, (5,), dtype=torch.int64)
+        >>> weights = torch.linspace(0, 1, steps=5)
+        >>> input, weights
+        (tensor([4, 3, 6, 3, 4]),
+         tensor([ 0.0000,  0.2500,  0.5000,  0.7500,  1.0000])
+    
+        >>> torch.bincount(input)
+        tensor([0, 0, 0, 2, 2, 0, 1])
+    
+        >>> input.bincount(weights)
+        tensor([0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.5000])
+    """
+    ...
+def binomial(count: Tensor, prob: Tensor, generator: Optional[Generator] = None) -> Tensor: ...
+@overload
+def bitwise_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_and(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical AND.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([1, 0,  3], dtype=torch.int8)
+        >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ False, True, False])
+    """
+    ...
+@overload
+def bitwise_and(self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    bitwise_and(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical AND.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([1, 0,  3], dtype=torch.int8)
+        >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ False, True, False])
+    """
+    ...
+@overload
+def bitwise_and(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_and(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical AND.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([1, 0,  3], dtype=torch.int8)
+        >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ False, True, False])
+    """
+    ...
+@overload
+def bitwise_left_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_left_shift(input, other, *, out=None) -> Tensor
+    
+    Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits.
+    The input tensor must be of integral type. This operator supports
+    :ref:`broadcasting to a common shape <broadcasting-semantics>` and
+    :ref:`type promotion <type-promotion-doc>`.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{input}_i << \text{other}_i
+    
+    Args:
+        input (Tensor or Scalar): the first input tensor
+        other (Tensor or Scalar): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-2, -2, 24], dtype=torch.int8)
+    """
+    ...
+@overload
+def bitwise_left_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    bitwise_left_shift(input, other, *, out=None) -> Tensor
+    
+    Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits.
+    The input tensor must be of integral type. This operator supports
+    :ref:`broadcasting to a common shape <broadcasting-semantics>` and
+    :ref:`type promotion <type-promotion-doc>`.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{input}_i << \text{other}_i
+    
+    Args:
+        input (Tensor or Scalar): the first input tensor
+        other (Tensor or Scalar): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-2, -2, 24], dtype=torch.int8)
+    """
+    ...
+@overload
+def bitwise_left_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_left_shift(input, other, *, out=None) -> Tensor
+    
+    Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits.
+    The input tensor must be of integral type. This operator supports
+    :ref:`broadcasting to a common shape <broadcasting-semantics>` and
+    :ref:`type promotion <type-promotion-doc>`.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{input}_i << \text{other}_i
+    
+    Args:
+        input (Tensor or Scalar): the first input tensor
+        other (Tensor or Scalar): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-2, -2, 24], dtype=torch.int8)
+    """
+    ...
+def bitwise_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_not(input, *, out=None) -> Tensor
+    
+    Computes the bitwise NOT of the given input tensor. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical NOT.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_not(torch.tensor([-1, -2, 3], dtype=torch.int8))
+        tensor([ 0,  1, -4], dtype=torch.int8)
+    """
+    ...
+@overload
+def bitwise_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_or(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical OR.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-1, -2,  3], dtype=torch.int8)
+        >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ True, True, False])
+    """
+    ...
+@overload
+def bitwise_or(self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    bitwise_or(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical OR.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-1, -2,  3], dtype=torch.int8)
+        >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ True, True, False])
+    """
+    ...
+@overload
+def bitwise_or(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_or(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical OR.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-1, -2,  3], dtype=torch.int8)
+        >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ True, True, False])
+    """
+    ...
+@overload
+def bitwise_right_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_right_shift(input, other, *, out=None) -> Tensor
+    
+    Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits.
+    The input tensor must be of integral type. This operator supports
+    :ref:`broadcasting to a common shape <broadcasting-semantics>` and
+    :ref:`type promotion <type-promotion-doc>`.
+    In any case, if the value of the right operand is negative or is greater than
+    or equal to the number of bits in the promoted left operand, the behavior is undefined.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{input}_i >> \text{other}_i
+    
+    Args:
+        input (Tensor or Scalar): the first input tensor
+        other (Tensor or Scalar): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-1, -7,  3], dtype=torch.int8)
+    """
+    ...
+@overload
+def bitwise_right_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    bitwise_right_shift(input, other, *, out=None) -> Tensor
+    
+    Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits.
+    The input tensor must be of integral type. This operator supports
+    :ref:`broadcasting to a common shape <broadcasting-semantics>` and
+    :ref:`type promotion <type-promotion-doc>`.
+    In any case, if the value of the right operand is negative or is greater than
+    or equal to the number of bits in the promoted left operand, the behavior is undefined.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{input}_i >> \text{other}_i
+    
+    Args:
+        input (Tensor or Scalar): the first input tensor
+        other (Tensor or Scalar): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-1, -7,  3], dtype=torch.int8)
+    """
+    ...
+@overload
+def bitwise_right_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_right_shift(input, other, *, out=None) -> Tensor
+    
+    Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits.
+    The input tensor must be of integral type. This operator supports
+    :ref:`broadcasting to a common shape <broadcasting-semantics>` and
+    :ref:`type promotion <type-promotion-doc>`.
+    In any case, if the value of the right operand is negative or is greater than
+    or equal to the number of bits in the promoted left operand, the behavior is undefined.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{input}_i >> \text{other}_i
+    
+    Args:
+        input (Tensor or Scalar): the first input tensor
+        other (Tensor or Scalar): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-1, -7,  3], dtype=torch.int8)
+    """
+    ...
+@overload
+def bitwise_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_xor(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical XOR.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-2, -2,  0], dtype=torch.int8)
+        >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ True, False, False])
+    """
+    ...
+@overload
+def bitwise_xor(self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    bitwise_xor(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical XOR.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-2, -2,  0], dtype=torch.int8)
+        >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ True, False, False])
+    """
+    ...
+@overload
+def bitwise_xor(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bitwise_xor(input, other, *, out=None) -> Tensor
+    
+    Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of
+    integral or Boolean types. For bool tensors, it computes the logical XOR.
+    
+    Args:
+        input: the first input tensor
+        other: the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8))
+        tensor([-2, -2,  0], dtype=torch.int8)
+        >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False]))
+        tensor([ True, False, False])
+    """
+    ...
+@overload
+def blackman_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Blackman window function.
+    
+    .. math::
+        w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right)
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.blackman_window(L, periodic=True)`` equal to
+    ``torch.blackman_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window
+    """
+    ...
+@overload
+def blackman_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Blackman window function.
+    
+    .. math::
+        w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right)
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.blackman_window(L, periodic=True)`` equal to
+    ``torch.blackman_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window
+    """
+    ...
+def bmm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bmm(input, mat2, *, out=None) -> Tensor
+    
+    Performs a batch matrix-matrix product of matrices stored in :attr:`input`
+    and :attr:`mat2`.
+    
+    :attr:`input` and :attr:`mat2` must be 3-D tensors each containing
+    the same number of matrices.
+    
+    If :attr:`input` is a :math:`(b \times n \times m)` tensor, :attr:`mat2` is a
+    :math:`(b \times m \times p)` tensor, :attr:`out` will be a
+    :math:`(b \times n \times p)` tensor.
+    
+    .. math::
+        \text{out}_i = \text{input}_i \mathbin{@} \text{mat2}_i
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this function will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    .. note:: This function does not :ref:`broadcast <broadcasting-semantics>`.
+              For broadcasting matrix products, see :func:`torch.matmul`.
+    
+    Args:
+        input (Tensor): the first batch of matrices to be multiplied
+        mat2 (Tensor): the second batch of matrices to be multiplied
+    
+    Keyword Args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> input = torch.randn(10, 3, 4)
+        >>> mat2 = torch.randn(10, 4, 5)
+        >>> res = torch.bmm(input, mat2)
+        >>> res.size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+def broadcast_to(input: Tensor, size: Sequence[Union[_int, SymInt]]) -> Tensor: 
+    r"""
+    broadcast_to(input, shape) -> Tensor
+    
+    Broadcasts :attr:`input` to the shape :attr:`shape`.
+    Equivalent to calling ``input.expand(shape)``. See :meth:`~Tensor.expand` for details.
+    
+    Args:
+        input (Tensor): the input tensor.
+        shape (list, tuple, or :class:`torch.Size`): the new shape.
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3])
+        >>> torch.broadcast_to(x, (3, 3))
+        tensor([[1, 2, 3],
+                [1, 2, 3],
+                [1, 2, 3]])
+    """
+    ...
+@overload
+def bucketize(input: Tensor, boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor
+    
+    Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the
+    boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size
+    as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that
+    this behavior is opposite the behavior of
+    `numpy.digitize <https://numpy.org/doc/stable/reference/generated/numpy.digitize.html>`_.
+    More formally, the returned index satisfies the following rules:
+    
+    .. list-table::
+       :widths: 15 85
+       :header-rows: 1
+    
+       * - :attr:`right`
+         - *returned index satisfies*
+       * - False
+         - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]``
+       * - True
+         - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]``
+    
+    Args:
+        input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s).
+        boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined.
+    
+    Keyword args:
+        out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise.
+                                    Default value is False, i.e. default output data type is torch.int64.
+        right (bool, optional): if False, return the first suitable location that is found. If True, return the
+                                last such index. If no suitable index found, return 0 for non-numerical value
+                                (e.g. nan, inf) or the size of :attr:`boundaries` (one past the last index).
+                                In other words, if False, gets the lower bound index for each value in :attr:`input`
+                                from :attr:`boundaries`. If True, gets the upper bound index instead.
+                                Default value is False.
+        out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided.
+    
+    
+    Example::
+    
+        >>> boundaries = torch.tensor([1, 3, 5, 7, 9])
+        >>> boundaries
+        tensor([1, 3, 5, 7, 9])
+        >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]])
+        >>> v
+        tensor([[3, 6, 9],
+                [3, 6, 9]])
+        >>> torch.bucketize(v, boundaries)
+        tensor([[1, 3, 4],
+                [1, 3, 4]])
+        >>> torch.bucketize(v, boundaries, right=True)
+        tensor([[2, 3, 5],
+                [2, 3, 5]])
+    """
+    ...
+@overload
+def bucketize(self: Union[Number, _complex], boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False) -> Tensor: 
+    r"""
+    bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor
+    
+    Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the
+    boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size
+    as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that
+    this behavior is opposite the behavior of
+    `numpy.digitize <https://numpy.org/doc/stable/reference/generated/numpy.digitize.html>`_.
+    More formally, the returned index satisfies the following rules:
+    
+    .. list-table::
+       :widths: 15 85
+       :header-rows: 1
+    
+       * - :attr:`right`
+         - *returned index satisfies*
+       * - False
+         - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]``
+       * - True
+         - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]``
+    
+    Args:
+        input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s).
+        boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined.
+    
+    Keyword args:
+        out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise.
+                                    Default value is False, i.e. default output data type is torch.int64.
+        right (bool, optional): if False, return the first suitable location that is found. If True, return the
+                                last such index. If no suitable index found, return 0 for non-numerical value
+                                (e.g. nan, inf) or the size of :attr:`boundaries` (one past the last index).
+                                In other words, if False, gets the lower bound index for each value in :attr:`input`
+                                from :attr:`boundaries`. If True, gets the upper bound index instead.
+                                Default value is False.
+        out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided.
+    
+    
+    Example::
+    
+        >>> boundaries = torch.tensor([1, 3, 5, 7, 9])
+        >>> boundaries
+        tensor([1, 3, 5, 7, 9])
+        >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]])
+        >>> v
+        tensor([[3, 6, 9],
+                [3, 6, 9]])
+        >>> torch.bucketize(v, boundaries)
+        tensor([[1, 3, 4],
+                [1, 3, 4]])
+        >>> torch.bucketize(v, boundaries, right=True)
+        tensor([[2, 3, 5],
+                [2, 3, 5]])
+    """
+    ...
+def can_cast(from_: _dtype, to: _dtype) -> _bool: 
+    r"""
+    can_cast(from_, to) -> bool
+    
+    Determines if a type conversion is allowed under PyTorch casting rules
+    described in the type promotion :ref:`documentation <type-promotion-doc>`.
+    
+    Args:
+        from_ (dtype): The original :class:`torch.dtype`.
+        to (dtype): The target :class:`torch.dtype`.
+    
+    Example::
+    
+        >>> torch.can_cast(torch.double, torch.float)
+        True
+        >>> torch.can_cast(torch.float, torch.int)
+        False
+    """
+    ...
+@overload
+def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cat(tensors, dim=0, *, out=None) -> Tensor
+    
+    Concatenates the given sequence of tensors in :attr:`tensors` in the given dimension.
+    All tensors must either have the same shape (except in the concatenating
+    dimension) or be a 1-D empty tensor with size ``(0,)``.
+    
+    :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split`
+    and :func:`torch.chunk`.
+    
+    :func:`torch.cat` can be best understood via examples.
+    
+    .. seealso::
+    
+        :func:`torch.stack` concatenates the given sequence along a new dimension.
+    
+    Args:
+        tensors (sequence of Tensors): any python sequence of tensors of the same type.
+            Non-empty tensors provided must have the same shape, except in the
+            cat dimension.
+        dim (int, optional): the dimension over which the tensors are concatenated
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(2, 3)
+        >>> x
+        tensor([[ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497]])
+        >>> torch.cat((x, x, x), 0)
+        tensor([[ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497],
+                [ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497],
+                [ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497]])
+        >>> torch.cat((x, x, x), 1)
+        tensor([[ 0.6580, -1.0969, -0.4614,  0.6580, -1.0969, -0.4614,  0.6580,
+                 -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497, -0.1034, -0.5790,  0.1497, -0.1034,
+                 -0.5790,  0.1497]])
+    """
+    ...
+@overload
+def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cat(tensors, dim=0, *, out=None) -> Tensor
+    
+    Concatenates the given sequence of tensors in :attr:`tensors` in the given dimension.
+    All tensors must either have the same shape (except in the concatenating
+    dimension) or be a 1-D empty tensor with size ``(0,)``.
+    
+    :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split`
+    and :func:`torch.chunk`.
+    
+    :func:`torch.cat` can be best understood via examples.
+    
+    .. seealso::
+    
+        :func:`torch.stack` concatenates the given sequence along a new dimension.
+    
+    Args:
+        tensors (sequence of Tensors): any python sequence of tensors of the same type.
+            Non-empty tensors provided must have the same shape, except in the
+            cat dimension.
+        dim (int, optional): the dimension over which the tensors are concatenated
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(2, 3)
+        >>> x
+        tensor([[ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497]])
+        >>> torch.cat((x, x, x), 0)
+        tensor([[ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497],
+                [ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497],
+                [ 0.6580, -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497]])
+        >>> torch.cat((x, x, x), 1)
+        tensor([[ 0.6580, -1.0969, -0.4614,  0.6580, -1.0969, -0.4614,  0.6580,
+                 -1.0969, -0.4614],
+                [-0.1034, -0.5790,  0.1497, -0.1034, -0.5790,  0.1497, -0.1034,
+                 -0.5790,  0.1497]])
+    """
+    ...
+def ccol_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def ceil(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ceil(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the ceil of the elements of :attr:`input`,
+    the smallest integer greater than or equal to each element.
+    
+    For integer inputs, follows the array-api convention of returning a
+    copy of the input tensor.
+    
+    .. math::
+        \text{out}_{i} = \left\lceil \text{input}_{i} \right\rceil
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-0.6341, -1.4208, -1.0900,  0.5826])
+        >>> torch.ceil(a)
+        tensor([-0., -1., -1.,  1.])
+    """
+    ...
+def ceil_(input: Tensor) -> Tensor: ...
+def celu(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ...
+def celu_(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ...
+def channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ...
+def cholesky(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cholesky(input, upper=False, *, out=None) -> Tensor
+    
+    Computes the Cholesky decomposition of a symmetric positive-definite
+    matrix :math:`A` or for batches of symmetric positive-definite matrices.
+    
+    If :attr:`upper` is ``True``, the returned matrix ``U`` is upper-triangular, and
+    the decomposition has the form:
+    
+    .. math::
+    
+      A = U^TU
+    
+    If :attr:`upper` is ``False``, the returned matrix ``L`` is lower-triangular, and
+    the decomposition has the form:
+    
+    .. math::
+    
+        A = LL^T
+    
+    If :attr:`upper` is ``True``, and :math:`A` is a batch of symmetric positive-definite
+    matrices, then the returned tensor will be composed of upper-triangular Cholesky factors
+    of each of the individual matrices. Similarly, when :attr:`upper` is ``False``, the returned
+    tensor will be composed of lower-triangular Cholesky factors of each of the individual
+    matrices.
+    
+    .. warning::
+    
+        :func:`torch.cholesky` is deprecated in favor of :func:`torch.linalg.cholesky`
+        and will be removed in a future PyTorch release.
+    
+        ``L = torch.cholesky(A)`` should be replaced with
+    
+        .. code:: python
+    
+            L = torch.linalg.cholesky(A)
+    
+        ``U = torch.cholesky(A, upper=True)`` should be replaced with
+    
+        .. code:: python
+    
+            U = torch.linalg.cholesky(A).mH
+    
+        This transform will produce equivalent results for all valid (symmetric positive definite) inputs.
+    
+    Args:
+        input (Tensor): the input tensor :math:`A` of size :math:`(*, n, n)` where `*` is zero or more
+                    batch dimensions consisting of symmetric positive-definite matrices.
+        upper (bool, optional): flag that indicates whether to return an
+                                upper or lower triangular matrix. Default: ``False``
+    
+    Keyword args:
+        out (Tensor, optional): the output matrix
+    
+    Example::
+    
+        >>> a = torch.randn(3, 3)
+        >>> a = a @ a.mT + 1e-3 # make symmetric positive-definite
+        >>> l = torch.cholesky(a)
+        >>> a
+        tensor([[ 2.4112, -0.7486,  1.4551],
+                [-0.7486,  1.3544,  0.1294],
+                [ 1.4551,  0.1294,  1.6724]])
+        >>> l
+        tensor([[ 1.5528,  0.0000,  0.0000],
+                [-0.4821,  1.0592,  0.0000],
+                [ 0.9371,  0.5487,  0.7023]])
+        >>> l @ l.mT
+        tensor([[ 2.4112, -0.7486,  1.4551],
+                [-0.7486,  1.3544,  0.1294],
+                [ 1.4551,  0.1294,  1.6724]])
+        >>> a = torch.randn(3, 2, 2) # Example for batched input
+        >>> a = a @ a.mT + 1e-03 # make symmetric positive-definite
+        >>> l = torch.cholesky(a)
+        >>> z = l @ l.mT
+        >>> torch.dist(z, a)
+        tensor(2.3842e-07)
+    """
+    ...
+def cholesky_inverse(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cholesky_inverse(L, upper=False, *, out=None) -> Tensor
+    
+    Computes the inverse of a complex Hermitian or real symmetric
+    positive-definite matrix given its Cholesky decomposition.
+    
+    Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix,
+    and :math:`L` its Cholesky decomposition such that:
+    
+    .. math::
+    
+        A = LL^{\text{H}}
+    
+    where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex,
+    and the transpose when :math:`L` is real-valued.
+    
+    Computes the inverse matrix :math:`A^{-1}`.
+    
+    Supports input of float, double, cfloat and cdouble dtypes.
+    Also supports batches of matrices, and if :math:`A` is a batch of matrices
+    then the output has the same batch dimensions.
+    
+    Args:
+        L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions
+            consisting of lower or upper triangular Cholesky decompositions of
+            symmetric or Hermitian positive-definite matrices.
+        upper (bool, optional): flag that indicates whether :math:`L` is lower triangular
+            or upper triangular. Default: ``False``
+    
+    Keyword args:
+        out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`.
+    
+    Example::
+    
+        >>> A = torch.randn(3, 3)
+        >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix
+        >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition
+        >>> torch.cholesky_inverse(L)
+        tensor([[ 1.9314,  1.2251, -0.0889],
+                [ 1.2251,  2.4439,  0.2122],
+                [-0.0889,  0.2122,  0.1412]])
+        >>> A.inverse()
+        tensor([[ 1.9314,  1.2251, -0.0889],
+                [ 1.2251,  2.4439,  0.2122],
+                [-0.0889,  0.2122,  0.1412]])
+    
+        >>> A = torch.randn(3, 2, 2, dtype=torch.complex64)
+        >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices
+        >>> L = torch.linalg.cholesky(A)
+        >>> torch.dist(torch.inverse(A), torch.cholesky_inverse(L))
+        tensor(5.6358e-7)
+    """
+    ...
+def cholesky_solve(input: Tensor, input2: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cholesky_solve(B, L, upper=False, *, out=None) -> Tensor
+    
+    Computes the solution of a system of linear equations with complex Hermitian
+    or real symmetric positive-definite lhs given its Cholesky decomposition.
+    
+    Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix,
+    and :math:`L` its Cholesky decomposition such that:
+    
+    .. math::
+    
+        A = LL^{\text{H}}
+    
+    where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex,
+    and the transpose when :math:`L` is real-valued.
+    
+    Returns the solution :math:`X` of the following linear system:
+    
+    .. math::
+    
+        AX = B
+    
+    Supports inputs of float, double, cfloat and cdouble dtypes.
+    Also supports batches of matrices, and if :math:`A` or :math:`B` is a batch of matrices
+    then the output has the same batch dimensions.
+    
+    Args:
+        B (Tensor): right-hand side tensor of shape `(*, n, k)`
+            where :math:`*` is zero or more batch dimensions
+        L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions
+            consisting of lower or upper triangular Cholesky decompositions of
+            symmetric or Hermitian positive-definite matrices.
+        upper (bool, optional): flag that indicates whether :math:`L` is lower triangular
+            or upper triangular. Default: ``False``.
+    
+    Keyword args:
+        out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`.
+    
+    Example::
+    
+        >>> A = torch.randn(3, 3)
+        >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix
+        >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition
+        >>> B = torch.randn(3, 2)
+        >>> torch.cholesky_solve(B, L)
+        tensor([[ -8.1625,  19.6097],
+                [ -5.8398,  14.2387],
+                [ -4.3771,  10.4173]])
+        >>> A.inverse() @  B
+        tensor([[ -8.1626,  19.6097],
+                [ -5.8398,  14.2387],
+                [ -4.3771,  10.4173]])
+    
+        >>> A = torch.randn(3, 2, 2, dtype=torch.complex64)
+        >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices
+        >>> L = torch.linalg.cholesky(A)
+        >>> B = torch.randn(2, 1, dtype=torch.complex64)
+        >>> X = torch.cholesky_solve(B, L)
+        >>> torch.dist(X, A.inverse() @ B)
+        tensor(1.6881e-5)
+    """
+    ...
+def choose_qparams_optimized(input: Tensor, numel: _int, n_bins: _int, ratio: _float, bit_width: _int) -> Tuple[Tensor, Tensor]: ...
+def chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    chunk(input, chunks, dim=0) -> Tuple of Tensors
+    
+    Attempts to split a tensor into the specified number of chunks. Each chunk is a view of
+    the input tensor.
+    
+    
+    .. note::
+    
+        This function may return fewer than the specified number of chunks!
+    
+    .. seealso::
+    
+        :func:`torch.tensor_split` a function that always returns exactly the specified number of chunks
+    
+    If the tensor size along the given dimension :attr:`dim` is divisible by :attr:`chunks`,
+    all returned chunks will be the same size.
+    If the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`chunks`,
+    all returned chunks will be the same size, except the last one.
+    If such division is not possible, this function may return fewer
+    than the specified number of chunks.
+    
+    Arguments:
+        input (Tensor): the tensor to split
+        chunks (int): number of chunks to return
+        dim (int): dimension along which to split the tensor
+    
+    Example:
+        >>> torch.arange(11).chunk(6)
+        (tensor([0, 1]),
+         tensor([2, 3]),
+         tensor([4, 5]),
+         tensor([6, 7]),
+         tensor([8, 9]),
+         tensor([10]))
+        >>> torch.arange(12).chunk(6)
+        (tensor([0, 1]),
+         tensor([2, 3]),
+         tensor([4, 5]),
+         tensor([6, 7]),
+         tensor([8, 9]),
+         tensor([10, 11]))
+        >>> torch.arange(13).chunk(6)
+        (tensor([0, 1, 2]),
+         tensor([3, 4, 5]),
+         tensor([6, 7, 8]),
+         tensor([ 9, 10, 11]),
+         tensor([12]))
+    """
+    ...
+@overload
+def clamp(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    clamp(input, min=None, max=None, *, out=None) -> Tensor
+    
+    Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`.
+    Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns:
+    
+    .. math::
+        y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i)
+    
+    If :attr:`min` is ``None``, there is no lower bound.
+    Or, if :attr:`max` is ``None`` there is no upper bound.
+    
+    
+    .. note::
+        If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) <torch.clamp>`
+        sets all elements in :attr:`input` to the value of :attr:`max`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        min (Number or Tensor, optional): lower-bound of the range to be clamped to
+        max (Number or Tensor, optional): upper-bound of the range to be clamped to
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-1.7120,  0.1734, -0.0478, -0.0922])
+        >>> torch.clamp(a, min=-0.5, max=0.5)
+        tensor([-0.5000,  0.1734, -0.0478, -0.0922])
+    
+        >>> min = torch.linspace(-1, 1, steps=4)
+        >>> torch.clamp(a, min=min)
+        tensor([-1.0000,  0.1734,  0.3333,  1.0000])
+    """
+    ...
+@overload
+def clamp(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    clamp(input, min=None, max=None, *, out=None) -> Tensor
+    
+    Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`.
+    Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns:
+    
+    .. math::
+        y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i)
+    
+    If :attr:`min` is ``None``, there is no lower bound.
+    Or, if :attr:`max` is ``None`` there is no upper bound.
+    
+    
+    .. note::
+        If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) <torch.clamp>`
+        sets all elements in :attr:`input` to the value of :attr:`max`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        min (Number or Tensor, optional): lower-bound of the range to be clamped to
+        max (Number or Tensor, optional): upper-bound of the range to be clamped to
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-1.7120,  0.1734, -0.0478, -0.0922])
+        >>> torch.clamp(a, min=-0.5, max=0.5)
+        tensor([-0.5000,  0.1734, -0.0478, -0.0922])
+    
+        >>> min = torch.linspace(-1, 1, steps=4)
+        >>> torch.clamp(a, min=min)
+        tensor([-1.0000,  0.1734,  0.3333,  1.0000])
+    """
+    ...
+@overload
+def clamp_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def clamp_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ...
+@overload
+def clamp_max(input: Tensor, max: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def clamp_max(input: Tensor, max: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def clamp_max_(input: Tensor, max: Tensor) -> Tensor: ...
+@overload
+def clamp_max_(input: Tensor, max: Union[Number, _complex]) -> Tensor: ...
+@overload
+def clamp_min(input: Tensor, min: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def clamp_min(input: Tensor, min: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def clamp_min_(input: Tensor, min: Tensor) -> Tensor: ...
+@overload
+def clamp_min_(input: Tensor, min: Union[Number, _complex]) -> Tensor: ...
+@overload
+def clip(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    clip(input, min=None, max=None, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.clamp`.
+    """
+    ...
+@overload
+def clip(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    clip(input, min=None, max=None, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.clamp`.
+    """
+    ...
+@overload
+def clip_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def clip_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ...
+def clone(input: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: 
+    r"""
+    clone(input, *, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a copy of :attr:`input`.
+    
+    .. note::
+    
+        This function is differentiable, so gradients will flow back from the
+        result of this operation to :attr:`input`. To create a tensor without an
+        autograd relationship to :attr:`input` see :meth:`~Tensor.detach`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned tensor. Default: ``torch.preserve_format``.
+    """
+    ...
+def col_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.col_indices`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def column_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    column_stack(tensors, *, out=None) -> Tensor
+    
+    Creates a new tensor by horizontally stacking the tensors in :attr:`tensors`.
+    
+    Equivalent to ``torch.hstack(tensors)``, except each zero or one dimensional tensor ``t``
+    in :attr:`tensors` is first reshaped into a ``(t.numel(), 1)`` column before being stacked horizontally.
+    
+    Args:
+        tensors (sequence of Tensors): sequence of tensors to concatenate
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5, 6])
+        >>> torch.column_stack((a, b))
+        tensor([[1, 4],
+                [2, 5],
+                [3, 6]])
+        >>> a = torch.arange(5)
+        >>> b = torch.arange(10).reshape(5, 2)
+        >>> torch.column_stack((a, b, b))
+        tensor([[0, 0, 1, 0, 1],
+                [1, 2, 3, 2, 3],
+                [2, 4, 5, 4, 5],
+                [3, 6, 7, 6, 7],
+                [4, 8, 9, 8, 9]])
+    """
+    ...
+def combinations(input: Tensor, r: _int = 2, with_replacement: _bool = False) -> Tensor: 
+    r"""
+    combinations(input, r=2, with_replacement=False) -> Tensor
+    
+    Compute combinations of length :math:`r` of the given tensor. The behavior is similar to
+    python's `itertools.combinations` when `with_replacement` is set to `False`, and
+    `itertools.combinations_with_replacement` when `with_replacement` is set to `True`.
+    
+    Arguments:
+        input (Tensor): 1D vector.
+        r (int, optional): number of elements to combine
+        with_replacement (bool, optional): whether to allow duplication in combination
+    
+    Returns:
+        Tensor: A tensor equivalent to converting the input tensor into a list, applying
+        `itertools.combinations` or `itertools.combinations_with_replacement` to that
+        list, and finally converting the resulting list into a tensor.
+    
+    Example::
+    
+        >>> a = [1, 2, 3]
+        >>> list(itertools.combinations(a, r=2))
+        [(1, 2), (1, 3), (2, 3)]
+        >>> list(itertools.combinations(a, r=3))
+        [(1, 2, 3)]
+        >>> list(itertools.combinations_with_replacement(a, r=2))
+        [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
+        >>> tensor_a = torch.tensor(a)
+        >>> torch.combinations(tensor_a)
+        tensor([[1, 2],
+                [1, 3],
+                [2, 3]])
+        >>> torch.combinations(tensor_a, r=3)
+        tensor([[1, 2, 3]])
+        >>> torch.combinations(tensor_a, with_replacement=True)
+        tensor([[1, 1],
+                [1, 2],
+                [1, 3],
+                [2, 2],
+                [2, 3],
+                [3, 3]])
+    """
+    ...
+def complex(real: Tensor, imag: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    complex(real, imag, *, out=None) -> Tensor
+    
+    Constructs a complex tensor with its real part equal to :attr:`real` and its
+    imaginary part equal to :attr:`imag`.
+    
+    Args:
+        real (Tensor): The real part of the complex tensor. Must be half, float or double.
+        imag (Tensor): The imaginary part of the complex tensor. Must be same dtype
+            as :attr:`real`.
+    
+    Keyword args:
+        out (Tensor): If the inputs are ``torch.float32``, must be
+            ``torch.complex64``. If the inputs are ``torch.float64``, must be
+            ``torch.complex128``.
+    
+    Example::
+    
+        >>> real = torch.tensor([1, 2], dtype=torch.float32)
+        >>> imag = torch.tensor([3, 4], dtype=torch.float32)
+        >>> z = torch.complex(real, imag)
+        >>> z
+        tensor([(1.+3.j), (2.+4.j)])
+        >>> z.dtype
+        torch.complex64
+    """
+    ...
+@overload
+def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    concat(tensors, dim=0, *, out=None) -> Tensor
+    
+    Alias of :func:`torch.cat`.
+    """
+    ...
+@overload
+def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    concat(tensors, dim=0, *, out=None) -> Tensor
+    
+    Alias of :func:`torch.cat`.
+    """
+    ...
+@overload
+def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    concatenate(tensors, axis=0, out=None) -> Tensor
+    
+    Alias of :func:`torch.cat`.
+    """
+    ...
+@overload
+def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    concatenate(tensors, axis=0, out=None) -> Tensor
+    
+    Alias of :func:`torch.cat`.
+    """
+    ...
+def conj(input: Tensor) -> Tensor: 
+    r"""
+    conj(input) -> Tensor
+    
+    Returns a view of :attr:`input` with a flipped conjugate bit. If :attr:`input` has a non-complex dtype,
+    this function just returns :attr:`input`.
+    
+    .. note::
+        :func:`torch.conj` performs a lazy conjugation, but the actual conjugated tensor can be materialized
+        at any time using :func:`torch.resolve_conj`.
+    
+    .. warning:: In the future, :func:`torch.conj` may return a non-writeable view for an :attr:`input` of
+                 non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj`
+                 when :attr:`input` is of non-complex dtype to be compatible with this change.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])
+        >>> x.is_conj()
+        False
+        >>> y = torch.conj(x)
+        >>> y.is_conj()
+        True
+    """
+    ...
+def conj_physical(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    conj_physical(input, *, out=None) -> Tensor
+    
+    Computes the element-wise conjugate of the given :attr:`input` tensor.
+    If :attr:`input` has a non-complex dtype, this function just returns :attr:`input`.
+    
+    .. note::
+       This performs the conjugate operation regardless of whether the conjugate bit is set or not.
+    
+    .. warning:: In the future, :func:`torch.conj_physical` may return a non-writeable view for an :attr:`input` of
+                 non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical`
+                 when :attr:`input` is of non-complex dtype to be compatible with this change.
+    
+    .. math::
+        \text{out}_{i} = conj(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.conj_physical(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))
+        tensor([-1 - 1j, -2 - 2j, 3 + 3j])
+    """
+    ...
+def conj_physical_(input: Tensor) -> Tensor: ...
+def constant_pad_nd(input: Tensor, pad: Sequence[Union[_int, SymInt]], value: Union[Number, _complex] = 0) -> Tensor: ...
+@overload
+def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ...
+@overload
+def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ...
+@overload
+def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ...
+@overload
+def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ...
+@overload
+def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ...
+@overload
+def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ...
+def conv_tbc(input: Tensor, weight: Tensor, bias: Tensor, pad: _int = 0) -> Tensor: ...
+def conv_transpose1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ...
+def conv_transpose2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ...
+def conv_transpose3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ...
+def convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+@overload
+def copysign(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    copysign(input, other, *, out=None) -> Tensor
+    
+    Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise.
+    
+    .. math::
+        \text{out}_{i} = \begin{cases}
+            -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\
+             |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\
+        \end{cases}
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    and integer and float inputs.
+    
+    Args:
+        input (Tensor): magnitudes.
+        other (Tensor or Number): contains value(s) whose signbit(s) are
+            applied to the magnitudes in :attr:`input`.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(5)
+        >>> a
+        tensor([-1.2557, -0.0026, -0.5387,  0.4740, -0.9244])
+        >>> torch.copysign(a, 1)
+        tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244])
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.7079,  0.2778, -1.0249,  0.5719],
+                [-0.0059, -0.2600, -0.4475, -1.3948],
+                [ 0.3667, -0.9567, -2.5757, -0.1751],
+                [ 0.2046, -0.0742,  0.2998, -0.1054]])
+        >>> b = torch.randn(4)
+        tensor([ 0.2373,  0.3120,  0.3190, -1.1128])
+        >>> torch.copysign(a, b)
+        tensor([[ 0.7079,  0.2778,  1.0249, -0.5719],
+                [ 0.0059,  0.2600,  0.4475, -1.3948],
+                [ 0.3667,  0.9567,  2.5757, -0.1751],
+                [ 0.2046,  0.0742,  0.2998, -0.1054]])
+        >>> a = torch.tensor([1.])
+        >>> b = torch.tensor([-0.])
+        >>> torch.copysign(a, b)
+        tensor([-1.])
+    
+    .. note::
+        copysign handles signed zeros. If the other argument has a negative zero (-0),
+        the corresponding output value will be negative.
+    """
+    ...
+@overload
+def copysign(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    copysign(input, other, *, out=None) -> Tensor
+    
+    Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise.
+    
+    .. math::
+        \text{out}_{i} = \begin{cases}
+            -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\
+             |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\
+        \end{cases}
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    and integer and float inputs.
+    
+    Args:
+        input (Tensor): magnitudes.
+        other (Tensor or Number): contains value(s) whose signbit(s) are
+            applied to the magnitudes in :attr:`input`.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(5)
+        >>> a
+        tensor([-1.2557, -0.0026, -0.5387,  0.4740, -0.9244])
+        >>> torch.copysign(a, 1)
+        tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244])
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.7079,  0.2778, -1.0249,  0.5719],
+                [-0.0059, -0.2600, -0.4475, -1.3948],
+                [ 0.3667, -0.9567, -2.5757, -0.1751],
+                [ 0.2046, -0.0742,  0.2998, -0.1054]])
+        >>> b = torch.randn(4)
+        tensor([ 0.2373,  0.3120,  0.3190, -1.1128])
+        >>> torch.copysign(a, b)
+        tensor([[ 0.7079,  0.2778,  1.0249, -0.5719],
+                [ 0.0059,  0.2600,  0.4475, -1.3948],
+                [ 0.3667,  0.9567,  2.5757, -0.1751],
+                [ 0.2046,  0.0742,  0.2998, -0.1054]])
+        >>> a = torch.tensor([1.])
+        >>> b = torch.tensor([-0.])
+        >>> torch.copysign(a, b)
+        tensor([-1.])
+    
+    .. note::
+        copysign handles signed zeros. If the other argument has a negative zero (-0),
+        the corresponding output value will be negative.
+    """
+    ...
+def corrcoef(input: Tensor) -> Tensor: 
+    r"""
+    corrcoef(input) -> Tensor
+    
+    Estimates the Pearson product-moment correlation coefficient matrix of the variables given by the :attr:`input` matrix,
+    where rows are the variables and columns are the observations.
+    
+    .. note::
+    
+        The correlation coefficient matrix R is computed using the covariance matrix C as given by
+        :math:`R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }`
+    
+    .. note::
+    
+        Due to floating point rounding, the resulting array may not be Hermitian and its diagonal elements may not be 1.
+        The real and imaginary values are clipped to the interval [-1, 1] in an attempt to improve this situation.
+    
+    Args:
+        input (Tensor): A 2D matrix containing multiple variables and observations, or a
+            Scalar or 1D vector representing a single variable.
+    
+    Returns:
+        (Tensor) The correlation coefficient matrix of the variables.
+    
+    .. seealso::
+    
+            :func:`torch.cov` covariance matrix.
+    
+    Example::
+    
+        >>> x = torch.tensor([[0, 1, 2], [2, 1, 0]])
+        >>> torch.corrcoef(x)
+        tensor([[ 1., -1.],
+                [-1.,  1.]])
+        >>> x = torch.randn(2, 4)
+        >>> x
+        tensor([[-0.2678, -0.0908, -0.3766,  0.2780],
+                [-0.5812,  0.1535,  0.2387,  0.2350]])
+        >>> torch.corrcoef(x)
+        tensor([[1.0000, 0.3582],
+                [0.3582, 1.0000]])
+        >>> torch.corrcoef(x[0])
+        tensor(1.)
+    """
+    ...
+def cos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cos(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the cosine of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \cos(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 1.4309,  1.2706, -0.8562,  0.9796])
+        >>> torch.cos(a)
+        tensor([ 0.1395,  0.2957,  0.6553,  0.5574])
+    """
+    ...
+def cos_(input: Tensor) -> Tensor: ...
+def cosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cosh(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the hyperbolic cosine of the elements of
+    :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \cosh(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.1632,  1.1835, -0.6979, -0.7325])
+        >>> torch.cosh(a)
+        tensor([ 1.0133,  1.7860,  1.2536,  1.2805])
+    
+    .. note::
+       When :attr:`input` is on the CPU, the implementation of torch.cosh may use
+       the Sleef library, which rounds very large results to infinity or negative
+       infinity. See `here <https://sleef.org/purec.xhtml>`_ for details.
+    """
+    ...
+def cosh_(input: Tensor) -> Tensor: ...
+def cosine_embedding_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ...
+def cosine_similarity(x1: Tensor, x2: Tensor, dim: _int = 1, eps: _float = 1e-08) -> Tensor: ...
+@overload
+def count_nonzero(input: Tensor, dim: Optional[_int] = None) -> Tensor: 
+    r"""
+    count_nonzero(input, dim=None) -> Tensor
+    
+    Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`.
+    If no dim is specified then all non-zeros in the tensor are counted.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros.
+    
+    Example::
+    
+        >>> x = torch.zeros(3,3)
+        >>> x[torch.randn(3,3) > 0.5] = 1
+        >>> x
+        tensor([[0., 1., 1.],
+                [0., 0., 0.],
+                [0., 0., 1.]])
+        >>> torch.count_nonzero(x)
+        tensor(3)
+        >>> torch.count_nonzero(x, dim=0)
+        tensor([0, 1, 2])
+    """
+    ...
+@overload
+def count_nonzero(input: Tensor, dim: _size) -> Tensor: 
+    r"""
+    count_nonzero(input, dim=None) -> Tensor
+    
+    Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`.
+    If no dim is specified then all non-zeros in the tensor are counted.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros.
+    
+    Example::
+    
+        >>> x = torch.zeros(3,3)
+        >>> x[torch.randn(3,3) > 0.5] = 1
+        >>> x
+        tensor([[0., 1., 1.],
+                [0., 0., 0.],
+                [0., 0., 1.]])
+        >>> torch.count_nonzero(x)
+        tensor(3)
+        >>> torch.count_nonzero(x, dim=0)
+        tensor([0, 1, 2])
+    """
+    ...
+def cov(input: Tensor, *, correction: _int = 1, fweights: Optional[Tensor] = None, aweights: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cov(input, *, correction=1, fweights=None, aweights=None) -> Tensor
+    
+    Estimates the covariance matrix of the variables given by the :attr:`input` matrix, where rows are
+    the variables and columns are the observations.
+    
+    A covariance matrix is a square matrix giving the covariance of each pair of variables. The diagonal contains
+    the variance of each variable (covariance of a variable with itself). By definition, if :attr:`input` represents
+    a single variable (Scalar or 1D) then its variance is returned.
+    
+    The sample covariance of the variables :math:`x` and :math:`y` is given by:
+    
+    .. math::
+        \text{cov}(x,y) = \frac{\sum^{N}_{i = 1}(x_{i} - \bar{x})(y_{i} - \bar{y})}{\max(0,~N~-~\delta N)}
+    
+    where :math:`\bar{x}` and :math:`\bar{y}` are the simple means of the :math:`x` and :math:`y` respectively, and
+    :math:`\delta N` is the :attr:`correction`.
+    
+    If :attr:`fweights` and/or :attr:`aweights` are provided, the weighted covariance
+    is calculated, which is given by:
+    
+    .. math::
+        \text{cov}_w(x,y) = \frac{\sum^{N}_{i = 1}w_i(x_{i} - \mu_x^*)(y_{i} - \mu_y^*)}
+        {\max(0,~\sum^{N}_{i = 1}w_i~-~\frac{\sum^{N}_{i = 1}w_ia_i}{\sum^{N}_{i = 1}w_i}~\delta N)}
+    
+    where :math:`w` denotes :attr:`fweights` or :attr:`aweights` (``f`` and ``a`` for brevity) based on whichever is
+    provided, or :math:`w = f \times a` if both are provided, and
+    :math:`\mu_x^* = \frac{\sum^{N}_{i = 1}w_ix_{i} }{\sum^{N}_{i = 1}w_i}` is the weighted mean of the variable. If not
+    provided, ``f`` and/or ``a`` can be seen as a :math:`\mathbb{1}` vector of appropriate size.
+    
+    Args:
+        input (Tensor): A 2D matrix containing multiple variables and observations, or a
+            Scalar or 1D vector representing a single variable.
+    
+    Keyword Args:
+        correction (int, optional): difference between the sample size and sample degrees of freedom.
+            Defaults to Bessel's correction, ``correction = 1`` which returns the unbiased estimate,
+            even if both :attr:`fweights` and :attr:`aweights` are specified. ``correction = 0``
+            will return the simple average. Defaults to ``1``.
+        fweights (tensor, optional): A Scalar or 1D tensor of observation vector frequencies representing the number of
+            times each observation should be repeated. Its numel must equal the number of columns of :attr:`input`.
+            Must have integral dtype. Ignored if ``None``. Defaults to ``None``.
+        aweights (tensor, optional): A Scalar or 1D array of observation vector weights.
+            These relative weights are typically large for observations considered “important” and smaller for
+            observations considered less “important”. Its numel must equal the number of columns of :attr:`input`.
+            Must have floating point dtype. Ignored if ``None``. Defaults to ``None``.
+    
+    Returns:
+        (Tensor) The covariance matrix of the variables.
+    
+    .. seealso::
+    
+            :func:`torch.corrcoef` normalized covariance matrix.
+    
+    Example::
+        >>> x = torch.tensor([[0, 2], [1, 1], [2, 0]]).T
+        >>> x
+        tensor([[0, 1, 2],
+                [2, 1, 0]])
+        >>> torch.cov(x)
+        tensor([[ 1., -1.],
+                [-1.,  1.]])
+        >>> torch.cov(x, correction=0)
+        tensor([[ 0.6667, -0.6667],
+                [-0.6667,  0.6667]])
+        >>> fw = torch.randint(1, 10, (3,))
+        >>> fw
+        tensor([1, 6, 9])
+        >>> aw = torch.rand(3)
+        >>> aw
+        tensor([0.4282, 0.0255, 0.4144])
+        >>> torch.cov(x, fweights=fw, aweights=aw)
+        tensor([[ 0.4169, -0.4169],
+                [-0.4169,  0.4169]])
+    """
+    ...
+def cross(input: Tensor, other: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cross(input, other, dim=None, *, out=None) -> Tensor
+    
+    
+    Returns the cross product of vectors in dimension :attr:`dim` of :attr:`input`
+    and :attr:`other`.
+    
+    Supports input of float, double, cfloat and cdouble dtypes. Also supports batches
+    of vectors, for which it computes the product along the dimension :attr:`dim`.
+    In this case, the output has the same batch dimensions as the inputs.
+    
+    .. warning::
+        If :attr:`dim` is not given, it defaults to the first dimension found
+        with the size 3. Note that this might be unexpected.
+    
+        This behavior is deprecated and will be changed to match that of :func:`torch.linalg.cross`
+        in a future release.
+    
+    .. seealso::
+            :func:`torch.linalg.cross` which has dim=-1 as default.
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+        dim  (int, optional): the dimension to take the cross-product in.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 3)
+        >>> a
+        tensor([[-0.3956,  1.1455,  1.6895],
+                [-0.5849,  1.3672,  0.3599],
+                [-1.1626,  0.7180, -0.0521],
+                [-0.1339,  0.9902, -2.0225]])
+        >>> b = torch.randn(4, 3)
+        >>> b
+        tensor([[-0.0257, -1.4725, -1.2251],
+                [-1.1479, -0.7005, -1.9757],
+                [-1.3904,  0.3726, -1.1836],
+                [-0.9688, -0.7153,  0.2159]])
+        >>> torch.cross(a, b, dim=1)
+        tensor([[ 1.0844, -0.5281,  0.6120],
+                [-2.4490, -1.5687,  1.9792],
+                [-0.8304, -1.3037,  0.5650],
+                [-1.2329,  1.9883,  1.0551]])
+        >>> torch.cross(a, b)
+        tensor([[ 1.0844, -0.5281,  0.6120],
+                [-2.4490, -1.5687,  1.9792],
+                [-0.8304, -1.3037,  0.5650],
+                [-1.2329,  1.9883,  1.0551]])
+    """
+    ...
+def crow_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.crow_indices`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ...
+@overload
+def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ...
+def cudnn_affine_grid_generator(theta: Tensor, N: _int, C: _int, H: _int, W: _int) -> Tensor: ...
+def cudnn_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+def cudnn_convolution(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool, *, out: Optional[Tensor] = None) -> Tensor: ...
+def cudnn_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def cudnn_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def cudnn_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool) -> Tensor: ...
+def cudnn_grid_sampler(input: Tensor, grid: Tensor) -> Tensor: ...
+def cudnn_is_acceptable(input: Tensor) -> _bool: ...
+@overload
+def cummax(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: 
+    r"""
+    cummax(input, dim, *, out=None) -> (Tensor, LongTensor)
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of
+    elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index
+    location of each maximum value found in the dimension :attr:`dim`.
+    
+    .. math::
+        y_i = max(x_1, x_2, x_3, \dots, x_i)
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (values, indices)
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> a
+        tensor([-0.3449, -1.5447,  0.0685, -1.5104, -1.1706,  0.2259,  1.4696, -1.3284,
+             1.9946, -0.8209])
+        >>> torch.cummax(a, dim=0)
+        torch.return_types.cummax(
+            values=tensor([-0.3449, -0.3449,  0.0685,  0.0685,  0.0685,  0.2259,  1.4696,  1.4696,
+             1.9946,  1.9946]),
+            indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8]))
+    """
+    ...
+@overload
+def cummax(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: 
+    r"""
+    cummax(input, dim, *, out=None) -> (Tensor, LongTensor)
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of
+    elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index
+    location of each maximum value found in the dimension :attr:`dim`.
+    
+    .. math::
+        y_i = max(x_1, x_2, x_3, \dots, x_i)
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (values, indices)
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> a
+        tensor([-0.3449, -1.5447,  0.0685, -1.5104, -1.1706,  0.2259,  1.4696, -1.3284,
+             1.9946, -0.8209])
+        >>> torch.cummax(a, dim=0)
+        torch.return_types.cummax(
+            values=tensor([-0.3449, -0.3449,  0.0685,  0.0685,  0.0685,  0.2259,  1.4696,  1.4696,
+             1.9946,  1.9946]),
+            indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8]))
+    """
+    ...
+@overload
+def cummin(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: 
+    r"""
+    cummin(input, dim, *, out=None) -> (Tensor, LongTensor)
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of
+    elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index
+    location of each minimum value found in the dimension :attr:`dim`.
+    
+    .. math::
+        y_i = min(x_1, x_2, x_3, \dots, x_i)
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (values, indices)
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> a
+        tensor([-0.2284, -0.6628,  0.0975,  0.2680, -1.3298, -0.4220, -0.3885,  1.1762,
+             0.9165,  1.6684])
+        >>> torch.cummin(a, dim=0)
+        torch.return_types.cummin(
+            values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298,
+            -1.3298, -1.3298]),
+            indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4]))
+    """
+    ...
+@overload
+def cummin(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: 
+    r"""
+    cummin(input, dim, *, out=None) -> (Tensor, LongTensor)
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of
+    elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index
+    location of each minimum value found in the dimension :attr:`dim`.
+    
+    .. math::
+        y_i = min(x_1, x_2, x_3, \dots, x_i)
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (values, indices)
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> a
+        tensor([-0.2284, -0.6628,  0.0975,  0.2680, -1.3298, -0.4220, -0.3885,  1.1762,
+             0.9165,  1.6684])
+        >>> torch.cummin(a, dim=0)
+        torch.return_types.cummin(
+            values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298,
+            -1.3298, -1.3298]),
+            indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4]))
+    """
+    ...
+@overload
+def cumprod(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cumprod(input, dim, *, dtype=None, out=None) -> Tensor
+    
+    Returns the cumulative product of elements of :attr:`input` in the dimension
+    :attr:`dim`.
+    
+    For example, if :attr:`input` is a vector of size N, the result will also be
+    a vector of size N, with elements.
+    
+    .. math::
+        y_i = x_1 \times x_2\times x_3\times \dots \times x_i
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> a
+        tensor([ 0.6001,  0.2069, -0.1919,  0.9792,  0.6727,  1.0062,  0.4126,
+                -0.2129, -0.4206,  0.1968])
+        >>> torch.cumprod(a, dim=0)
+        tensor([ 0.6001,  0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065,
+                 0.0014, -0.0006, -0.0001])
+    
+        >>> a[5] = 0.0
+        >>> torch.cumprod(a, dim=0)
+        tensor([ 0.6001,  0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000,
+                 0.0000, -0.0000, -0.0000])
+    """
+    ...
+@overload
+def cumprod(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cumprod(input, dim, *, dtype=None, out=None) -> Tensor
+    
+    Returns the cumulative product of elements of :attr:`input` in the dimension
+    :attr:`dim`.
+    
+    For example, if :attr:`input` is a vector of size N, the result will also be
+    a vector of size N, with elements.
+    
+    .. math::
+        y_i = x_1 \times x_2\times x_3\times \dots \times x_i
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> a
+        tensor([ 0.6001,  0.2069, -0.1919,  0.9792,  0.6727,  1.0062,  0.4126,
+                -0.2129, -0.4206,  0.1968])
+        >>> torch.cumprod(a, dim=0)
+        tensor([ 0.6001,  0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065,
+                 0.0014, -0.0006, -0.0001])
+    
+        >>> a[5] = 0.0
+        >>> torch.cumprod(a, dim=0)
+        tensor([ 0.6001,  0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000,
+                 0.0000, -0.0000, -0.0000])
+    """
+    ...
+@overload
+def cumsum(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cumsum(input, dim, *, dtype=None, out=None) -> Tensor
+    
+    Returns the cumulative sum of elements of :attr:`input` in the dimension
+    :attr:`dim`.
+    
+    For example, if :attr:`input` is a vector of size N, the result will also be
+    a vector of size N, with elements.
+    
+    .. math::
+        y_i = x_1 + x_2 + x_3 + \dots + x_i
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randint(1, 20, (10,))
+        >>> a
+        tensor([13,  7,  3, 10, 13,  3, 15, 10,  9, 10])
+        >>> torch.cumsum(a, dim=0)
+        tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93])
+    """
+    ...
+@overload
+def cumsum(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    cumsum(input, dim, *, dtype=None, out=None) -> Tensor
+    
+    Returns the cumulative sum of elements of :attr:`input` in the dimension
+    :attr:`dim`.
+    
+    For example, if :attr:`input` is a vector of size N, the result will also be
+    a vector of size N, with elements.
+    
+    .. math::
+        y_i = x_1 + x_2 + x_3 + \dots + x_i
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randint(1, 20, (10,))
+        >>> a
+        tensor([13,  7,  3, 10, 13,  3, 15, 10,  9, 10])
+        >>> torch.cumsum(a, dim=0)
+        tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93])
+    """
+    ...
+@overload
+def cumulative_trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: 
+    r"""
+    cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor
+    
+    Cumulatively computes the `trapezoidal rule <https://en.wikipedia.org/wiki/Trapezoidal_rule>`_
+    along :attr:`dim`. By default the spacing between elements is assumed to be 1, but
+    :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be
+    used to specify arbitrary spacing along :attr:`dim`.
+    
+    For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid`
+    and this function is that, :func:`torch.trapezoid` returns a value for each integration,
+    whereas this function returns a cumulative value for every spacing within the integration. This
+    is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum.
+    
+    Arguments:
+        y (Tensor): Values to use when computing the trapezoidal rule.
+        x (Tensor): If specified, defines spacing between values as specified above.
+    
+    Keyword arguments:
+        dx (float): constant spacing between values. If neither :attr:`x` nor :attr:`dx`
+            are specified then this defaults to 1. Effectively multiplies the result by its value.
+        dim (int): The dimension along which to compute the trapezoidal rule.
+            The last (inner-most) dimension by default.
+    
+    Examples::
+    
+        >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1.
+        >>> y = torch.tensor([1, 5, 10])
+        >>> torch.cumulative_trapezoid(y)
+        tensor([3., 10.5])
+    
+        >>> # Computes the same trapezoidal rule directly up to each element to verify
+        >>> (1 + 5) / 2
+        3.0
+        >>> (1 + 5) / 2 + (5 + 10) / 2
+        10.5
+    
+        >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2
+        >>> # NOTE: the result is the same as before, but multiplied by 2
+        >>> torch.cumulative_trapezoid(y, dx=2)
+        tensor([6., 21.])
+    
+        >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.cumulative_trapezoid(y, x)
+        tensor([6., 28.5])
+    
+        >>> # Computes the same trapezoidal rule directly up to each element to verify
+        >>> ((3 - 1) * (1 + 5)) / 2
+        6.0
+        >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2
+        28.5
+    
+        >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix
+        >>> y = torch.arange(9).reshape(3, 3)
+        tensor([[0, 1, 2],
+                [3, 4, 5],
+                [6, 7, 8]])
+        >>> torch.cumulative_trapezoid(y)
+        tensor([[ 0.5,  2.],
+                [ 3.5,  8.],
+                [ 6.5, 14.]])
+    
+        >>> # Cumulatively computes the trapezoidal rule for each column of the matrix
+        >>> torch.cumulative_trapezoid(y, dim=0)
+        tensor([[ 1.5,  2.5,  3.5],
+                [ 6.0,  8.0, 10.0]])
+    
+        >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with the same arbitrary spacing
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.cumulative_trapezoid(y, x)
+        tensor([[2., 5.],
+                [2., 5.],
+                [2., 5.]])
+    
+        >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with different arbitrary spacing per row
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]])
+        >>> torch.cumulative_trapezoid(y, x)
+        tensor([[1., 2.],
+                [2., 4.],
+                [3., 6.]])
+    """
+    ...
+@overload
+def cumulative_trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: 
+    r"""
+    cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor
+    
+    Cumulatively computes the `trapezoidal rule <https://en.wikipedia.org/wiki/Trapezoidal_rule>`_
+    along :attr:`dim`. By default the spacing between elements is assumed to be 1, but
+    :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be
+    used to specify arbitrary spacing along :attr:`dim`.
+    
+    For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid`
+    and this function is that, :func:`torch.trapezoid` returns a value for each integration,
+    whereas this function returns a cumulative value for every spacing within the integration. This
+    is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum.
+    
+    Arguments:
+        y (Tensor): Values to use when computing the trapezoidal rule.
+        x (Tensor): If specified, defines spacing between values as specified above.
+    
+    Keyword arguments:
+        dx (float): constant spacing between values. If neither :attr:`x` nor :attr:`dx`
+            are specified then this defaults to 1. Effectively multiplies the result by its value.
+        dim (int): The dimension along which to compute the trapezoidal rule.
+            The last (inner-most) dimension by default.
+    
+    Examples::
+    
+        >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1.
+        >>> y = torch.tensor([1, 5, 10])
+        >>> torch.cumulative_trapezoid(y)
+        tensor([3., 10.5])
+    
+        >>> # Computes the same trapezoidal rule directly up to each element to verify
+        >>> (1 + 5) / 2
+        3.0
+        >>> (1 + 5) / 2 + (5 + 10) / 2
+        10.5
+    
+        >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2
+        >>> # NOTE: the result is the same as before, but multiplied by 2
+        >>> torch.cumulative_trapezoid(y, dx=2)
+        tensor([6., 21.])
+    
+        >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.cumulative_trapezoid(y, x)
+        tensor([6., 28.5])
+    
+        >>> # Computes the same trapezoidal rule directly up to each element to verify
+        >>> ((3 - 1) * (1 + 5)) / 2
+        6.0
+        >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2
+        28.5
+    
+        >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix
+        >>> y = torch.arange(9).reshape(3, 3)
+        tensor([[0, 1, 2],
+                [3, 4, 5],
+                [6, 7, 8]])
+        >>> torch.cumulative_trapezoid(y)
+        tensor([[ 0.5,  2.],
+                [ 3.5,  8.],
+                [ 6.5, 14.]])
+    
+        >>> # Cumulatively computes the trapezoidal rule for each column of the matrix
+        >>> torch.cumulative_trapezoid(y, dim=0)
+        tensor([[ 1.5,  2.5,  3.5],
+                [ 6.0,  8.0, 10.0]])
+    
+        >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with the same arbitrary spacing
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.cumulative_trapezoid(y, x)
+        tensor([[2., 5.],
+                [2., 5.],
+                [2., 5.]])
+    
+        >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with different arbitrary spacing per row
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]])
+        >>> torch.cumulative_trapezoid(y, x)
+        tensor([[1., 2.],
+                [2., 4.],
+                [3., 6.]])
+    """
+    ...
+def deg2rad(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    deg2rad(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with each of the elements of :attr:`input`
+    converted from angles in degrees to radians.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]])
+        >>> torch.deg2rad(a)
+        tensor([[ 3.1416, -3.1416],
+                [ 6.2832, -6.2832],
+                [ 1.5708, -1.5708]])
+    """
+    ...
+def deg2rad_(input: Tensor) -> Tensor: ...
+@overload
+def dequantize(input: Tensor) -> Tensor: 
+    r"""
+    dequantize(tensor) -> Tensor
+    
+    Returns an fp32 Tensor by dequantizing a quantized Tensor
+    
+    Args:
+        tensor (Tensor): A quantized Tensor
+    
+    .. function:: dequantize(tensors) -> sequence of Tensors
+       :noindex:
+    
+    Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors
+    
+    Args:
+         tensors (sequence of Tensors): A list of quantized Tensors
+    """
+    ...
+@overload
+def dequantize(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: 
+    r"""
+    dequantize(tensor) -> Tensor
+    
+    Returns an fp32 Tensor by dequantizing a quantized Tensor
+    
+    Args:
+        tensor (Tensor): A quantized Tensor
+    
+    .. function:: dequantize(tensors) -> sequence of Tensors
+       :noindex:
+    
+    Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors
+    
+    Args:
+         tensors (sequence of Tensors): A list of quantized Tensors
+    """
+    ...
+def det(input: Tensor) -> Tensor: 
+    r"""
+    det(input) -> Tensor
+    
+    Alias for :func:`torch.linalg.det`
+    """
+    ...
+def detach(input: Tensor) -> Tensor: ...
+def detach_(input: Tensor) -> Tensor: ...
+def detach_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.detach`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def diag(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    diag(input, diagonal=0, *, out=None) -> Tensor
+    
+    - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor
+      with the elements of :attr:`input` as the diagonal.
+    - If :attr:`input` is a matrix (2-D tensor), then returns a 1-D tensor with
+      the diagonal elements of :attr:`input`.
+    
+    The argument :attr:`diagonal` controls which diagonal to consider:
+    
+    - If :attr:`diagonal` = 0, it is the main diagonal.
+    - If :attr:`diagonal` > 0, it is above the main diagonal.
+    - If :attr:`diagonal` < 0, it is below the main diagonal.
+    
+    Args:
+        input (Tensor): the input tensor.
+        diagonal (int, optional): the diagonal to consider
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    .. seealso::
+    
+            :func:`torch.diagonal` always returns the diagonal of its input.
+    
+            :func:`torch.diagflat` always constructs a tensor with diagonal elements
+            specified by the input.
+    
+    Examples:
+    
+    Get the square matrix where the input vector is the diagonal::
+    
+        >>> a = torch.randn(3)
+        >>> a
+        tensor([ 0.5950,-0.0872, 2.3298])
+        >>> torch.diag(a)
+        tensor([[ 0.5950, 0.0000, 0.0000],
+                [ 0.0000,-0.0872, 0.0000],
+                [ 0.0000, 0.0000, 2.3298]])
+        >>> torch.diag(a, 1)
+        tensor([[ 0.0000, 0.5950, 0.0000, 0.0000],
+                [ 0.0000, 0.0000,-0.0872, 0.0000],
+                [ 0.0000, 0.0000, 0.0000, 2.3298],
+                [ 0.0000, 0.0000, 0.0000, 0.0000]])
+    
+    Get the k-th diagonal of a given matrix::
+    
+        >>> a = torch.randn(3, 3)
+        >>> a
+        tensor([[-0.4264, 0.0255,-0.1064],
+                [ 0.8795,-0.2429, 0.1374],
+                [ 0.1029,-0.6482,-1.6300]])
+        >>> torch.diag(a, 0)
+        tensor([-0.4264,-0.2429,-1.6300])
+        >>> torch.diag(a, 1)
+        tensor([ 0.0255, 0.1374])
+    """
+    ...
+def diag_embed(input: Tensor, offset: _int = 0, dim1: _int = -2, dim2: _int = -1) -> Tensor: 
+    r"""
+    diag_embed(input, offset=0, dim1=-2, dim2=-1) -> Tensor
+    
+    Creates a tensor whose diagonals of certain 2D planes (specified by
+    :attr:`dim1` and :attr:`dim2`) are filled by :attr:`input`.
+    To facilitate creating batched diagonal matrices, the 2D planes formed by
+    the last two dimensions of the returned tensor are chosen by default.
+    
+    The argument :attr:`offset` controls which diagonal to consider:
+    
+    - If :attr:`offset` = 0, it is the main diagonal.
+    - If :attr:`offset` > 0, it is above the main diagonal.
+    - If :attr:`offset` < 0, it is below the main diagonal.
+    
+    The size of the new matrix will be calculated to make the specified diagonal
+    of the size of the last input dimension.
+    Note that for :attr:`offset` other than :math:`0`, the order of :attr:`dim1`
+    and :attr:`dim2` matters. Exchanging them is equivalent to changing the
+    sign of :attr:`offset`.
+    
+    Applying :meth:`torch.diagonal` to the output of this function with
+    the same arguments yields a tensor identical to :attr:`input`. However,
+    :meth:`torch.diagonal` has different default dimensions, so those
+    need to be explicitly specified.
+    
+    Args:
+        input (Tensor): the input tensor. Must be at least 1-dimensional.
+        offset (int, optional): which diagonal to consider. Default: 0
+            (main diagonal).
+        dim1 (int, optional): first dimension with respect to which to
+            take diagonal. Default: -2.
+        dim2 (int, optional): second dimension with respect to which to
+            take diagonal. Default: -1.
+    
+    Example::
+    
+        >>> a = torch.randn(2, 3)
+        >>> torch.diag_embed(a)
+        tensor([[[ 1.5410,  0.0000,  0.0000],
+                 [ 0.0000, -0.2934,  0.0000],
+                 [ 0.0000,  0.0000, -2.1788]],
+    
+                [[ 0.5684,  0.0000,  0.0000],
+                 [ 0.0000, -1.0845,  0.0000],
+                 [ 0.0000,  0.0000, -1.3986]]])
+    
+        >>> torch.diag_embed(a, offset=1, dim1=0, dim2=2)
+        tensor([[[ 0.0000,  1.5410,  0.0000,  0.0000],
+                 [ 0.0000,  0.5684,  0.0000,  0.0000]],
+    
+                [[ 0.0000,  0.0000, -0.2934,  0.0000],
+                 [ 0.0000,  0.0000, -1.0845,  0.0000]],
+    
+                [[ 0.0000,  0.0000,  0.0000, -2.1788],
+                 [ 0.0000,  0.0000,  0.0000, -1.3986]],
+    
+                [[ 0.0000,  0.0000,  0.0000,  0.0000],
+                 [ 0.0000,  0.0000,  0.0000,  0.0000]]])
+    """
+    ...
+def diagflat(input: Tensor, offset: _int = 0) -> Tensor: 
+    r"""
+    diagflat(input, offset=0) -> Tensor
+    
+    - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor
+      with the elements of :attr:`input` as the diagonal.
+    - If :attr:`input` is a tensor with more than one dimension, then returns a
+      2-D tensor with diagonal elements equal to a flattened :attr:`input`.
+    
+    The argument :attr:`offset` controls which diagonal to consider:
+    
+    - If :attr:`offset` = 0, it is the main diagonal.
+    - If :attr:`offset` > 0, it is above the main diagonal.
+    - If :attr:`offset` < 0, it is below the main diagonal.
+    
+    Args:
+        input (Tensor): the input tensor.
+        offset (int, optional): the diagonal to consider. Default: 0 (main
+            diagonal).
+    
+    Examples::
+    
+        >>> a = torch.randn(3)
+        >>> a
+        tensor([-0.2956, -0.9068,  0.1695])
+        >>> torch.diagflat(a)
+        tensor([[-0.2956,  0.0000,  0.0000],
+                [ 0.0000, -0.9068,  0.0000],
+                [ 0.0000,  0.0000,  0.1695]])
+        >>> torch.diagflat(a, 1)
+        tensor([[ 0.0000, -0.2956,  0.0000,  0.0000],
+                [ 0.0000,  0.0000, -0.9068,  0.0000],
+                [ 0.0000,  0.0000,  0.0000,  0.1695],
+                [ 0.0000,  0.0000,  0.0000,  0.0000]])
+    
+        >>> a = torch.randn(2, 2)
+        >>> a
+        tensor([[ 0.2094, -0.3018],
+                [-0.1516,  1.9342]])
+        >>> torch.diagflat(a)
+        tensor([[ 0.2094,  0.0000,  0.0000,  0.0000],
+                [ 0.0000, -0.3018,  0.0000,  0.0000],
+                [ 0.0000,  0.0000, -0.1516,  0.0000],
+                [ 0.0000,  0.0000,  0.0000,  1.9342]])
+    """
+    ...
+@overload
+def diagonal(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: 
+    r"""
+    diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor
+    
+    Returns a partial view of :attr:`input` with its diagonal elements
+    with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension
+    at the end of the shape.
+    
+    The argument :attr:`offset` controls which diagonal to consider:
+    
+    - If :attr:`offset` = 0, it is the main diagonal.
+    - If :attr:`offset` > 0, it is above the main diagonal.
+    - If :attr:`offset` < 0, it is below the main diagonal.
+    
+    Applying :meth:`torch.diag_embed` to the output of this function with
+    the same arguments yields a diagonal matrix with the diagonal entries
+    of the input. However, :meth:`torch.diag_embed` has different default
+    dimensions, so those need to be explicitly specified.
+    
+    Args:
+        input (Tensor): the input tensor. Must be at least 2-dimensional.
+        offset (int, optional): which diagonal to consider. Default: 0
+            (main diagonal).
+        dim1 (int, optional): first dimension with respect to which to
+            take diagonal. Default: 0.
+        dim2 (int, optional): second dimension with respect to which to
+            take diagonal. Default: 1.
+    
+    .. note::  To take a batch diagonal, pass in dim1=-2, dim2=-1.
+    
+    Examples::
+    
+        >>> a = torch.randn(3, 3)
+        >>> a
+        tensor([[-1.0854,  1.1431, -0.1752],
+                [ 0.8536, -0.0905,  0.0360],
+                [ 0.6927, -0.3735, -0.4945]])
+    
+    
+        >>> torch.diagonal(a, 0)
+        tensor([-1.0854, -0.0905, -0.4945])
+    
+    
+        >>> torch.diagonal(a, 1)
+        tensor([ 1.1431,  0.0360])
+    
+    
+        >>> x = torch.randn(2, 5, 4, 2)
+        >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2)
+        tensor([[[-1.2631,  0.3755, -1.5977, -1.8172],
+                 [-1.1065,  1.0401, -0.2235, -0.7938]],
+    
+                [[-1.7325, -0.3081,  0.6166,  0.2335],
+                 [ 1.0500,  0.7336, -0.3836, -1.1015]]])
+    """
+    ...
+@overload
+def diagonal(input: Tensor, *, outdim: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None], dim2: Union[str, ellipsis, None], offset: _int = 0) -> Tensor: 
+    r"""
+    diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor
+    
+    Returns a partial view of :attr:`input` with its diagonal elements
+    with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension
+    at the end of the shape.
+    
+    The argument :attr:`offset` controls which diagonal to consider:
+    
+    - If :attr:`offset` = 0, it is the main diagonal.
+    - If :attr:`offset` > 0, it is above the main diagonal.
+    - If :attr:`offset` < 0, it is below the main diagonal.
+    
+    Applying :meth:`torch.diag_embed` to the output of this function with
+    the same arguments yields a diagonal matrix with the diagonal entries
+    of the input. However, :meth:`torch.diag_embed` has different default
+    dimensions, so those need to be explicitly specified.
+    
+    Args:
+        input (Tensor): the input tensor. Must be at least 2-dimensional.
+        offset (int, optional): which diagonal to consider. Default: 0
+            (main diagonal).
+        dim1 (int, optional): first dimension with respect to which to
+            take diagonal. Default: 0.
+        dim2 (int, optional): second dimension with respect to which to
+            take diagonal. Default: 1.
+    
+    .. note::  To take a batch diagonal, pass in dim1=-2, dim2=-1.
+    
+    Examples::
+    
+        >>> a = torch.randn(3, 3)
+        >>> a
+        tensor([[-1.0854,  1.1431, -0.1752],
+                [ 0.8536, -0.0905,  0.0360],
+                [ 0.6927, -0.3735, -0.4945]])
+    
+    
+        >>> torch.diagonal(a, 0)
+        tensor([-1.0854, -0.0905, -0.4945])
+    
+    
+        >>> torch.diagonal(a, 1)
+        tensor([ 1.1431,  0.0360])
+    
+    
+        >>> x = torch.randn(2, 5, 4, 2)
+        >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2)
+        tensor([[[-1.2631,  0.3755, -1.5977, -1.8172],
+                 [-1.1065,  1.0401, -0.2235, -0.7938]],
+    
+                [[-1.7325, -0.3081,  0.6166,  0.2335],
+                 [ 1.0500,  0.7336, -0.3836, -1.1015]]])
+    """
+    ...
+def diagonal_copy(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.diagonal`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def diagonal_scatter(input: Tensor, src: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: 
+    r"""
+    diagonal_scatter(input, src, offset=0, dim1=0, dim2=1) -> Tensor
+    
+    Embeds the values of the :attr:`src` tensor into :attr:`input` along
+    the diagonal elements of :attr:`input`, with respect to :attr:`dim1`
+    and :attr:`dim2`.
+    
+    This function returns a tensor with fresh storage; it does not
+    return a view.
+    
+    The argument :attr:`offset` controls which diagonal to consider:
+    
+    - If :attr:`offset` = 0, it is the main diagonal.
+    - If :attr:`offset` > 0, it is above the main diagonal.
+    - If :attr:`offset` < 0, it is below the main diagonal.
+    
+    Args:
+        input (Tensor): the input tensor. Must be at least 2-dimensional.
+        src (Tensor): the tensor to embed into :attr:`input`.
+        offset (int, optional): which diagonal to consider. Default: 0
+            (main diagonal).
+        dim1 (int, optional): first dimension with respect to which to
+            take diagonal. Default: 0.
+        dim2 (int, optional): second dimension with respect to which to
+            take diagonal. Default: 1.
+    
+    .. note::
+    
+        :attr:`src` must be of the proper size in order to be embedded
+        into :attr:`input`. Specifically, it should have the same shape as
+        ``torch.diagonal(input, offset, dim1, dim2)``
+    
+    Examples::
+    
+        >>> a = torch.zeros(3, 3)
+        >>> a
+        tensor([[0., 0., 0.],
+                [0., 0., 0.],
+                [0., 0., 0.]])
+    
+        >>> torch.diagonal_scatter(a, torch.ones(3), 0)
+        tensor([[1., 0., 0.],
+                [0., 1., 0.],
+                [0., 0., 1.]])
+    
+        >>> torch.diagonal_scatter(a, torch.ones(2), 1)
+        tensor([[0., 1., 0.],
+                [0., 0., 1.],
+                [0., 0., 0.]])
+    """
+    ...
+def diff(input: Tensor, n: _int = 1, dim: _int = -1, prepend: Optional[Tensor] = None, append: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    diff(input, n=1, dim=-1, prepend=None, append=None, *, out=None) -> Tensor
+    
+    Computes the n-th forward difference along the given dimension.
+    
+    The first-order differences are given by ``out[i] = input[i + 1] - input[i]``. Higher-order
+    differences are calculated by using :func:`torch.diff` recursively.
+    
+    Args:
+        input (Tensor): the tensor to compute the differences on
+        n (int, optional): the number of times to recursively compute the difference
+        dim (int, optional): the dimension to compute the difference along.
+            Default is the last dimension.
+        prepend, append (Tensor, optional): values to prepend or append to
+            :attr:`input` along :attr:`dim` before computing the difference.
+            They must have the same number of dimensions as :attr:`input`, and their
+            shapes must match :attr:`input`'s shape except on :attr:`dim`.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, 3, 2])
+        >>> torch.diff(a)
+        tensor([ 2, -1])
+        >>> b = torch.tensor([4, 5])
+        >>> torch.diff(a, append=b)
+        tensor([ 2, -1,  2,  1])
+        >>> c = torch.tensor([[1, 2, 3], [3, 4, 5]])
+        >>> torch.diff(c, dim=0)
+        tensor([[2, 2, 2]])
+        >>> torch.diff(c, dim=1)
+        tensor([[1, 1],
+                [1, 1]])
+    """
+    ...
+def digamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    digamma(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.digamma`.
+    """
+    ...
+def dist(input: Tensor, other: Tensor, p: Union[Number, _complex] = 2) -> Tensor: 
+    r"""
+    dist(input, other, p=2) -> Tensor
+    
+    Returns the p-norm of (:attr:`input` - :attr:`other`).
+    
+    The shapes of :attr:`input` and :attr:`other` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the right-hand-side input tensor.
+        p (float, optional): the norm to be computed
+    
+    Example::
+    
+        >>> x = torch.randn(4)
+        >>> x
+        tensor([-1.5393, -0.8675,  0.5916,  1.6321])
+        >>> y = torch.randn(4)
+        >>> y
+        tensor([ 0.0967, -1.0511,  0.6295,  0.8360])
+        >>> torch.dist(x, y, 3.5)
+        tensor(1.6727)
+        >>> torch.dist(x, y, 3)
+        tensor(1.6973)
+        >>> torch.dist(x, y, 0)
+        tensor(4.)
+        >>> torch.dist(x, y, 1)
+        tensor(2.6537)
+    """
+    ...
+def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    div(input, other, *, rounding_mode=None, out=None) -> Tensor
+    
+    Divides each element of the input ``input`` by the corresponding element of
+    :attr:`other`.
+    
+    .. math::
+        \text{out}_i = \frac{\text{input}_i}{\text{other}_i}
+    
+    .. note::
+        By default, this performs a "true" division like Python 3.
+        See the :attr:`rounding_mode` argument for floor division.
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    Always promotes integer types to the default scalar type.
+    
+    Args:
+        input (Tensor): the dividend
+        other (Tensor or Number): the divisor
+    
+    Keyword args:
+        rounding_mode (str, optional): Type of rounding applied to the result:
+    
+            * None - default behavior. Performs no rounding and, if both :attr:`input` and
+              :attr:`other` are integer types, promotes the inputs to the default scalar type.
+              Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``.
+            * ``"trunc"`` - rounds the results of the division towards zero.
+              Equivalent to C-style integer division.
+            * ``"floor"`` - rounds the results of the division down.
+              Equivalent to floor division in Python (the ``//`` operator) and NumPy's ``np.floor_divide``.
+    
+        out (Tensor, optional): the output tensor.
+    
+    Examples::
+    
+        >>> x = torch.tensor([ 0.3810,  1.2774, -0.2972, -0.3719,  0.4637])
+        >>> torch.div(x, 0.5)
+        tensor([ 0.7620,  2.5548, -0.5944, -0.7438,  0.9274])
+    
+        >>> a = torch.tensor([[-0.3711, -1.9353, -0.4605, -0.2917],
+        ...                   [ 0.1815, -1.0111,  0.9805, -1.5923],
+        ...                   [ 0.1062,  1.4581,  0.7759, -1.2344],
+        ...                   [-0.1830, -0.0313,  1.1908, -1.4757]])
+        >>> b = torch.tensor([ 0.8032,  0.2930, -0.8113, -0.2308])
+        >>> torch.div(a, b)
+        tensor([[-0.4620, -6.6051,  0.5676,  1.2639],
+                [ 0.2260, -3.4509, -1.2086,  6.8990],
+                [ 0.1322,  4.9764, -0.9564,  5.3484],
+                [-0.2278, -0.1068, -1.4678,  6.3938]])
+    
+        >>> torch.div(a, b, rounding_mode='trunc')
+        tensor([[-0., -6.,  0.,  1.],
+                [ 0., -3., -1.,  6.],
+                [ 0.,  4., -0.,  5.],
+                [-0., -0., -1.,  6.]])
+    
+        >>> torch.div(a, b, rounding_mode='floor')
+        tensor([[-1., -7.,  0.,  1.],
+                [ 0., -4., -2.,  6.],
+                [ 0.,  4., -1.,  5.],
+                [-1., -1., -2.,  6.]])
+    """
+    ...
+@overload
+def divide(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    divide(input, other, *, rounding_mode=None, out=None) -> Tensor
+    
+    Alias for :func:`torch.div`.
+    """
+    ...
+@overload
+def divide(input: Tensor, other: Tensor, *, rounding_mode: Optional[str], out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    divide(input, other, *, rounding_mode=None, out=None) -> Tensor
+    
+    Alias for :func:`torch.div`.
+    """
+    ...
+@overload
+def divide(input: Tensor, other: Union[Number, _complex], *, rounding_mode: Optional[str]) -> Tensor: 
+    r"""
+    divide(input, other, *, rounding_mode=None, out=None) -> Tensor
+    
+    Alias for :func:`torch.div`.
+    """
+    ...
+@overload
+def divide(input: Tensor, other: Union[Number, _complex]) -> Tensor: 
+    r"""
+    divide(input, other, *, rounding_mode=None, out=None) -> Tensor
+    
+    Alias for :func:`torch.div`.
+    """
+    ...
+def dot(input: Tensor, tensor: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    dot(input, tensor, *, out=None) -> Tensor
+    
+    Computes the dot product of two 1D tensors.
+    
+    .. note::
+    
+        Unlike NumPy's dot, torch.dot intentionally only supports computing the dot product
+        of two 1D tensors with the same number of elements.
+    
+    Args:
+        input (Tensor): first tensor in the dot product, must be 1D.
+        tensor (Tensor): second tensor in the dot product, must be 1D.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.dot(torch.tensor([2, 3]), torch.tensor([2, 1]))
+        tensor(7)
+    """
+    ...
+def dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ...
+@overload
+def dsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: 
+    r"""
+    dsplit(input, indices_or_sections) -> Tuple of Tensors
+    
+    Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors
+    depthwise according to :attr:`indices_or_sections`. Each split is a view of
+    :attr:`input`.
+    
+    This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2)
+    (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer
+    it must evenly divide the split dimension or a runtime error will be thrown.
+    
+    This function is based on NumPy's :func:`numpy.dsplit`.
+    
+    Args:
+        input (Tensor): tensor to split.
+        indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`.
+    
+    Example::
+        >>> t = torch.arange(16.0).reshape(2, 2, 4)
+        >>> t
+        tensor([[[ 0.,  1.,  2.,  3.],
+                 [ 4.,  5.,  6.,  7.]],
+                [[ 8.,  9., 10., 11.],
+                 [12., 13., 14., 15.]]])
+        >>> torch.dsplit(t, 2)
+        (tensor([[[ 0.,  1.],
+                  [ 4.,  5.]],
+                 [[ 8.,  9.],
+                  [12., 13.]]]),
+         tensor([[[ 2.,  3.],
+                  [ 6.,  7.]],
+                 [[10., 11.],
+                  [14., 15.]]]))
+    
+        >>> torch.dsplit(t, [3, 6])
+        (tensor([[[ 0.,  1.,  2.],
+                  [ 4.,  5.,  6.]],
+                 [[ 8.,  9., 10.],
+                  [12., 13., 14.]]]),
+         tensor([[[ 3.],
+                  [ 7.]],
+                 [[11.],
+                  [15.]]]),
+         tensor([], size=(2, 2, 0)))
+    """
+    ...
+@overload
+def dsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: 
+    r"""
+    dsplit(input, indices_or_sections) -> Tuple of Tensors
+    
+    Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors
+    depthwise according to :attr:`indices_or_sections`. Each split is a view of
+    :attr:`input`.
+    
+    This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2)
+    (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer
+    it must evenly divide the split dimension or a runtime error will be thrown.
+    
+    This function is based on NumPy's :func:`numpy.dsplit`.
+    
+    Args:
+        input (Tensor): tensor to split.
+        indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`.
+    
+    Example::
+        >>> t = torch.arange(16.0).reshape(2, 2, 4)
+        >>> t
+        tensor([[[ 0.,  1.,  2.,  3.],
+                 [ 4.,  5.,  6.,  7.]],
+                [[ 8.,  9., 10., 11.],
+                 [12., 13., 14., 15.]]])
+        >>> torch.dsplit(t, 2)
+        (tensor([[[ 0.,  1.],
+                  [ 4.,  5.]],
+                 [[ 8.,  9.],
+                  [12., 13.]]]),
+         tensor([[[ 2.,  3.],
+                  [ 6.,  7.]],
+                 [[10., 11.],
+                  [14., 15.]]]))
+    
+        >>> torch.dsplit(t, [3, 6])
+        (tensor([[[ 0.,  1.,  2.],
+                  [ 4.,  5.,  6.]],
+                 [[ 8.,  9., 10.],
+                  [12., 13., 14.]]]),
+         tensor([[[ 3.],
+                  [ 7.]],
+                 [[11.],
+                  [15.]]]),
+         tensor([], size=(2, 2, 0)))
+    """
+    ...
+def dstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    dstack(tensors, *, out=None) -> Tensor
+    
+    Stack tensors in sequence depthwise (along third axis).
+    
+    This is equivalent to concatenation along the third axis after 1-D and 2-D tensors have been reshaped by :func:`torch.atleast_3d`.
+    
+    Args:
+        tensors (sequence of Tensors): sequence of tensors to concatenate
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5, 6])
+        >>> torch.dstack((a,b))
+        tensor([[[1, 4],
+                 [2, 5],
+                 [3, 6]]])
+        >>> a = torch.tensor([[1],[2],[3]])
+        >>> b = torch.tensor([[4],[5],[6]])
+        >>> torch.dstack((a,b))
+        tensor([[[1, 4]],
+                [[2, 5]],
+                [[3, 6]]])
+    """
+    ...
+def embedding(weight: Tensor, indices: Tensor, padding_idx: Union[_int, SymInt] = -1, scale_grad_by_freq: _bool = False, sparse: _bool = False) -> Tensor: ...
+@overload
+def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool, mode: _int, sparse: _bool, per_sample_weights: Optional[Tensor], include_last_offset: _bool, padding_idx: Optional[_int]) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+@overload
+def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+def embedding_renorm_(input: Tensor, indices: Tensor, max_norm: _float, norm_type: _float) -> Tensor: ...
+@overload
+def empty(size: Sequence[Union[_int, SymInt]], *, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor
+    
+    Returns a tensor filled with uninitialized data. The shape of the tensor is
+    defined by the variable argument :attr:`size`.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.contiguous_format``.
+    
+    Example::
+    
+        >>> torch.empty((2,3), dtype=torch.int64)
+        tensor([[ 9.4064e+13,  2.8000e+01,  9.3493e+13],
+                [ 7.5751e+18,  7.1428e+18,  7.5955e+18]])
+    """
+    ...
+@overload
+def empty(*size: _int, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor
+    
+    Returns a tensor filled with uninitialized data. The shape of the tensor is
+    defined by the variable argument :attr:`size`.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.contiguous_format``.
+    
+    Example::
+    
+        >>> torch.empty((2,3), dtype=torch.int64)
+        tensor([[ 9.4064e+13,  2.8000e+01,  9.3493e+13],
+                [ 7.5751e+18,  7.1428e+18,  7.5955e+18]])
+    """
+    ...
+@overload
+def empty(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor
+    
+    Returns a tensor filled with uninitialized data. The shape of the tensor is
+    defined by the variable argument :attr:`size`.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.contiguous_format``.
+    
+    Example::
+    
+        >>> torch.empty((2,3), dtype=torch.int64)
+        tensor([[ 9.4064e+13,  2.8000e+01,  9.3493e+13],
+                [ 7.5751e+18,  7.1428e+18,  7.5955e+18]])
+    """
+    ...
+@overload
+def empty(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor
+    
+    Returns a tensor filled with uninitialized data. The shape of the tensor is
+    defined by the variable argument :attr:`size`.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.contiguous_format``.
+    
+    Example::
+    
+        >>> torch.empty((2,3), dtype=torch.int64)
+        tensor([[ 9.4064e+13,  2.8000e+01,  9.3493e+13],
+                [ 7.5751e+18,  7.1428e+18,  7.5955e+18]])
+    """
+    ...
+def empty_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns an uninitialized tensor with the same size as :attr:`input`.
+    ``torch.empty_like(input)`` is equivalent to
+    ``torch.empty(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine the size of the output tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    
+    Example::
+    
+        >>> a=torch.empty((2,3), dtype=torch.int32, device = 'cuda')
+        >>> torch.empty_like(a)
+        tensor([[0, 0, 0],
+                [0, 0, 0]], device='cuda:0', dtype=torch.int32)
+    """
+    ...
+def empty_permuted(size: Sequence[Union[_int, SymInt]], physical_layout: _size, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty_permuted(size, physical_layout, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Creates an uninitialized, non-overlapping and dense tensor with the
+    specified :attr:`size`, with :attr:`physical_layout` specifying how the
+    dimensions are physically laid out in memory (each logical dimension is listed
+    from outermost to innermost).  :attr:`physical_layout` is a generalization
+    of NCHW/NHWC notation: if each dimension is assigned a number according to
+    what order they occur in size (N=0, C=1, H=2, W=3), then NCHW is ``(0, 1, 2, 3)``
+    while NHWC is ``(0, 2, 3, 1)``.  Equivalently, the strides of the output
+    tensor ``t`` are such that ``t.stride(physical_layout[i]) == contiguous_strides[i]``
+    (notably, this function is *not* equivalent to ``torch.empty(size).permute(physical_layout)``).
+    
+    Unlike :func:`torch.empty_strided`, this is guaranteed to produce a dense
+    tensor with no overlaps.  If possible, prefer using this function over
+    :func:`torch.empty_strided` or manual use of :func:`torch.as_strided`.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        size (tuple of int): the shape of the output tensor
+        physical_layout (tuple of int): the ordering of dimensions physically in memory
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Examples:
+    
+        >>> torch.empty((2, 3, 5, 7)).stride()
+        (105, 35, 7, 1)
+        >>> torch.empty_permuted((2, 3, 5, 7), (0, 1, 2, 3)).stride()
+        (105, 35, 7, 1)
+        >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).stride()
+        (105, 1, 21, 3)
+        >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).stride()
+        (105, 1, 21, 3)
+        >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).dim_order()
+        (0, 2, 3, 1)
+    """
+    ...
+def empty_quantized(size: _size, qtensor: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+def empty_strided(size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Creates a tensor with the specified :attr:`size` and :attr:`stride` and filled with undefined data.
+    
+    .. warning::
+        If the constructed tensor is "overlapped" (with multiple indices referring to the same element
+        in memory) its behavior is undefined.
+    
+    .. note::
+        If :func:`torch.use_deterministic_algorithms()` and
+        :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to
+        ``True``, the output tensor is initialized to prevent any possible
+        nondeterministic behavior from using the data as an input to an operation.
+        Floating point and complex tensors are filled with NaN, and integer tensors
+        are filled with the maximum value.
+    
+    Args:
+        size (tuple of int): the shape of the output tensor
+        stride (tuple of int): the strides of the output tensor
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> a = torch.empty_strided((2, 3), (1, 2))
+        >>> a
+        tensor([[8.9683e-44, 4.4842e-44, 5.1239e+07],
+                [0.0000e+00, 0.0000e+00, 3.0705e-41]])
+        >>> a.stride()
+        (1, 2)
+        >>> a.size()
+        torch.Size([2, 3])
+    """
+    ...
+@overload
+def eq(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    eq(input, other, *, out=None) -> Tensor
+    
+    Computes element-wise equality
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[ True, False],
+                [False,  True]])
+    """
+    ...
+@overload
+def eq(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    eq(input, other, *, out=None) -> Tensor
+    
+    Computes element-wise equality
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[ True, False],
+                [False,  True]])
+    """
+    ...
+def equal(input: Tensor, other: Tensor) -> _bool: 
+    r"""
+    equal(input, other) -> bool
+    
+    ``True`` if two tensors have the same size and elements, ``False`` otherwise.
+    
+    Example::
+    
+        >>> torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2]))
+        True
+    """
+    ...
+def erf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    erf(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.erf`.
+    """
+    ...
+def erf_(input: Tensor) -> Tensor: ...
+def erfc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    erfc(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.erfc`.
+    """
+    ...
+def erfc_(input: Tensor) -> Tensor: ...
+def erfinv(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    erfinv(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.erfinv`.
+    """
+    ...
+def exp(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    exp(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the exponential of the elements
+    of the input tensor :attr:`input`.
+    
+    .. math::
+        y_{i} = e^{x_{i}}
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.exp(torch.tensor([0, math.log(2.)]))
+        tensor([ 1.,  2.])
+    """
+    ...
+def exp2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    exp2(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.exp2`.
+    """
+    ...
+def exp2_(input: Tensor) -> Tensor: ...
+def exp_(input: Tensor) -> Tensor: ...
+def expand_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, implicit: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :meth:`torch.Tensor.expand`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def expm1(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    expm1(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.expm1`.
+    """
+    ...
+def expm1_(input: Tensor) -> Tensor: ...
+@overload
+def eye(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 2-D tensor with ones on the diagonal and zeros elsewhere.
+    
+    Args:
+        n (int): the number of rows
+        m (int, optional): the number of columns with default being :attr:`n`
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere
+    
+    Example::
+    
+        >>> torch.eye(3)
+        tensor([[ 1.,  0.,  0.],
+                [ 0.,  1.,  0.],
+                [ 0.,  0.,  1.]])
+    """
+    ...
+@overload
+def eye(n: Union[_int, SymInt], m: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 2-D tensor with ones on the diagonal and zeros elsewhere.
+    
+    Args:
+        n (int): the number of rows
+        m (int, optional): the number of columns with default being :attr:`n`
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere
+    
+    Example::
+    
+        >>> torch.eye(3)
+        tensor([[ 1.,  0.,  0.],
+                [ 0.,  1.,  0.],
+                [ 0.,  0.,  1.]])
+    """
+    ...
+def fake_quantize_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int) -> Tensor: 
+    r"""
+    fake_quantize_per_channel_affine(input, scale, zero_point, axis, quant_min, quant_max) -> Tensor
+    
+    Returns a new tensor with the data in :attr:`input` fake quantized per channel using :attr:`scale`,
+    :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`, across the channel specified by :attr:`axis`.
+    
+    .. math::
+        \text{output} = (
+            min(
+                \text{quant\_max},
+                max(
+                    \text{quant\_min},
+                    \text{std::nearbyint}(\text{input} / \text{scale}) + \text{zero\_point}
+                )
+            ) - \text{zero\_point}
+        ) \times \text{scale}
+    
+    Args:
+        input (Tensor): the input value(s), in ``torch.float32``
+        scale (Tensor): quantization scale, per channel in ``torch.float32``
+        zero_point (Tensor): quantization zero_point, per channel in ``torch.int32`` or ``torch.half`` or ``torch.float32``
+        axis (int32): channel axis
+        quant_min (int64): lower bound of the quantized domain
+        quant_max (int64): upper bound of the quantized domain
+    
+    Returns:
+        Tensor: A newly fake_quantized per channel ``torch.float32`` tensor
+    
+    Example::
+    
+        >>> x = torch.randn(2, 2, 2)
+        >>> x
+        tensor([[[-0.2525, -0.0466],
+                 [ 0.3491, -0.2168]],
+    
+                [[-0.5906,  1.6258],
+                 [ 0.6444, -0.0542]]])
+        >>> scales = (torch.randn(2) + 1) * 0.05
+        >>> scales
+        tensor([0.0475, 0.0486])
+        >>> zero_points = torch.zeros(2).to(torch.int32)
+        >>> zero_points
+        tensor([0, 0])
+        >>> torch.fake_quantize_per_channel_affine(x, scales, zero_points, 1, 0, 255)
+        tensor([[[0.0000, 0.0000],
+                 [0.3405, 0.0000]],
+    
+                [[0.0000, 1.6134],
+                 [0.6323, 0.0000]]])
+    """
+    ...
+@overload
+def fake_quantize_per_tensor_affine(input: Tensor, scale: _float, zero_point: _int, quant_min: _int, quant_max: _int) -> Tensor: 
+    r"""
+    fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor
+    
+    Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`,
+    :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`.
+    
+    .. math::
+        \text{output} = (
+            min(
+                \text{quant\_max},
+                max(
+                    \text{quant\_min},
+                    \text{std::nearbyint}(\text{input} / \text{scale}) + \text{zero\_point}
+                )
+            ) - \text{zero\_point}
+        ) \times \text{scale}
+    
+    Args:
+        input (Tensor): the input value(s), ``torch.float32`` tensor
+        scale (double scalar or ``float32`` Tensor): quantization scale
+        zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point
+        quant_min (int64): lower bound of the quantized domain
+        quant_max (int64): upper bound of the quantized domain
+    
+    Returns:
+        Tensor: A newly fake_quantized ``torch.float32`` tensor
+    
+    Example::
+    
+        >>> x = torch.randn(4)
+        >>> x
+        tensor([ 0.0552,  0.9730,  0.3973, -1.0780])
+        >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255)
+        tensor([0.1000, 1.0000, 0.4000, 0.0000])
+        >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255)
+        tensor([0.1000, 1.0000, 0.4000, 0.0000])
+    """
+    ...
+@overload
+def fake_quantize_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int) -> Tensor: 
+    r"""
+    fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor
+    
+    Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`,
+    :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`.
+    
+    .. math::
+        \text{output} = (
+            min(
+                \text{quant\_max},
+                max(
+                    \text{quant\_min},
+                    \text{std::nearbyint}(\text{input} / \text{scale}) + \text{zero\_point}
+                )
+            ) - \text{zero\_point}
+        ) \times \text{scale}
+    
+    Args:
+        input (Tensor): the input value(s), ``torch.float32`` tensor
+        scale (double scalar or ``float32`` Tensor): quantization scale
+        zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point
+        quant_min (int64): lower bound of the quantized domain
+        quant_max (int64): upper bound of the quantized domain
+    
+    Returns:
+        Tensor: A newly fake_quantized ``torch.float32`` tensor
+    
+    Example::
+    
+        >>> x = torch.randn(4)
+        >>> x
+        tensor([ 0.0552,  0.9730,  0.3973, -1.0780])
+        >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255)
+        tensor([0.1000, 1.0000, 0.4000, 0.0000])
+        >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255)
+        tensor([0.1000, 1.0000, 0.4000, 0.0000])
+    """
+    ...
+def fbgemm_linear_fp16_weight(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ...
+def fbgemm_linear_fp16_weight_fp32_activation(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ...
+def fbgemm_linear_int8_weight(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ...
+def fbgemm_linear_int8_weight_fp32_activation(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ...
+def fbgemm_linear_quantize_weight(input: Tensor) -> Tuple[Tensor, Tensor, _float, _int]: ...
+def fbgemm_pack_gemm_matrix_fp16(input: Tensor) -> Tensor: ...
+@overload
+def fbgemm_pack_quantized_matrix(input: Tensor) -> Tensor: ...
+@overload
+def fbgemm_pack_quantized_matrix(input: Tensor, K: _int, N: _int) -> Tensor: ...
+def feature_alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def feature_alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def feature_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+def feature_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ...
+@overload
+def fill(input: Tensor, value: Tensor) -> Tensor: ...
+@overload
+def fill(input: Tensor, value: Union[Number, _complex]) -> Tensor: ...
+@overload
+def fill_(input: Tensor, value: Tensor) -> Tensor: ...
+@overload
+def fill_(input: Tensor, value: Union[Number, _complex]) -> Tensor: ...
+def fix(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    fix(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.trunc`
+    """
+    ...
+def fix_(input: Tensor) -> Tensor: ...
+@overload
+def flatten(input: Tensor, start_dim: _int = 0, end_dim: _int = -1) -> Tensor: 
+    r"""
+    flatten(input, start_dim=0, end_dim=-1) -> Tensor
+    
+    Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim`
+    are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened.
+    The order of elements in :attr:`input` is unchanged.
+    
+    Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view,
+    or a copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can
+    be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the
+    flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned.
+    
+    .. note::
+        Flattening a zero-dimensional tensor will return a one-dimensional view.
+    
+    Args:
+        input (Tensor): the input tensor.
+        start_dim (int): the first dim to flatten
+        end_dim (int): the last dim to flatten
+    
+    Example::
+    
+        >>> t = torch.tensor([[[1, 2],
+        ...                    [3, 4]],
+        ...                   [[5, 6],
+        ...                    [7, 8]]])
+        >>> torch.flatten(t)
+        tensor([1, 2, 3, 4, 5, 6, 7, 8])
+        >>> torch.flatten(t, start_dim=1)
+        tensor([[1, 2, 3, 4],
+                [5, 6, 7, 8]])
+    """
+    ...
+@overload
+def flatten(input: Tensor, start_dim: _int, end_dim: _int, out_dim: Union[str, ellipsis, None]) -> Tensor: 
+    r"""
+    flatten(input, start_dim=0, end_dim=-1) -> Tensor
+    
+    Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim`
+    are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened.
+    The order of elements in :attr:`input` is unchanged.
+    
+    Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view,
+    or a copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can
+    be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the
+    flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned.
+    
+    .. note::
+        Flattening a zero-dimensional tensor will return a one-dimensional view.
+    
+    Args:
+        input (Tensor): the input tensor.
+        start_dim (int): the first dim to flatten
+        end_dim (int): the last dim to flatten
+    
+    Example::
+    
+        >>> t = torch.tensor([[[1, 2],
+        ...                    [3, 4]],
+        ...                   [[5, 6],
+        ...                    [7, 8]]])
+        >>> torch.flatten(t)
+        tensor([1, 2, 3, 4, 5, 6, 7, 8])
+        >>> torch.flatten(t, start_dim=1)
+        tensor([[1, 2, 3, 4],
+                [5, 6, 7, 8]])
+    """
+    ...
+@overload
+def flatten(input: Tensor, start_dim: Union[str, ellipsis, None], end_dim: Union[str, ellipsis, None], out_dim: Union[str, ellipsis, None]) -> Tensor: 
+    r"""
+    flatten(input, start_dim=0, end_dim=-1) -> Tensor
+    
+    Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim`
+    are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened.
+    The order of elements in :attr:`input` is unchanged.
+    
+    Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view,
+    or a copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can
+    be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the
+    flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned.
+    
+    .. note::
+        Flattening a zero-dimensional tensor will return a one-dimensional view.
+    
+    Args:
+        input (Tensor): the input tensor.
+        start_dim (int): the first dim to flatten
+        end_dim (int): the last dim to flatten
+    
+    Example::
+    
+        >>> t = torch.tensor([[[1, 2],
+        ...                    [3, 4]],
+        ...                   [[5, 6],
+        ...                    [7, 8]]])
+        >>> torch.flatten(t)
+        tensor([1, 2, 3, 4, 5, 6, 7, 8])
+        >>> torch.flatten(t, start_dim=1)
+        tensor([[1, 2, 3, 4],
+                [5, 6, 7, 8]])
+    """
+    ...
+@overload
+def flatten(input: Tensor, dims: Sequence[Union[str, ellipsis, None]], out_dim: Union[str, ellipsis, None]) -> Tensor: 
+    r"""
+    flatten(input, start_dim=0, end_dim=-1) -> Tensor
+    
+    Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim`
+    are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened.
+    The order of elements in :attr:`input` is unchanged.
+    
+    Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view,
+    or a copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can
+    be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the
+    flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned.
+    
+    .. note::
+        Flattening a zero-dimensional tensor will return a one-dimensional view.
+    
+    Args:
+        input (Tensor): the input tensor.
+        start_dim (int): the first dim to flatten
+        end_dim (int): the last dim to flatten
+    
+    Example::
+    
+        >>> t = torch.tensor([[[1, 2],
+        ...                    [3, 4]],
+        ...                   [[5, 6],
+        ...                    [7, 8]]])
+        >>> torch.flatten(t)
+        tensor([1, 2, 3, 4, 5, 6, 7, 8])
+        >>> torch.flatten(t, start_dim=1)
+        tensor([[1, 2, 3, 4],
+                [5, 6, 7, 8]])
+    """
+    ...
+def flip(input: Tensor, dims: _size) -> Tensor: 
+    r"""
+    flip(input, dims) -> Tensor
+    
+    Reverses the order of an n-D tensor along each of the axes given in :attr:`dims`.
+    
+    .. note::
+        `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`,
+        which returns a view in constant time. Since copying a tensor's data is more work than viewing that data,
+        `torch.flip` is expected to be slower than `np.flip`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dims (a list or tuple): axes to flip on
+    
+    Example::
+    
+        >>> x = torch.arange(8).view(2, 2, 2)
+        >>> x
+        tensor([[[ 0,  1],
+                 [ 2,  3]],
+    
+                [[ 4,  5],
+                 [ 6,  7]]])
+        >>> torch.flip(x, [0, 1])
+        tensor([[[ 6,  7],
+                 [ 4,  5]],
+    
+                [[ 2,  3],
+                 [ 0,  1]]])
+    """
+    ...
+def fliplr(input: Tensor) -> Tensor: 
+    r"""
+    fliplr(input) -> Tensor
+    
+    Flip tensor in the left/right direction, returning a new tensor.
+    
+    Flip the entries in each row in the left/right direction.
+    Columns are preserved, but appear in a different order than before.
+    
+    Note:
+        Requires the tensor to be at least 2-D.
+    
+    .. note::
+        `torch.fliplr` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.fliplr`,
+        which returns a view in constant time. Since copying a tensor's data is more work than viewing that data,
+        `torch.fliplr` is expected to be slower than `np.fliplr`.
+    
+    Args:
+        input (Tensor): Must be at least 2-dimensional.
+    
+    Example::
+    
+        >>> x = torch.arange(4).view(2, 2)
+        >>> x
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.fliplr(x)
+        tensor([[1, 0],
+                [3, 2]])
+    """
+    ...
+def flipud(input: Tensor) -> Tensor: 
+    r"""
+    flipud(input) -> Tensor
+    
+    Flip tensor in the up/down direction, returning a new tensor.
+    
+    Flip the entries in each column in the up/down direction.
+    Rows are preserved, but appear in a different order than before.
+    
+    Note:
+        Requires the tensor to be at least 1-D.
+    
+    .. note::
+        `torch.flipud` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flipud`,
+        which returns a view in constant time. Since copying a tensor's data is more work than viewing that data,
+        `torch.flipud` is expected to be slower than `np.flipud`.
+    
+    Args:
+        input (Tensor): Must be at least 1-dimensional.
+    
+    Example::
+    
+        >>> x = torch.arange(4).view(2, 2)
+        >>> x
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.flipud(x)
+        tensor([[2, 3],
+                [0, 1]])
+    """
+    ...
+@overload
+def float_power(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    float_power(input, exponent, *, out=None) -> Tensor
+    
+    Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision.
+    If neither input is complex returns a ``torch.float64`` tensor,
+    and if one or more inputs is complex returns a ``torch.complex128`` tensor.
+    
+    .. note::
+        This function always computes in double precision, unlike :func:`torch.pow`,
+        which implements more typical :ref:`type promotion <type-promotion-doc>`.
+        This is useful when the computation needs to be performed in a wider or more precise dtype,
+        or the results of the computation may contain fractional values not representable in the input dtypes,
+        like when an integer base is raised to a negative integer exponent.
+    
+    Args:
+        input (Tensor or Number): the base value(s)
+        exponent (Tensor or Number): the exponent value(s)
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randint(10, (4,))
+        >>> a
+        tensor([6, 4, 7, 1])
+        >>> torch.float_power(a, 2)
+        tensor([36., 16., 49.,  1.], dtype=torch.float64)
+    
+        >>> a = torch.arange(1, 5)
+        >>> a
+        tensor([ 1,  2,  3,  4])
+        >>> exp = torch.tensor([2, -3, 4, -5])
+        >>> exp
+        tensor([ 2, -3,  4, -5])
+        >>> torch.float_power(a, exp)
+        tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64)
+    """
+    ...
+@overload
+def float_power(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    float_power(input, exponent, *, out=None) -> Tensor
+    
+    Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision.
+    If neither input is complex returns a ``torch.float64`` tensor,
+    and if one or more inputs is complex returns a ``torch.complex128`` tensor.
+    
+    .. note::
+        This function always computes in double precision, unlike :func:`torch.pow`,
+        which implements more typical :ref:`type promotion <type-promotion-doc>`.
+        This is useful when the computation needs to be performed in a wider or more precise dtype,
+        or the results of the computation may contain fractional values not representable in the input dtypes,
+        like when an integer base is raised to a negative integer exponent.
+    
+    Args:
+        input (Tensor or Number): the base value(s)
+        exponent (Tensor or Number): the exponent value(s)
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randint(10, (4,))
+        >>> a
+        tensor([6, 4, 7, 1])
+        >>> torch.float_power(a, 2)
+        tensor([36., 16., 49.,  1.], dtype=torch.float64)
+    
+        >>> a = torch.arange(1, 5)
+        >>> a
+        tensor([ 1,  2,  3,  4])
+        >>> exp = torch.tensor([2, -3, 4, -5])
+        >>> exp
+        tensor([ 2, -3,  4, -5])
+        >>> torch.float_power(a, exp)
+        tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64)
+    """
+    ...
+@overload
+def float_power(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    float_power(input, exponent, *, out=None) -> Tensor
+    
+    Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision.
+    If neither input is complex returns a ``torch.float64`` tensor,
+    and if one or more inputs is complex returns a ``torch.complex128`` tensor.
+    
+    .. note::
+        This function always computes in double precision, unlike :func:`torch.pow`,
+        which implements more typical :ref:`type promotion <type-promotion-doc>`.
+        This is useful when the computation needs to be performed in a wider or more precise dtype,
+        or the results of the computation may contain fractional values not representable in the input dtypes,
+        like when an integer base is raised to a negative integer exponent.
+    
+    Args:
+        input (Tensor or Number): the base value(s)
+        exponent (Tensor or Number): the exponent value(s)
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randint(10, (4,))
+        >>> a
+        tensor([6, 4, 7, 1])
+        >>> torch.float_power(a, 2)
+        tensor([36., 16., 49.,  1.], dtype=torch.float64)
+    
+        >>> a = torch.arange(1, 5)
+        >>> a
+        tensor([ 1,  2,  3,  4])
+        >>> exp = torch.tensor([2, -3, 4, -5])
+        >>> exp
+        tensor([ 2, -3,  4, -5])
+        >>> torch.float_power(a, exp)
+        tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64)
+    """
+    ...
+def floor(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    floor(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the floor of the elements of :attr:`input`,
+    the largest integer less than or equal to each element.
+    
+    For integer inputs, follows the array-api convention of returning a
+    copy of the input tensor.
+    
+    .. math::
+        \text{out}_{i} = \left\lfloor \text{input}_{i} \right\rfloor
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-0.8166,  1.5308, -0.2530, -0.2091])
+        >>> torch.floor(a)
+        tensor([-1.,  1., -1., -1.])
+    """
+    ...
+def floor_(input: Tensor) -> Tensor: ...
+def floor_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    floor_divide(input, other, *, out=None) -> Tensor
+    
+    .. note::
+    
+        Before PyTorch 1.13 :func:`torch.floor_divide` incorrectly performed
+        truncation division. To restore the previous behavior use
+        :func:`torch.div` with ``rounding_mode='trunc'``.
+    
+    Computes :attr:`input` divided by :attr:`other`, elementwise, and floors
+    the result.
+    
+    .. math::
+        \text{out}_i = \text{floor} \left( \frac{\text{input}_i}{\text{other}_i} \right)
+    
+    
+    
+    Supports broadcasting to a common shape, type promotion, and integer and float inputs.
+    
+    Args:
+        input (Tensor or Number): the dividend
+        other (Tensor or Number): the divisor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([4.0, 3.0])
+        >>> b = torch.tensor([2.0, 2.0])
+        >>> torch.floor_divide(a, b)
+        tensor([2.0, 1.0])
+        >>> torch.floor_divide(a, 1.4)
+        tensor([2.0, 2.0])
+    """
+    ...
+def fmax(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    fmax(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise maximum of :attr:`input` and :attr:`other`.
+    
+    This is like :func:`torch.maximum` except it handles NaNs differently:
+    if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the maximum.
+    Only if both elements are NaN is NaN propagated.
+    
+    This function is a wrapper around C++'s ``std::fmax`` and is similar to NumPy's ``fmax`` function.
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and floating-point inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([9.7, float('nan'), 3.1, float('nan')])
+        >>> b = torch.tensor([-2.2, 0.5, float('nan'), float('nan')])
+        >>> torch.fmax(a, b)
+        tensor([9.7000, 0.5000, 3.1000,    nan])
+    """
+    ...
+def fmin(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    fmin(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise minimum of :attr:`input` and :attr:`other`.
+    
+    This is like :func:`torch.minimum` except it handles NaNs differently:
+    if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the minimum.
+    Only if both elements are NaN is NaN propagated.
+    
+    This function is a wrapper around C++'s ``std::fmin`` and is similar to NumPy's ``fmin`` function.
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and floating-point inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([2.2, float('nan'), 2.1, float('nan')])
+        >>> b = torch.tensor([-9.3, 0.1, float('nan'), float('nan')])
+        >>> torch.fmin(a, b)
+        tensor([-9.3000, 0.1000, 2.1000,    nan])
+    """
+    ...
+@overload
+def fmod(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    fmod(input, other, *, out=None) -> Tensor
+    
+    Applies C++'s `std::fmod <https://en.cppreference.com/w/cpp/numeric/math/fmod>`_ entrywise.
+    The result has the same sign as the dividend :attr:`input` and its absolute value
+    is less than that of :attr:`other`.
+    
+    This function may be defined in terms of :func:`torch.div` as
+    
+    .. code:: python
+    
+        torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and float inputs.
+    
+    .. note::
+    
+        When the divisor is zero, returns ``NaN`` for floating point dtypes
+        on both CPU and GPU; raises ``RuntimeError`` for integer division by
+        zero on CPU; Integer division by zero on GPU may return any value.
+    
+    .. note::
+    
+       Complex inputs are not supported. In some cases, it is not mathematically
+       possible to satisfy the definition of a modulo operation with complex numbers.
+    
+    .. seealso::
+    
+        :func:`torch.remainder` which implements Python's modulus operator.
+        This one is defined using division rounding down the result.
+    
+    Args:
+        input (Tensor): the dividend
+        other (Tensor or Scalar): the divisor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2)
+        tensor([-1., -0., -1.,  1.,  0.,  1.])
+        >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5)
+        tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000])
+    """
+    ...
+@overload
+def fmod(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    fmod(input, other, *, out=None) -> Tensor
+    
+    Applies C++'s `std::fmod <https://en.cppreference.com/w/cpp/numeric/math/fmod>`_ entrywise.
+    The result has the same sign as the dividend :attr:`input` and its absolute value
+    is less than that of :attr:`other`.
+    
+    This function may be defined in terms of :func:`torch.div` as
+    
+    .. code:: python
+    
+        torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and float inputs.
+    
+    .. note::
+    
+        When the divisor is zero, returns ``NaN`` for floating point dtypes
+        on both CPU and GPU; raises ``RuntimeError`` for integer division by
+        zero on CPU; Integer division by zero on GPU may return any value.
+    
+    .. note::
+    
+       Complex inputs are not supported. In some cases, it is not mathematically
+       possible to satisfy the definition of a modulo operation with complex numbers.
+    
+    .. seealso::
+    
+        :func:`torch.remainder` which implements Python's modulus operator.
+        This one is defined using division rounding down the result.
+    
+    Args:
+        input (Tensor): the dividend
+        other (Tensor or Scalar): the divisor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2)
+        tensor([-1., -0., -1.,  1.,  0.,  1.])
+        >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5)
+        tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000])
+    """
+    ...
+def frac(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    frac(input, *, out=None) -> Tensor
+    
+    Computes the fractional portion of each element in :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \text{input}_{i} - \left\lfloor |\text{input}_{i}| \right\rfloor * \operatorname{sgn}(\text{input}_{i})
+    
+    Example::
+    
+        >>> torch.frac(torch.tensor([1, 2.5, -3.2]))
+        tensor([ 0.0000,  0.5000, -0.2000])
+    """
+    ...
+def frac_(input: Tensor) -> Tensor: ...
+def frexp(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.frexp: 
+    r"""
+    frexp(input, *, out=None) -> (Tensor mantissa, Tensor exponent)
+    
+    Decomposes :attr:`input` into mantissa and exponent tensors
+    such that :math:`\text{input} = \text{mantissa} \times 2^{\text{exponent}}`.
+    
+    The range of mantissa is the open interval (-1, 1).
+    
+    Supports float inputs.
+    
+    Args:
+        input (Tensor): the input tensor
+    
+    
+    Keyword args:
+        out (tuple, optional): the output tensors
+    
+    Example::
+    
+        >>> x = torch.arange(9.)
+        >>> mantissa, exponent = torch.frexp(x)
+        >>> mantissa
+        tensor([0.0000, 0.5000, 0.5000, 0.7500, 0.5000, 0.6250, 0.7500, 0.8750, 0.5000])
+        >>> exponent
+        tensor([0, 1, 2, 2, 3, 3, 3, 3, 4], dtype=torch.int32)
+        >>> torch.ldexp(mantissa, exponent)
+        tensor([0., 1., 2., 3., 4., 5., 6., 7., 8.])
+    """
+    ...
+def frobenius_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ...
+def from_file(filename: str, shared: Optional[_bool] = None, size: Optional[_int] = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    from_file(filename, shared=None, size=0, *, dtype=None, layout=None, device=None, pin_memory=False) -> Tensor
+    
+    Creates a CPU tensor with a storage backed by a memory-mapped file.
+    
+    If ``shared`` is True, then memory is shared between processes. All changes are written to the file.
+    If ``shared`` is False, then changes to the tensor do not affect the file.
+    
+    ``size`` is the number of elements in the Tensor. If ``shared`` is ``False``, then the file must contain
+    at least ``size * sizeof(dtype)`` bytes. If ``shared`` is ``True`` the file will be created if needed.
+    
+    .. note::
+        Only CPU tensors can be mapped to files.
+    
+    .. note::
+        For now, tensors with storages backed by a memory-mapped file cannot be created in pinned memory.
+    
+    Args:
+        filename (str): file name to map
+        shared (bool): whether to share memory (whether ``MAP_SHARED`` or ``MAP_PRIVATE`` is passed to the
+                        underlying `mmap(2) call <https://man7.org/linux/man-pages/man2/mmap.2.html>`_)
+        size (int): number of elements in the tensor
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> t = torch.randn(2, 5, dtype=torch.float64)
+        >>> t.numpy().tofile('storage.pt')
+        >>> t_mapped = torch.from_file('storage.pt', shared=False, size=10, dtype=torch.float64)
+    """
+    ...
+def from_numpy(ndarray) -> Tensor: 
+    r"""
+    from_numpy(ndarray) -> Tensor
+    
+    Creates a :class:`Tensor` from a :class:`numpy.ndarray`.
+    
+    The returned tensor and :attr:`ndarray` share the same memory. Modifications to
+    the tensor will be reflected in the :attr:`ndarray` and vice versa. The returned
+    tensor is not resizable.
+    
+    It currently accepts :attr:`ndarray` with dtypes of ``numpy.float64``,
+    ``numpy.float32``, ``numpy.float16``, ``numpy.complex64``, ``numpy.complex128``,
+    ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``,
+    and ``bool``.
+    
+    .. warning::
+        Writing to a tensor created from a read-only NumPy array is not supported and will result in undefined behavior.
+    
+    Example::
+    
+        >>> a = numpy.array([1, 2, 3])
+        >>> t = torch.from_numpy(a)
+        >>> t
+        tensor([ 1,  2,  3])
+        >>> t[0] = -1
+        >>> a
+        array([-1,  2,  3])
+    """
+    ...
+def frombuffer(buffer: Any, *, dtype: _dtype, count: int = -1, offset: int = 0, requires_grad: _bool = False) -> Tensor: 
+    r"""
+    frombuffer(buffer, *, dtype, count=-1, offset=0, requires_grad=False) -> Tensor
+    
+    Creates a 1-dimensional :class:`Tensor` from an object that implements
+    the Python buffer protocol.
+    
+    Skips the first :attr:`offset` bytes in the buffer, and interprets the rest of
+    the raw bytes as a 1-dimensional tensor of type :attr:`dtype` with :attr:`count`
+    elements.
+    
+    Note that either of the following must be true:
+    
+    1. :attr:`count` is a positive number, and the total number of bytes
+    in the buffer is at least :attr:`offset` plus :attr:`count` times the size
+    (in bytes) of :attr:`dtype`.
+    
+    2. :attr:`count` is negative, and the length (number of bytes) of the buffer
+    subtracted by the :attr:`offset` is a multiple of the size (in bytes) of
+    :attr:`dtype`.
+    
+    The returned tensor and buffer share the same memory. Modifications to
+    the tensor will be reflected in the buffer and vice versa. The returned
+    tensor is not resizable.
+    
+    .. note::
+        This function increments the reference count for the object that
+        owns the shared memory. Therefore, such memory will not be deallocated
+        before the returned tensor goes out of scope.
+    
+    .. warning::
+        This function's behavior is undefined when passed an object implementing
+        the buffer protocol whose data is not on the CPU. Doing so is likely to
+        cause a segmentation fault.
+    
+    .. warning::
+        This function does not try to infer the :attr:`dtype` (hence, it is not
+        optional). Passing a different :attr:`dtype` than its source may result
+        in unexpected behavior.
+    
+    Args:
+        buffer (object): a Python object that exposes the buffer interface.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`): the desired data type of returned tensor.
+        count (int, optional): the number of desired elements to be read.
+            If negative, all the elements (until the end of the buffer) will be
+            read. Default: -1.
+        offset (int, optional): the number of bytes to skip at the start of
+            the buffer. Default: 0.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> import array
+        >>> a = array.array('i', [1, 2, 3])
+        >>> t = torch.frombuffer(a, dtype=torch.int32)
+        >>> t
+        tensor([ 1,  2,  3])
+        >>> t[0] = -1
+        >>> a
+        array('i', [-1, 2, 3])
+    
+        >>> # Interprets the signed char bytes as 32-bit integers.
+        >>> # Each 4 signed char elements will be interpreted as
+        >>> # 1 signed 32-bit integer.
+        >>> import array
+        >>> a = array.array('b', [-1, 0, 0, 0])
+        >>> torch.frombuffer(a, dtype=torch.int32)
+        tensor([255], dtype=torch.int32)
+    """
+    ...
+@overload
+def full(size: _size, fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The
+    tensor's dtype is inferred from :attr:`fill_value`.
+    
+    Args:
+        size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+            shape of the output tensor.
+        fill_value (Scalar): the value to fill the output tensor with.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.full((2, 3), 3.141592)
+        tensor([[ 3.1416,  3.1416,  3.1416],
+                [ 3.1416,  3.1416,  3.1416]])
+    """
+    ...
+@overload
+def full(size: _size, fill_value: Union[Number, _complex], *, names: List[Union[str, None]], layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The
+    tensor's dtype is inferred from :attr:`fill_value`.
+    
+    Args:
+        size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+            shape of the output tensor.
+        fill_value (Scalar): the value to fill the output tensor with.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.full((2, 3), 3.141592)
+        tensor([[ 3.1416,  3.1416,  3.1416],
+                [ 3.1416,  3.1416,  3.1416]])
+    """
+    ...
+@overload
+def full(size: Sequence[Union[_int, SymInt]], fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The
+    tensor's dtype is inferred from :attr:`fill_value`.
+    
+    Args:
+        size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+            shape of the output tensor.
+        fill_value (Scalar): the value to fill the output tensor with.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.full((2, 3), 3.141592)
+        tensor([[ 3.1416,  3.1416,  3.1416],
+                [ 3.1416,  3.1416,  3.1416]])
+    """
+    ...
+@overload
+def full(size: _size, fill_value: Union[Number, _complex], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The
+    tensor's dtype is inferred from :attr:`fill_value`.
+    
+    Args:
+        size (int...): a list, tuple, or :class:`torch.Size` of integers defining the
+            shape of the output tensor.
+        fill_value (Scalar): the value to fill the output tensor with.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.full((2, 3), 3.141592)
+        tensor([[ 3.1416,  3.1416,  3.1416],
+                [ 3.1416,  3.1416,  3.1416]])
+    """
+    ...
+def full_like(input: Tensor, fill_value: Union[Number, _complex], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    full_like(input, fill_value, *, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`.
+    ``torch.full_like(input, fill_value)`` is equivalent to
+    ``torch.full(input.size(), fill_value, dtype=input.dtype, layout=input.layout, device=input.device)``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+        fill_value: the number to fill the output tensor with.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    """
+    ...
+def fused_moving_avg_obs_fake_quant(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> Tensor: ...
+@overload
+def gather(input: Tensor, dim: _int, index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor
+    
+    Gathers values along an axis specified by `dim`.
+    
+    For a 3-D tensor the output is specified by::
+    
+        out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+        out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+        out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+    
+    :attr:`input` and :attr:`index` must have the same number of dimensions.
+    It is also required that ``index.size(d) <= input.size(d)`` for all
+    dimensions ``d != dim``.  :attr:`out` will have the same shape as :attr:`index`.
+    Note that ``input`` and ``index`` do not broadcast against each other.
+    
+    Args:
+        input (Tensor): the source tensor
+        dim (int): the axis along which to index
+        index (LongTensor): the indices of elements to gather
+    
+    Keyword arguments:
+        sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor.
+        out (Tensor, optional): the destination tensor
+    
+    Example::
+    
+        >>> t = torch.tensor([[1, 2], [3, 4]])
+        >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]]))
+        tensor([[ 1,  1],
+                [ 4,  3]])
+    """
+    ...
+@overload
+def gather(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor
+    
+    Gathers values along an axis specified by `dim`.
+    
+    For a 3-D tensor the output is specified by::
+    
+        out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+        out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+        out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+    
+    :attr:`input` and :attr:`index` must have the same number of dimensions.
+    It is also required that ``index.size(d) <= input.size(d)`` for all
+    dimensions ``d != dim``.  :attr:`out` will have the same shape as :attr:`index`.
+    Note that ``input`` and ``index`` do not broadcast against each other.
+    
+    Args:
+        input (Tensor): the source tensor
+        dim (int): the axis along which to index
+        index (LongTensor): the indices of elements to gather
+    
+    Keyword arguments:
+        sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor.
+        out (Tensor, optional): the destination tensor
+    
+    Example::
+    
+        >>> t = torch.tensor([[1, 2], [3, 4]])
+        >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]]))
+        tensor([[ 1,  1],
+                [ 4,  3]])
+    """
+    ...
+def gcd(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    gcd(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise greatest common divisor (GCD) of :attr:`input` and :attr:`other`.
+    
+    Both :attr:`input` and :attr:`other` must have integer types.
+    
+    .. note::
+        This defines :math:`gcd(0, 0) = 0`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([5, 10, 15])
+        >>> b = torch.tensor([3, 4, 5])
+        >>> torch.gcd(a, b)
+        tensor([1, 2, 5])
+        >>> c = torch.tensor([3])
+        >>> torch.gcd(a, c)
+        tensor([1, 1, 3])
+    """
+    ...
+def gcd_(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def ge(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ge(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} \geq \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[True, True], [False, True]])
+    """
+    ...
+@overload
+def ge(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ge(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} \geq \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[True, True], [False, True]])
+    """
+    ...
+def geqrf(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.geqrf: 
+    r"""
+    geqrf(input, *, out=None) -> (Tensor, Tensor)
+    
+    This is a low-level function for calling LAPACK's geqrf directly. This function
+    returns a namedtuple (a, tau) as defined in `LAPACK documentation for geqrf`_ .
+    
+    Computes a QR decomposition of :attr:`input`.
+    Both `Q` and `R` matrices are stored in the same output tensor `a`.
+    The elements of `R` are stored on and above the diagonal.
+    Elementary reflectors (or Householder vectors) implicitly defining matrix `Q`
+    are stored below the diagonal.
+    The results of this function can be used together with :func:`torch.linalg.householder_product`
+    to obtain the `Q` matrix or
+    with :func:`torch.ormqr`, which uses an implicit representation of the `Q` matrix,
+    for an efficient matrix-matrix multiplication.
+    
+    See `LAPACK documentation for geqrf`_ for further details.
+    
+    .. note::
+        See also :func:`torch.linalg.qr`, which computes Q and R matrices, and :func:`torch.linalg.lstsq`
+        with the ``driver="gels"`` option for a function that can solve matrix equations using a QR decomposition.
+    
+    Args:
+        input (Tensor): the input matrix
+    
+    Keyword args:
+        out (tuple, optional): the output tuple of (Tensor, Tensor). Ignored if `None`. Default: `None`.
+    
+    .. _LAPACK documentation for geqrf:
+        http://www.netlib.org/lapack/explore-html/df/dc5/group__variants_g_ecomputational_ga3766ea903391b5cf9008132f7440ec7b.html
+    """
+    ...
+def ger(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ger(input, vec2, *, out=None) -> Tensor
+    
+    Alias of :func:`torch.outer`.
+    
+    .. warning::
+        This function is deprecated and will be removed in a future PyTorch release.
+        Use :func:`torch.outer` instead.
+    """
+    ...
+def get_default_dtype() -> _dtype: 
+    r"""
+    get_default_dtype() -> torch.dtype
+    
+    Get the current default floating point :class:`torch.dtype`.
+    
+    Example::
+    
+        >>> torch.get_default_dtype()  # initial default for floating point is torch.float32
+        torch.float32
+        >>> torch.set_default_dtype(torch.float64)
+        >>> torch.get_default_dtype()  # default is now changed to torch.float64
+        torch.float64
+    """
+    ...
+def get_num_interop_threads() -> _int: 
+    r"""
+    get_num_interop_threads() -> int
+    
+    Returns the number of threads used for inter-op parallelism on CPU
+    (e.g. in JIT interpreter)
+    """
+    ...
+def get_num_threads() -> _int: 
+    r"""
+    get_num_threads() -> int
+    
+    Returns the number of threads used for parallelizing CPU operations
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, spacing: Optional[Union[Number, _complex]] = None, dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor's theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor's theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor's theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> Tuple of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor’s theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, spacing: Union[Number, _complex], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> Tuple of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor’s theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> Tuple of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor’s theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def gradient(input: Tensor, *, dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: 
+    r"""
+    gradient(input, *, spacing=1, dim=None, edge_order=1) -> Tuple of Tensors
+    
+    Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in
+    one or more dimensions using the `second-order accurate central differences method
+    <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ and
+    either first or second order estimates at the boundaries.
+    
+    The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not
+    specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates
+    to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional
+    :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and
+    :math:`g(1, 2, 3)\ == input[1, 2, 3]`.
+    
+    When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates.
+    This is detailed in the "Keyword Arguments" section below.
+    
+    The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
+    accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
+    improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
+    is estimated using `Taylor’s theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
+    Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
+    it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
+    
+    .. math::
+        \begin{aligned}
+            f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2  \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\
+            f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2  \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x-h_l, x) \\
+        \end{aligned}
+    
+    Using the fact that :math:`f \in C^3` and solving the linear system, we derive:
+    
+    .. math::
+        f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l)
+              + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} }
+    
+    .. note::
+        We estimate the gradient of functions in complex domain
+        :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way.
+    
+    The value of each partial derivative at the boundary points is computed differently. See edge_order below.
+    
+    Args:
+        input (``Tensor``): the tensor that represents the values of the function
+    
+    Keyword args:
+        spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify
+            how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then
+            the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the
+            indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding
+            indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9).
+            Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for
+            the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then
+            the coordinates are (t0[1], t1[2], t2[3])
+    
+        dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default
+            the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of
+            the :attr:`spacing` argument must correspond with the specified dims.
+    
+        edge_order (``int``, optional): 1 or 2, for `first-order
+            <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_ or
+            `second-order <https://www.ams.org/journals/mcom/1988-51-184/S0025-5718-1988-0935077-0/S0025-5718-1988-0935077-0.pdf>`_
+            estimation of the boundary ("edge") values, respectively.
+    
+    Examples::
+    
+        >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 1, 4]
+        >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),)
+        >>> values = torch.tensor([4., 1., 1., 16.], )
+        >>> torch.gradient(values, spacing = coordinates)
+        (tensor([-3., -2., 2., 5.]),)
+    
+        >>> # Estimates the gradient of the R^2 -> R function whose samples are
+        >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost
+        >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates
+        >>> # partial derivative for both dimensions.
+        >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]])
+        >>> torch.gradient(t)
+        (tensor([[ 9., 18., 36., 72.],
+                 [ 9., 18., 36., 72.]]),
+         tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]))
+    
+        >>> # A scalar value for spacing modifies the relationship between tensor indices
+        >>> # and input coordinates by multiplying the indices to find the
+        >>> # coordinates. For example, below the indices of the innermost
+        >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of
+        >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                  [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.5000, 0.7500, 1.5000, 2.0000],
+                  [ 5.0000, 7.5000, 15.0000, 20.0000]]))
+        >>> # doubling the spacing between samples halves the estimated partial gradients.
+    
+        >>>
+        >>> # Estimates only the partial derivative for dimension 1
+        >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.)
+        (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000],
+                 [10.0000, 15.0000, 30.0000, 40.0000]]),)
+    
+        >>> # When spacing is a list of scalars, the relationship between the tensor
+        >>> # indices and input coordinates changes based on dimension.
+        >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate
+        >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension
+        >>> # 0, 1 translate to coordinates of [0, 2].
+        >>> torch.gradient(t, spacing = [2., 3.])
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    
+        >>> # The following example is a replication of the previous one with explicit
+        >>> # coordinates.
+        >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9]))
+        >>> torch.gradient(t, spacing = coords)
+        (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000],
+                 [ 4.5000, 9.0000, 18.0000, 36.0000]]),
+         tensor([[ 0.3333, 0.5000, 1.0000, 1.3333],
+                 [ 3.3333, 5.0000, 10.0000, 13.3333]]))
+    """
+    ...
+@overload
+def greater(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    greater(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.gt`.
+    """
+    ...
+@overload
+def greater(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    greater(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.gt`.
+    """
+    ...
+@overload
+def greater_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    greater_equal(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.ge`.
+    """
+    ...
+@overload
+def greater_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    greater_equal(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.ge`.
+    """
+    ...
+def grid_sampler(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ...
+def grid_sampler_2d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ...
+def grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ...
+def group_norm(input: Tensor, num_groups: _int, weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enabled: _bool = True) -> Tensor: ...
+@overload
+def gru(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ...
+@overload
+def gru(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ...
+def gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def gt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    gt(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} > \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[False, True], [False, False]])
+    """
+    ...
+@overload
+def gt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    gt(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} > \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[False, True], [False, False]])
+    """
+    ...
+@overload
+def hamming_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Hamming window function.
+    
+    .. math::
+        w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right),
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.hamming_window(L, periodic=True)`` equal to
+    ``torch.hamming_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    .. note::
+        This is a generalized version of :meth:`torch.hann_window`.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+        alpha (float, optional): The coefficient :math:`\alpha` in the equation above
+        beta (float, optional): The coefficient :math:`\beta` in the equation above
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window.
+    """
+    ...
+@overload
+def hamming_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Hamming window function.
+    
+    .. math::
+        w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right),
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.hamming_window(L, periodic=True)`` equal to
+    ``torch.hamming_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    .. note::
+        This is a generalized version of :meth:`torch.hann_window`.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+        alpha (float, optional): The coefficient :math:`\alpha` in the equation above
+        beta (float, optional): The coefficient :math:`\beta` in the equation above
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window.
+    """
+    ...
+@overload
+def hamming_window(window_length: _int, periodic: _bool, alpha: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Hamming window function.
+    
+    .. math::
+        w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right),
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.hamming_window(L, periodic=True)`` equal to
+    ``torch.hamming_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    .. note::
+        This is a generalized version of :meth:`torch.hann_window`.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+        alpha (float, optional): The coefficient :math:`\alpha` in the equation above
+        beta (float, optional): The coefficient :math:`\beta` in the equation above
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window.
+    """
+    ...
+@overload
+def hamming_window(window_length: _int, periodic: _bool, alpha: _float, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Hamming window function.
+    
+    .. math::
+        w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right),
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.hamming_window(L, periodic=True)`` equal to
+    ``torch.hamming_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    .. note::
+        This is a generalized version of :meth:`torch.hann_window`.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+        alpha (float, optional): The coefficient :math:`\alpha` in the equation above
+        beta (float, optional): The coefficient :math:`\beta` in the equation above
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window.
+    """
+    ...
+@overload
+def hann_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Hann window function.
+    
+    .. math::
+        w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] =
+                \sin^2 \left( \frac{\pi n}{N - 1} \right),
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.hann_window(L, periodic=True)`` equal to
+    ``torch.hann_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window
+    """
+    ...
+@overload
+def hann_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Hann window function.
+    
+    .. math::
+        w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] =
+                \sin^2 \left( \frac{\pi n}{N - 1} \right),
+    
+    where :math:`N` is the full window size.
+    
+    The input :attr:`window_length` is a positive integer controlling the
+    returned window size. :attr:`periodic` flag determines whether the returned
+    window trims off the last duplicate value from the symmetric window and is
+    ready to be used as a periodic window with functions like
+    :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in
+    above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have
+    ``torch.hann_window(L, periodic=True)`` equal to
+    ``torch.hann_window(L + 1, periodic=False)[:-1]``.
+    
+    .. note::
+        If :attr:`window_length` :math:`=1`, the returned window contains a single value 1.
+    
+    Arguments:
+        window_length (int): the size of returned window
+        periodic (bool, optional): If True, returns a window to be used as periodic
+            function. If False, return a symmetric window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported.
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Returns:
+        Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window
+    """
+    ...
+def hardshrink(input: Tensor, lambd: Union[Number, _complex] = 0.5, *, out: Optional[Tensor] = None) -> Tensor: ...
+def heaviside(input: Tensor, values: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    heaviside(input, values, *, out=None) -> Tensor
+    
+    Computes the Heaviside step function for each element in :attr:`input`.
+    The Heaviside step function is defined as:
+    
+    .. math::
+        \text{heaviside}(input, values) = \begin{cases}
+            0, & \text{if input < 0}\\
+            values, & \text{if input == 0}\\
+            1, & \text{if input > 0}
+        \end{cases}
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        values (Tensor): The values to use where :attr:`input` is zero.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> input = torch.tensor([-1.5, 0, 2.0])
+        >>> values = torch.tensor([0.5])
+        >>> torch.heaviside(input, values)
+        tensor([0.0000, 0.5000, 1.0000])
+        >>> values = torch.tensor([1.2, -2.0, 3.5])
+        >>> torch.heaviside(input, values)
+        tensor([0., -2., 1.])
+    """
+    ...
+def hinge_embedding_loss(input: Tensor, target: Tensor, margin: _float = 1.0, reduction: _int = 1) -> Tensor: ...
+def histc(input: Tensor, bins: _int = 100, min: Union[Number, _complex] = 0, max: Union[Number, _complex] = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    histc(input, bins=100, min=0, max=0, *, out=None) -> Tensor
+    
+    Computes the histogram of a tensor.
+    
+    The elements are sorted into equal width bins between :attr:`min` and
+    :attr:`max`. If :attr:`min` and :attr:`max` are both zero, the minimum and
+    maximum values of the data are used.
+    
+    Elements lower than :attr:`min` or higher than :attr:`max`, as well as ``NaN`` elements, are ignored.
+    
+    Args:
+        input (Tensor): the input tensor.
+        bins (int): number of histogram bins
+        min (Scalar): lower end of the range (inclusive)
+        max (Scalar): upper end of the range (inclusive)
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        Tensor: Histogram represented as a tensor
+    
+    Example::
+    
+        >>> torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3)
+        tensor([ 0.,  2.,  1.,  0.])
+    """
+    ...
+@overload
+def histogram(input: Tensor, bins: Tensor, *, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: 
+    r"""
+    histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor)
+    
+    Computes a histogram of the values in a tensor.
+    
+    :attr:`bins` can be an integer or a 1D tensor.
+    
+    If :attr:`bins` is an int, it specifies the number of equal-width bins.
+    By default, the lower and upper range of the bins is determined by the
+    minimum and maximum elements of the input tensor. The :attr:`range`
+    argument can be provided to specify a range for the bins.
+    
+    If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges
+    including the rightmost edge. It should contain at least 2 elements
+    and its elements should be increasing.
+    
+    Args:
+        input (Tensor): the input tensor.
+        bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor,
+              defines the sequence of bin edges including the rightmost edge.
+    
+    Keyword args:
+        range (tuple of float): Defines the range of the bins.
+        weight (Tensor): If provided, weight should have the same shape as input. Each value in
+                         input contributes its associated weight towards its bin's result.
+        density (bool): If False, the result will contain the count (or total weight) in each bin.
+                        If True, the result is the value of the probability density function over the bins,
+                        normalized such that the integral over the range of the bins is 1.
+        out (tuple, optional): The result tuple of two output tensors (hist, bin_edges).
+    
+    Returns:
+        hist (Tensor): 1D Tensor containing the values of the histogram.
+        bin_edges (Tensor): 1D Tensor containing the edges of the histogram bins.
+    
+    Example::
+    
+        >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]))
+        (tensor([ 0.,  5.,  2.,  0.]), tensor([0., 0.75, 1.5, 2.25, 3.]))
+        >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True)
+        (tensor([ 0.,  0.9524,  0.3810,  0.]), tensor([0., 0.75, 1.5, 2.25, 3.]))
+    """
+    ...
+@overload
+def histogram(input: Tensor, bins: _int = 100, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: 
+    r"""
+    histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor)
+    
+    Computes a histogram of the values in a tensor.
+    
+    :attr:`bins` can be an integer or a 1D tensor.
+    
+    If :attr:`bins` is an int, it specifies the number of equal-width bins.
+    By default, the lower and upper range of the bins is determined by the
+    minimum and maximum elements of the input tensor. The :attr:`range`
+    argument can be provided to specify a range for the bins.
+    
+    If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges
+    including the rightmost edge. It should contain at least 2 elements
+    and its elements should be increasing.
+    
+    Args:
+        input (Tensor): the input tensor.
+        bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor,
+              defines the sequence of bin edges including the rightmost edge.
+    
+    Keyword args:
+        range (tuple of float): Defines the range of the bins.
+        weight (Tensor): If provided, weight should have the same shape as input. Each value in
+                         input contributes its associated weight towards its bin's result.
+        density (bool): If False, the result will contain the count (or total weight) in each bin.
+                        If True, the result is the value of the probability density function over the bins,
+                        normalized such that the integral over the range of the bins is 1.
+        out (tuple, optional): The result tuple of two output tensors (hist, bin_edges).
+    
+    Returns:
+        hist (Tensor): 1D Tensor containing the values of the histogram.
+        bin_edges (Tensor): 1D Tensor containing the edges of the histogram bins.
+    
+    Example::
+    
+        >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]))
+        (tensor([ 0.,  5.,  2.,  0.]), tensor([0., 0.75, 1.5, 2.25, 3.]))
+        >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True)
+        (tensor([ 0.,  0.9524,  0.3810,  0.]), tensor([0., 0.75, 1.5, 2.25, 3.]))
+    """
+    ...
+@overload
+def histogramdd(input: Tensor, bins: _int, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: 
+    r"""
+    histogramdd(input, bins, *, range=None, weight=None, density=False) -> (Tensor, Tensor[])
+    
+    Computes a multi-dimensional histogram of the values in a tensor.
+    
+    Interprets the elements of an input tensor whose innermost dimension has size N
+    as a collection of N-dimensional points. Maps each of the points into a set of
+    N-dimensional bins and returns the number of points (or total weight) in each bin.
+    
+    :attr:`input` must be a tensor with at least 2 dimensions.
+    If input has shape (M, N), each of its M rows defines a point in N-dimensional space.
+    If input has three or more dimensions, all but the last dimension are flattened.
+    
+    Each dimension is independently associated with its own strictly increasing sequence
+    of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D
+    tensors. Alternatively, bin edges may be constructed automatically by passing a
+    sequence of integers specifying the number of equal-width bins in each dimension.
+    
+    For each N-dimensional point in input:
+        - Each of its coordinates is binned independently among the bin edges
+            corresponding to its dimension
+        - Binning results are combined to identify the N-dimensional bin (if any)
+            into which the point falls
+        - If the point falls into a bin, the bin's count (or total weight) is incremented
+        - Points which do not fall into any bin do not contribute to the output
+    
+    :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int.
+    
+    If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences
+    of bin edges. Each 1D tensor should contain a strictly increasing sequence with at
+    least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying
+    the left and right edges of all bins. Every bin is inclusive of its left edge. Only
+    the rightmost bin is inclusive of its right edge.
+    
+    If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins
+    in each dimension. By default, the leftmost and rightmost bin edges in each dimension
+    are determined by the minimum and maximum elements of the input tensor in the
+    corresponding dimension. The :attr:`range` argument can be provided to manually
+    specify the leftmost and rightmost bin edges in each dimension.
+    
+    If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions.
+    
+    .. note::
+        See also :func:`torch.histogram`, which specifically computes 1D histograms.
+        While :func:`torch.histogramdd` infers the dimensionality of its bins and
+        binned values from the shape of :attr:`input`, :func:`torch.histogram`
+        accepts and flattens :attr:`input` of any shape.
+    
+    Args:
+        input (Tensor): the input tensor.
+        bins: Tensor[], int[], or int.
+                If Tensor[], defines the sequences of bin edges.
+                If int[], defines the number of equal-width bins in each dimension.
+                If int, defines the number of equal-width bins for all dimensions.
+    Keyword args:
+        range (sequence of float): Defines the leftmost and rightmost bin edges
+                                    in each dimension.
+        weight (Tensor): By default, each value in the input has weight 1. If a weight
+                            tensor is passed, each N-dimensional coordinate in input
+                            contributes its associated weight towards its bin's result.
+                            The weight tensor should have the same shape as the :attr:`input`
+                            tensor excluding its innermost dimension N.
+        density (bool): If False (default), the result will contain the count (or total weight)
+                        in each bin. If True, each count (weight) is divided by the total count
+                        (total weight), then divided by the volume of its associated bin.
+    Returns:
+        hist (Tensor): N-dimensional Tensor containing the values of the histogram.
+        bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges.
+    
+    Example::
+        >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3],
+        ...                   weight=torch.tensor([1., 2., 4., 8.]))
+            torch.return_types.histogramdd(
+                hist=tensor([[0., 1., 0.],
+                             [2., 0., 0.],
+                             [4., 0., 8.]]),
+                bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]),
+                           tensor([0.0000, 0.6667, 1.3333, 2.0000])))
+    
+        >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2],
+        ...                   range=[0., 1., 0., 1.], density=True)
+            torch.return_types.histogramdd(
+               hist=tensor([[2., 0.],
+                            [0., 2.]]),
+               bin_edges=(tensor([0.0000, 0.5000, 1.0000]),
+                          tensor([0.0000, 0.5000, 1.0000])))
+    """
+    ...
+@overload
+def histogramdd(input: Tensor, bins: _size, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: 
+    r"""
+    histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[])
+    
+    Computes a multi-dimensional histogram of the values in a tensor.
+    
+    Interprets the elements of an input tensor whose innermost dimension has size N
+    as a collection of N-dimensional points. Maps each of the points into a set of
+    N-dimensional bins and returns the number of points (or total weight) in each bin.
+    
+    :attr:`input` must be a tensor with at least 2 dimensions.
+    If input has shape (M, N), each of its M rows defines a point in N-dimensional space.
+    If input has three or more dimensions, all but the last dimension are flattened.
+    
+    Each dimension is independently associated with its own strictly increasing sequence
+    of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D
+    tensors. Alternatively, bin edges may be constructed automatically by passing a
+    sequence of integers specifying the number of equal-width bins in each dimension.
+    
+    For each N-dimensional point in input:
+        - Each of its coordinates is binned independently among the bin edges
+            corresponding to its dimension
+        - Binning results are combined to identify the N-dimensional bin (if any)
+            into which the point falls
+        - If the point falls into a bin, the bin's count (or total weight) is incremented
+        - Points which do not fall into any bin do not contribute to the output
+    
+    :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int.
+    
+    If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences
+    of bin edges. Each 1D tensor should contain a strictly increasing sequence with at
+    least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying
+    the left and right edges of all bins. Every bin is inclusive of its left edge. Only
+    the rightmost bin is inclusive of its right edge.
+    
+    If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins
+    in each dimension. By default, the leftmost and rightmost bin edges in each dimension
+    are determined by the minimum and maximum elements of the input tensor in the
+    corresponding dimension. The :attr:`range` argument can be provided to manually
+    specify the leftmost and rightmost bin edges in each dimension.
+    
+    If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions.
+    
+    .. note::
+        See also :func:`torch.histogram`, which specifically computes 1D histograms.
+        While :func:`torch.histogramdd` infers the dimensionality of its bins and
+        binned values from the shape of :attr:`input`, :func:`torch.histogram`
+        accepts and flattens :attr:`input` of any shape.
+    
+    Args:
+        input (Tensor): the input tensor.
+        bins: Tensor[], int[], or int.
+                If Tensor[], defines the sequences of bin edges.
+                If int[], defines the number of equal-width bins in each dimension.
+                If int, defines the number of equal-width bins for all dimensions.
+    Keyword args:
+        range (sequence of float): Defines the leftmost and rightmost bin edges
+                                    in each dimension.
+        weight (Tensor): By default, each value in the input has weight 1. If a weight
+                            tensor is passed, each N-dimensional coordinate in input
+                            contributes its associated weight towards its bin's result.
+                            The weight tensor should have the same shape as the :attr:`input`
+                            tensor excluding its innermost dimension N.
+        density (bool): If False (default), the result will contain the count (or total weight)
+                        in each bin. If True, each count (weight) is divided by the total count
+                        (total weight), then divided by the volume of its associated bin.
+    Returns:
+        hist (Tensor): N-dimensional Tensor containing the values of the histogram.
+        bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges.
+    
+    Example::
+        >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3],
+        ...                   weight=torch.tensor([1., 2., 4., 8.]))
+            torch.return_types.histogramdd(
+                hist=tensor([[0., 1., 0.],
+                             [2., 0., 0.],
+                             [4., 0., 8.]]),
+                bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]),
+                           tensor([0.0000, 0.6667, 1.3333, 2.0000])))
+    
+        >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2],
+        ...                   range=[0., 1., 0., 1.], density=True)
+            torch.return_types.histogramdd(
+               hist=tensor([[2., 0.],
+                            [0., 2.]]),
+               bin_edges=(tensor([0.0000, 0.5000, 1.0000]),
+                          tensor([0.0000, 0.5000, 1.0000])))
+    """
+    ...
+@overload
+def histogramdd(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: 
+    r"""
+    histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[])
+    
+    Computes a multi-dimensional histogram of the values in a tensor.
+    
+    Interprets the elements of an input tensor whose innermost dimension has size N
+    as a collection of N-dimensional points. Maps each of the points into a set of
+    N-dimensional bins and returns the number of points (or total weight) in each bin.
+    
+    :attr:`input` must be a tensor with at least 2 dimensions.
+    If input has shape (M, N), each of its M rows defines a point in N-dimensional space.
+    If input has three or more dimensions, all but the last dimension are flattened.
+    
+    Each dimension is independently associated with its own strictly increasing sequence
+    of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D
+    tensors. Alternatively, bin edges may be constructed automatically by passing a
+    sequence of integers specifying the number of equal-width bins in each dimension.
+    
+    For each N-dimensional point in input:
+        - Each of its coordinates is binned independently among the bin edges
+            corresponding to its dimension
+        - Binning results are combined to identify the N-dimensional bin (if any)
+            into which the point falls
+        - If the point falls into a bin, the bin's count (or total weight) is incremented
+        - Points which do not fall into any bin do not contribute to the output
+    
+    :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int.
+    
+    If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences
+    of bin edges. Each 1D tensor should contain a strictly increasing sequence with at
+    least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying
+    the left and right edges of all bins. Every bin is inclusive of its left edge. Only
+    the rightmost bin is inclusive of its right edge.
+    
+    If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins
+    in each dimension. By default, the leftmost and rightmost bin edges in each dimension
+    are determined by the minimum and maximum elements of the input tensor in the
+    corresponding dimension. The :attr:`range` argument can be provided to manually
+    specify the leftmost and rightmost bin edges in each dimension.
+    
+    If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions.
+    
+    .. note::
+        See also :func:`torch.histogram`, which specifically computes 1D histograms.
+        While :func:`torch.histogramdd` infers the dimensionality of its bins and
+        binned values from the shape of :attr:`input`, :func:`torch.histogram`
+        accepts and flattens :attr:`input` of any shape.
+    
+    Args:
+        input (Tensor): the input tensor.
+        bins: Tensor[], int[], or int.
+                If Tensor[], defines the sequences of bin edges.
+                If int[], defines the number of equal-width bins in each dimension.
+                If int, defines the number of equal-width bins for all dimensions.
+    Keyword args:
+        range (sequence of float): Defines the leftmost and rightmost bin edges
+                                    in each dimension.
+        weight (Tensor): By default, each value in the input has weight 1. If a weight
+                            tensor is passed, each N-dimensional coordinate in input
+                            contributes its associated weight towards its bin's result.
+                            The weight tensor should have the same shape as the :attr:`input`
+                            tensor excluding its innermost dimension N.
+        density (bool): If False (default), the result will contain the count (or total weight)
+                        in each bin. If True, each count (weight) is divided by the total count
+                        (total weight), then divided by the volume of its associated bin.
+    Returns:
+        hist (Tensor): N-dimensional Tensor containing the values of the histogram.
+        bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges.
+    
+    Example::
+        >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3],
+        ...                   weight=torch.tensor([1., 2., 4., 8.]))
+            torch.return_types.histogramdd(
+                hist=tensor([[0., 1., 0.],
+                             [2., 0., 0.],
+                             [4., 0., 8.]]),
+                bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]),
+                           tensor([0.0000, 0.6667, 1.3333, 2.0000])))
+    
+        >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2],
+        ...                   range=[0., 1., 0., 1.], density=True)
+            torch.return_types.histogramdd(
+               hist=tensor([[2., 0.],
+                            [0., 2.]]),
+               bin_edges=(tensor([0.0000, 0.5000, 1.0000]),
+                          tensor([0.0000, 0.5000, 1.0000])))
+    """
+    ...
+def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ...
+@overload
+def hsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: 
+    r"""
+    hsplit(input, indices_or_sections) -> Tuple of Tensors
+    
+    Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors
+    horizontally according to :attr:`indices_or_sections`. Each split is a view of
+    :attr:`input`.
+    
+    If :attr:`input` is one dimensional this is equivalent to calling
+    torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is
+    zero), and if :attr:`input` has two or more dimensions it's equivalent to calling
+    torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1),
+    except that if :attr:`indices_or_sections` is an integer it must evenly divide
+    the split dimension or a runtime error will be thrown.
+    
+    This function is based on NumPy's :func:`numpy.hsplit`.
+    
+    Args:
+        input (Tensor): tensor to split.
+        indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`.
+    
+    Example::
+        >>> t = torch.arange(16.0).reshape(4,4)
+        >>> t
+        tensor([[ 0.,  1.,  2.,  3.],
+                [ 4.,  5.,  6.,  7.],
+                [ 8.,  9., 10., 11.],
+                [12., 13., 14., 15.]])
+        >>> torch.hsplit(t, 2)
+        (tensor([[ 0.,  1.],
+                 [ 4.,  5.],
+                 [ 8.,  9.],
+                 [12., 13.]]),
+         tensor([[ 2.,  3.],
+                 [ 6.,  7.],
+                 [10., 11.],
+                 [14., 15.]]))
+        >>> torch.hsplit(t, [3, 6])
+        (tensor([[ 0.,  1.,  2.],
+                 [ 4.,  5.,  6.],
+                 [ 8.,  9., 10.],
+                 [12., 13., 14.]]),
+         tensor([[ 3.],
+                 [ 7.],
+                 [11.],
+                 [15.]]),
+         tensor([], size=(4, 0)))
+    """
+    ...
+@overload
+def hsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: 
+    r"""
+    hsplit(input, indices_or_sections) -> Tuple of Tensors
+    
+    Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors
+    horizontally according to :attr:`indices_or_sections`. Each split is a view of
+    :attr:`input`.
+    
+    If :attr:`input` is one dimensional this is equivalent to calling
+    torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is
+    zero), and if :attr:`input` has two or more dimensions it's equivalent to calling
+    torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1),
+    except that if :attr:`indices_or_sections` is an integer it must evenly divide
+    the split dimension or a runtime error will be thrown.
+    
+    This function is based on NumPy's :func:`numpy.hsplit`.
+    
+    Args:
+        input (Tensor): tensor to split.
+        indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`.
+    
+    Example::
+        >>> t = torch.arange(16.0).reshape(4,4)
+        >>> t
+        tensor([[ 0.,  1.,  2.,  3.],
+                [ 4.,  5.,  6.,  7.],
+                [ 8.,  9., 10., 11.],
+                [12., 13., 14., 15.]])
+        >>> torch.hsplit(t, 2)
+        (tensor([[ 0.,  1.],
+                 [ 4.,  5.],
+                 [ 8.,  9.],
+                 [12., 13.]]),
+         tensor([[ 2.,  3.],
+                 [ 6.,  7.],
+                 [10., 11.],
+                 [14., 15.]]))
+        >>> torch.hsplit(t, [3, 6])
+        (tensor([[ 0.,  1.,  2.],
+                 [ 4.,  5.,  6.],
+                 [ 8.,  9., 10.],
+                 [12., 13., 14.]]),
+         tensor([[ 3.],
+                 [ 7.],
+                 [11.],
+                 [15.]]),
+         tensor([], size=(4, 0)))
+    """
+    ...
+def hspmm(mat1: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    hspmm(mat1, mat2, *, out=None) -> Tensor
+    
+    Performs a matrix multiplication of a :ref:`sparse COO matrix
+    <sparse-coo-docs>` :attr:`mat1` and a strided matrix :attr:`mat2`. The
+    result is a (1 + 1)-dimensional :ref:`hybrid COO matrix
+    <sparse-hybrid-coo-docs>`.
+    
+    Args:
+        mat1 (Tensor): the first sparse matrix to be matrix multiplied
+        mat2 (Tensor): the second strided matrix to be matrix multiplied
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    """
+    ...
+def hstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    hstack(tensors, *, out=None) -> Tensor
+    
+    Stack tensors in sequence horizontally (column wise).
+    
+    This is equivalent to concatenation along the first axis for 1-D tensors, and along the second axis for all other tensors.
+    
+    Args:
+        tensors (sequence of Tensors): sequence of tensors to concatenate
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5, 6])
+        >>> torch.hstack((a,b))
+        tensor([1, 2, 3, 4, 5, 6])
+        >>> a = torch.tensor([[1],[2],[3]])
+        >>> b = torch.tensor([[4],[5],[6]])
+        >>> torch.hstack((a,b))
+        tensor([[1, 4],
+                [2, 5],
+                [3, 6]])
+    """
+    ...
+def hypot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    hypot(input, other, *, out=None) -> Tensor
+    
+    Given the legs of a right triangle, return its hypotenuse.
+    
+    .. math::
+        \text{out}_{i} = \sqrt{\text{input}_{i}^{2} + \text{other}_{i}^{2}}
+    
+    The shapes of ``input`` and ``other`` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the first input tensor
+        other (Tensor): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0]))
+        tensor([5.0000, 5.6569, 6.4031])
+    """
+    ...
+def i0(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    i0(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.i0`.
+    """
+    ...
+def i0_(input: Tensor) -> Tensor: ...
+def igamma(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    igamma(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.gammainc`.
+    """
+    ...
+def igammac(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    igammac(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.gammaincc`.
+    """
+    ...
+def imag(input: Tensor) -> Tensor: 
+    r"""
+    imag(input) -> Tensor
+    
+    Returns a new tensor containing imaginary values of the :attr:`input` tensor.
+    The returned tensor and :attr:`input` share the same underlying storage.
+    
+    .. warning::
+        :func:`imag` is only supported for tensors with complex dtypes.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x=torch.randn(4, dtype=torch.cfloat)
+        >>> x
+        tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)])
+        >>> x.imag
+        tensor([ 0.3553, -0.7896, -0.0633, -0.8119])
+    """
+    ...
+@overload
+def index_add(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor
+    
+    See :meth:`~Tensor.index_add_` for function description.
+    """
+    ...
+@overload
+def index_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: 
+    r"""
+    index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor
+    
+    See :meth:`~Tensor.index_add_` for function description.
+    """
+    ...
+@overload
+def index_copy(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    index_copy(input, dim, index, source, *, out=None) -> Tensor
+    
+    See :meth:`~Tensor.index_copy_` for function description.
+    """
+    ...
+@overload
+def index_copy(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor) -> Tensor: 
+    r"""
+    index_copy(input, dim, index, source, *, out=None) -> Tensor
+    
+    See :meth:`~Tensor.index_copy_` for function description.
+    """
+    ...
+@overload
+def index_fill(input: Tensor, dim: _int, index: Tensor, value: Tensor) -> Tensor: ...
+@overload
+def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Tensor) -> Tensor: ...
+@overload
+def index_fill(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: ...
+@overload
+def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: ...
+def index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ...
+def index_put_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ...
+def index_reduce(input: Tensor, dim: _int, index: Tensor, source: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor
+    
+    See :meth:`~Tensor.index_reduce_` for function description.
+    """
+    ...
+@overload
+def index_select(input: Tensor, dim: _int, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    index_select(input, dim, index, *, out=None) -> Tensor
+    
+    Returns a new tensor which indexes the :attr:`input` tensor along dimension
+    :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`.
+    
+    The returned tensor has the same number of dimensions as the original tensor
+    (:attr:`input`).  The :attr:`dim`\ th dimension has the same size as the length
+    of :attr:`index`; other dimensions have the same size as in the original tensor.
+    
+    .. note:: The returned tensor does **not** use the same storage as the original
+              tensor.  If :attr:`out` has a different shape than expected, we
+              silently change it to the correct shape, reallocating the underlying
+              storage if necessary.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension in which we index
+        index (IntTensor or LongTensor): the 1-D tensor containing the indices to index
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> x
+        tensor([[ 0.1427,  0.0231, -0.5414, -1.0009],
+                [-0.4664,  0.2647, -0.1228, -1.1068],
+                [-1.1734, -0.6571,  0.7230, -0.6004]])
+        >>> indices = torch.tensor([0, 2])
+        >>> torch.index_select(x, 0, indices)
+        tensor([[ 0.1427,  0.0231, -0.5414, -1.0009],
+                [-1.1734, -0.6571,  0.7230, -0.6004]])
+        >>> torch.index_select(x, 1, indices)
+        tensor([[ 0.1427, -0.5414],
+                [-0.4664, -0.1228],
+                [-1.1734,  0.7230]])
+    """
+    ...
+@overload
+def index_select(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    index_select(input, dim, index, *, out=None) -> Tensor
+    
+    Returns a new tensor which indexes the :attr:`input` tensor along dimension
+    :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`.
+    
+    The returned tensor has the same number of dimensions as the original tensor
+    (:attr:`input`).  The :attr:`dim`\ th dimension has the same size as the length
+    of :attr:`index`; other dimensions have the same size as in the original tensor.
+    
+    .. note:: The returned tensor does **not** use the same storage as the original
+              tensor.  If :attr:`out` has a different shape than expected, we
+              silently change it to the correct shape, reallocating the underlying
+              storage if necessary.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension in which we index
+        index (IntTensor or LongTensor): the 1-D tensor containing the indices to index
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> x
+        tensor([[ 0.1427,  0.0231, -0.5414, -1.0009],
+                [-0.4664,  0.2647, -0.1228, -1.1068],
+                [-1.1734, -0.6571,  0.7230, -0.6004]])
+        >>> indices = torch.tensor([0, 2])
+        >>> torch.index_select(x, 0, indices)
+        tensor([[ 0.1427,  0.0231, -0.5414, -1.0009],
+                [-1.1734, -0.6571,  0.7230, -0.6004]])
+        >>> torch.index_select(x, 1, indices)
+        tensor([[ 0.1427, -0.5414],
+                [-0.4664, -0.1228],
+                [-1.1734,  0.7230]])
+    """
+    ...
+def indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.indices`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def init_num_threads() -> None: ...
+def inner(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    inner(input, other, *, out=None) -> Tensor
+    
+    Computes the dot product for 1D tensors. For higher dimensions, sums the product
+    of elements from :attr:`input` and :attr:`other` along their last dimension.
+    
+    .. note::
+    
+        If either :attr:`input` or :attr:`other` is a scalar, the result is equivalent
+        to `torch.mul(input, other)`.
+    
+        If both :attr:`input` and :attr:`other` are non-scalars, the size of their last
+        dimension must match and the result is equivalent to `torch.tensordot(input,
+        other, dims=([-1], [-1]))`
+    
+    Args:
+        input (Tensor): First input tensor
+        other (Tensor): Second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): Optional output tensor to write result into. The output
+                                shape is `input.shape[:-1] + other.shape[:-1]`.
+    
+    Example::
+    
+        # Dot product
+        >>> torch.inner(torch.tensor([1, 2, 3]), torch.tensor([0, 2, 1]))
+        tensor(7)
+    
+        # Multidimensional input tensors
+        >>> a = torch.randn(2, 3)
+        >>> a
+        tensor([[0.8173, 1.0874, 1.1784],
+                [0.3279, 0.1234, 2.7894]])
+        >>> b = torch.randn(2, 4, 3)
+        >>> b
+        tensor([[[-0.4682, -0.7159,  0.1506],
+                [ 0.4034, -0.3657,  1.0387],
+                [ 0.9892, -0.6684,  0.1774],
+                [ 0.9482,  1.3261,  0.3917]],
+    
+                [[ 0.4537,  0.7493,  1.1724],
+                [ 0.2291,  0.5749, -0.2267],
+                [-0.7920,  0.3607, -0.3701],
+                [ 1.3666, -0.5850, -1.7242]]])
+        >>> torch.inner(a, b)
+        tensor([[[-0.9837,  1.1560,  0.2907,  2.6785],
+                [ 2.5671,  0.5452, -0.6912, -1.5509]],
+    
+                [[ 0.1782,  2.9843,  0.7366,  1.5672],
+                [ 3.5115, -0.4864, -1.2476, -4.4337]]])
+    
+        # Scalar input
+        >>> torch.inner(a, torch.tensor(2))
+        tensor([[1.6347, 2.1748, 2.3567],
+                [0.6558, 0.2469, 5.5787]])
+    """
+    ...
+def instance_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], use_input_stats: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ...
+def int_repr(input: Tensor) -> Tensor: ...
+def inverse(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    inverse(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.linalg.inv`
+    """
+    ...
+def is_complex(input: Tensor) -> _bool: 
+    r"""
+    is_complex(input) -> (bool)
+    
+    Returns True if the data type of :attr:`input` is a complex data type, i.e.,
+    one of ``torch.complex64`` and ``torch.complex128``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    """
+    ...
+def is_conj(input: Tensor) -> _bool: 
+    r"""
+    is_conj(input) -> (bool)
+    
+    Returns True if the :attr:`input` is a conjugated tensor, i.e. its conjugate bit is set to `True`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    """
+    ...
+def is_distributed(input: Tensor) -> _bool: ...
+def is_floating_point(input: Tensor) -> _bool: 
+    r"""
+    is_floating_point(input) -> (bool)
+    
+    Returns True if the data type of :attr:`input` is a floating point data type i.e.,
+    one of ``torch.float64``, ``torch.float32``, ``torch.float16``, and ``torch.bfloat16``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    """
+    ...
+def is_grad_enabled() -> _bool: 
+    r"""
+    is_grad_enabled() -> (bool)
+    
+    Returns True if grad mode is currently enabled.
+    """
+    ...
+def is_inference(input: Tensor) -> _bool: 
+    r"""
+    is_inference(input) -> (bool)
+    
+    Returns True if :attr:`input` is an inference tensor.
+    
+    A non-view tensor is an inference tensor if and only if it was
+    allocated during inference mode. A view tensor is an inference
+    tensor if and only if the tensor it is a view of is an inference tensor.
+    
+    For details on inference mode please see
+    `Inference Mode <https://pytorch.org/cppdocs/notes/inference_mode.html>`_.
+    
+    Args:
+        input (Tensor): the input tensor.
+    """
+    ...
+def is_inference_mode_enabled() -> _bool: 
+    r"""
+    is_inference_mode_enabled() -> (bool)
+    
+    Returns True if inference mode is currently enabled.
+    """
+    ...
+def is_neg(input: Tensor) -> _bool: ...
+def is_nonzero(input: Tensor) -> _bool: 
+    r"""
+    is_nonzero(input) -> (bool)
+    
+    Returns True if the :attr:`input` is a single element tensor which is not equal to zero
+    after type conversions.
+    i.e. not equal to ``torch.tensor([0.])`` or ``torch.tensor([0])`` or
+    ``torch.tensor([False])``.
+    Throws a ``RuntimeError`` if ``input.numel() != 1`` (even in case
+    of sparse tensors).
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Examples::
+    
+        >>> torch.is_nonzero(torch.tensor([0.]))
+        False
+        >>> torch.is_nonzero(torch.tensor([1.5]))
+        True
+        >>> torch.is_nonzero(torch.tensor([False]))
+        False
+        >>> torch.is_nonzero(torch.tensor([3]))
+        True
+        >>> torch.is_nonzero(torch.tensor([1, 3, 5]))
+        Traceback (most recent call last):
+        ...
+        RuntimeError: bool value of Tensor with more than one value is ambiguous
+        >>> torch.is_nonzero(torch.tensor([]))
+        Traceback (most recent call last):
+        ...
+        RuntimeError: bool value of Tensor with no values is ambiguous
+    """
+    ...
+def is_same_size(input: Tensor, other: Tensor) -> _bool: ...
+def is_signed(input: Tensor) -> _bool: ...
+def is_vulkan_available() -> _bool: ...
+def isclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> Tensor: 
+    r"""
+    isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor
+    
+    Returns a new tensor with boolean elements representing if each element of
+    :attr:`input` is "close" to the corresponding element of :attr:`other`.
+    Closeness is defined as:
+    
+    .. math::
+        \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert
+    
+    
+    where :attr:`input` and :attr:`other` are finite. Where :attr:`input`
+    and/or :attr:`other` are nonfinite they are close if and only if
+    they are equal, with NaNs being considered equal to each other when
+    :attr:`equal_nan` is True.
+    
+    Args:
+        input (Tensor): first tensor to compare
+        other (Tensor): second tensor to compare
+        atol (float, optional): absolute tolerance. Default: 1e-08
+        rtol (float, optional): relative tolerance. Default: 1e-05
+        equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. Default: ``False``
+    
+    Examples::
+    
+        >>> torch.isclose(torch.tensor((1., 2, 3)), torch.tensor((1 + 1e-10, 3, 4)))
+        tensor([ True, False, False])
+        >>> torch.isclose(torch.tensor((float('inf'), 4)), torch.tensor((float('inf'), 6)), rtol=.5)
+        tensor([True, True])
+    """
+    ...
+def isfinite(input: Tensor) -> Tensor: 
+    r"""
+    isfinite(input) -> Tensor
+    
+    Returns a new tensor with boolean elements representing if each element is `finite` or not.
+    
+    Real values are finite when they are not NaN, negative infinity, or infinity.
+    Complex values are finite when both their real and imaginary parts are finite.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is finite and False elsewhere
+    
+    Example::
+    
+        >>> torch.isfinite(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')]))
+        tensor([True,  False,  True,  False,  False])
+    """
+    ...
+@overload
+def isin(elements: Tensor, test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor
+    
+    Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns
+    a boolean tensor of the same shape as :attr:`elements` that is True for elements
+    in :attr:`test_elements` and False otherwise.
+    
+    .. note::
+        One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both.
+    
+    Args:
+        elements (Tensor or Scalar): Input elements
+        test_elements (Tensor or Scalar): Values against which to test for each input element
+        assume_unique (bool, optional): If True, assumes both :attr:`elements` and
+            :attr:`test_elements` contain unique elements, which can speed up the
+            calculation. Default: False
+        invert (bool, optional): If True, inverts the boolean return tensor, resulting in True
+            values for elements *not* in :attr:`test_elements`. Default: False
+    
+    Returns:
+        A boolean tensor of the same shape as :attr:`elements` that is True for elements in
+        :attr:`test_elements` and False otherwise
+    
+    Example:
+        >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3]))
+        tensor([[False,  True],
+                [ True, False]])
+    """
+    ...
+@overload
+def isin(element: Union[Number, _complex], test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor
+    
+    Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns
+    a boolean tensor of the same shape as :attr:`elements` that is True for elements
+    in :attr:`test_elements` and False otherwise.
+    
+    .. note::
+        One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both.
+    
+    Args:
+        elements (Tensor or Scalar): Input elements
+        test_elements (Tensor or Scalar): Values against which to test for each input element
+        assume_unique (bool, optional): If True, assumes both :attr:`elements` and
+            :attr:`test_elements` contain unique elements, which can speed up the
+            calculation. Default: False
+        invert (bool, optional): If True, inverts the boolean return tensor, resulting in True
+            values for elements *not* in :attr:`test_elements`. Default: False
+    
+    Returns:
+        A boolean tensor of the same shape as :attr:`elements` that is True for elements in
+        :attr:`test_elements` and False otherwise
+    
+    Example:
+        >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3]))
+        tensor([[False,  True],
+                [ True, False]])
+    """
+    ...
+@overload
+def isin(elements: Tensor, test_element: Union[Number, _complex], *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor
+    
+    Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns
+    a boolean tensor of the same shape as :attr:`elements` that is True for elements
+    in :attr:`test_elements` and False otherwise.
+    
+    .. note::
+        One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both.
+    
+    Args:
+        elements (Tensor or Scalar): Input elements
+        test_elements (Tensor or Scalar): Values against which to test for each input element
+        assume_unique (bool, optional): If True, assumes both :attr:`elements` and
+            :attr:`test_elements` contain unique elements, which can speed up the
+            calculation. Default: False
+        invert (bool, optional): If True, inverts the boolean return tensor, resulting in True
+            values for elements *not* in :attr:`test_elements`. Default: False
+    
+    Returns:
+        A boolean tensor of the same shape as :attr:`elements` that is True for elements in
+        :attr:`test_elements` and False otherwise
+    
+    Example:
+        >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3]))
+        tensor([[False,  True],
+                [ True, False]])
+    """
+    ...
+def isinf(input: Tensor) -> Tensor: 
+    r"""
+    isinf(input) -> Tensor
+    
+    Tests if each element of :attr:`input` is infinite
+    (positive or negative infinity) or not.
+    
+    .. note::
+        Complex values are infinite when their real or imaginary part is
+        infinite.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is infinite and False elsewhere
+    
+    Example::
+    
+        >>> torch.isinf(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')]))
+        tensor([False,  True,  False,  True,  False])
+    """
+    ...
+def isnan(input: Tensor) -> Tensor: 
+    r"""
+    isnan(input) -> Tensor
+    
+    Returns a new tensor with boolean elements representing if each element of :attr:`input`
+    is NaN or not. Complex values are considered NaN when either their real
+    and/or imaginary part is NaN.
+    
+    Arguments:
+        input (Tensor): the input tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is NaN and False elsewhere
+    
+    Example::
+    
+        >>> torch.isnan(torch.tensor([1, float('nan'), 2]))
+        tensor([False, True, False])
+    """
+    ...
+def isneginf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    isneginf(input, *, out=None) -> Tensor
+    Tests if each element of :attr:`input` is negative infinity or not.
+    
+    Args:
+      input (Tensor): the input tensor.
+    
+    Keyword args:
+      out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([-float('inf'), float('inf'), 1.2])
+        >>> torch.isneginf(a)
+        tensor([ True, False, False])
+    """
+    ...
+def isposinf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    isposinf(input, *, out=None) -> Tensor
+    Tests if each element of :attr:`input` is positive infinity or not.
+    
+    Args:
+      input (Tensor): the input tensor.
+    
+    Keyword args:
+      out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([-float('inf'), float('inf'), 1.2])
+        >>> torch.isposinf(a)
+        tensor([False,  True, False])
+    """
+    ...
+def isreal(input: Tensor) -> Tensor: 
+    r"""
+    isreal(input) -> Tensor
+    
+    Returns a new tensor with boolean elements representing if each element of :attr:`input` is real-valued or not.
+    All real-valued types are considered real. Complex values are considered real when their imaginary part is 0.
+    
+    Arguments:
+        input (Tensor): the input tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is real and False elsewhere
+    
+    Example::
+    
+        >>> torch.isreal(torch.tensor([1, 1+1j, 2+0j]))
+        tensor([True, False, True])
+    """
+    ...
+def istft(input: Tensor, n_fft: _int, hop_length: Optional[_int] = None, win_length: Optional[_int] = None, window: Optional[Tensor] = None, center: _bool = True, normalized: _bool = False, onesided: Optional[_bool] = None, length: Optional[_int] = None, return_complex: _bool = False) -> Tensor: ...
+@overload
+def kaiser_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`.
+    
+    Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and
+    ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True,
+    where ``L`` is the :attr:`window_length`. This function computes:
+    
+    .. math::
+        out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta )
+    
+    Calling ``torch.kaiser_window(L, periodic=True, beta=B)`` is equivalent to calling
+    ``torch.kaiser_window(L + 1, periodic=False, beta=B)[:-1]``.
+    The :attr:`periodic` argument is intended as a helpful shorthand
+    to produce a periodic window as input to functions like :func:`torch.stft`.
+    
+    .. note::
+        If :attr:`window_length` is one, then the returned window is a single element tensor containing a one.
+    
+    
+    Args:
+        window_length (int): length of the window.
+        periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis.
+            If False, returns a symmetric window suitable for use in filter design.
+        beta (float, optional): shape parameter for the window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    """
+    ...
+@overload
+def kaiser_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`.
+    
+    Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and
+    ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True,
+    where ``L`` is the :attr:`window_length`. This function computes:
+    
+    .. math::
+        out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta )
+    
+    Calling ``torch.kaiser_window(L, periodic=True, beta=B)`` is equivalent to calling
+    ``torch.kaiser_window(L + 1, periodic=False, beta=B)[:-1]``.
+    The :attr:`periodic` argument is intended as a helpful shorthand
+    to produce a periodic window as input to functions like :func:`torch.stft`.
+    
+    .. note::
+        If :attr:`window_length` is one, then the returned window is a single element tensor containing a one.
+    
+    
+    Args:
+        window_length (int): length of the window.
+        periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis.
+            If False, returns a symmetric window suitable for use in filter design.
+        beta (float, optional): shape parameter for the window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    """
+    ...
+@overload
+def kaiser_window(window_length: _int, periodic: _bool, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`.
+    
+    Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and
+    ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True,
+    where ``L`` is the :attr:`window_length`. This function computes:
+    
+    .. math::
+        out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta )
+    
+    Calling ``torch.kaiser_window(L, periodic=True, beta=B)`` is equivalent to calling
+    ``torch.kaiser_window(L + 1, periodic=False, beta=B)[:-1]``.
+    The :attr:`periodic` argument is intended as a helpful shorthand
+    to produce a periodic window as input to functions like :func:`torch.stft`.
+    
+    .. note::
+        If :attr:`window_length` is one, then the returned window is a single element tensor containing a one.
+    
+    
+    Args:
+        window_length (int): length of the window.
+        periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis.
+            If False, returns a symmetric window suitable for use in filter design.
+        beta (float, optional): shape parameter for the window.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only
+              ``torch.strided`` (dense layout) is supported.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    """
+    ...
+def kl_div(input: Tensor, target: Tensor, reduction: _int = 1, *, log_target: _bool = False) -> Tensor: ...
+def kron(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    kron(input, other, *, out=None) -> Tensor
+    
+    Computes the Kronecker product, denoted by :math:`\otimes`, of :attr:`input` and :attr:`other`.
+    
+    If :attr:`input` is a :math:`(a_0 \times a_1 \times \dots \times a_n)` tensor and :attr:`other` is a
+    :math:`(b_0 \times b_1 \times \dots \times b_n)` tensor, the result will be a
+    :math:`(a_0*b_0 \times a_1*b_1 \times \dots \times a_n*b_n)` tensor with the following entries:
+    
+    .. math::
+        (\text{input} \otimes \text{other})_{k_0, k_1, \dots, k_n} =
+            \text{input}_{i_0, i_1, \dots, i_n} * \text{other}_{j_0, j_1, \dots, j_n},
+    
+    where :math:`k_t = i_t * b_t + j_t` for :math:`0 \leq t \leq n`.
+    If one tensor has fewer dimensions than the other it is unsqueezed until it has the same number of dimensions.
+    
+    Supports real-valued and complex-valued inputs.
+    
+    .. note::
+        This function generalizes the typical definition of the Kronecker product for two matrices to two tensors,
+        as described above. When :attr:`input` is a :math:`(m \times n)` matrix and :attr:`other` is a
+        :math:`(p \times q)` matrix, the result will be a :math:`(p*m \times q*n)` block matrix:
+    
+        .. math::
+            \mathbf{A} \otimes \mathbf{B}=\begin{bmatrix}
+            a_{11} \mathbf{B} & \cdots & a_{1 n} \mathbf{B} \\
+            \vdots & \ddots & \vdots \\
+            a_{m 1} \mathbf{B} & \cdots & a_{m n} \mathbf{B} \end{bmatrix}
+    
+        where :attr:`input` is :math:`\mathbf{A}` and :attr:`other` is :math:`\mathbf{B}`.
+    
+    Arguments:
+        input (Tensor)
+        other (Tensor)
+    
+    Keyword args:
+        out (Tensor, optional): The output tensor. Ignored if ``None``. Default: ``None``
+    
+    Examples::
+    
+        >>> mat1 = torch.eye(2)
+        >>> mat2 = torch.ones(2, 2)
+        >>> torch.kron(mat1, mat2)
+        tensor([[1., 1., 0., 0.],
+                [1., 1., 0., 0.],
+                [0., 0., 1., 1.],
+                [0., 0., 1., 1.]])
+    
+        >>> mat1 = torch.eye(2)
+        >>> mat2 = torch.arange(1, 5).reshape(2, 2)
+        >>> torch.kron(mat1, mat2)
+        tensor([[1., 2., 0., 0.],
+                [3., 4., 0., 0.],
+                [0., 0., 1., 2.],
+                [0., 0., 3., 4.]])
+    """
+    ...
+@overload
+def kthvalue(input: Tensor, k: _int, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: 
+    r"""
+    kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th
+    smallest element of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`. And ``indices`` is the index location of each element found.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors
+    are the same size as :attr:`input`, except in the dimension :attr:`dim` where
+    they are of size 1. Otherwise, :attr:`dim` is squeezed
+    (see :func:`torch.squeeze`), resulting in both the :attr:`values` and
+    :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor.
+    
+    .. note::
+        When :attr:`input` is a CUDA tensor and there are multiple valid
+        :attr:`k` th values, this function may nondeterministically return
+        :attr:`indices` for any of them.
+    
+    Args:
+        input (Tensor): the input tensor.
+        k (int): k for the k-th smallest element
+        dim (int, optional): the dimension to find the kth value along
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the output tuple of (Tensor, LongTensor)
+                               can be optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.arange(1., 6.)
+        >>> x
+        tensor([ 1.,  2.,  3.,  4.,  5.])
+        >>> torch.kthvalue(x, 4)
+        torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3))
+    
+        >>> x=torch.arange(1.,7.).resize_(2,3)
+        >>> x
+        tensor([[ 1.,  2.,  3.],
+                [ 4.,  5.,  6.]])
+        >>> torch.kthvalue(x, 2, 0, True)
+        torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]]))
+    """
+    ...
+@overload
+def kthvalue(input: Tensor, k: _int, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: 
+    r"""
+    kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th
+    smallest element of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`. And ``indices`` is the index location of each element found.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors
+    are the same size as :attr:`input`, except in the dimension :attr:`dim` where
+    they are of size 1. Otherwise, :attr:`dim` is squeezed
+    (see :func:`torch.squeeze`), resulting in both the :attr:`values` and
+    :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor.
+    
+    .. note::
+        When :attr:`input` is a CUDA tensor and there are multiple valid
+        :attr:`k` th values, this function may nondeterministically return
+        :attr:`indices` for any of them.
+    
+    Args:
+        input (Tensor): the input tensor.
+        k (int): k for the k-th smallest element
+        dim (int, optional): the dimension to find the kth value along
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the output tuple of (Tensor, LongTensor)
+                               can be optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.arange(1., 6.)
+        >>> x
+        tensor([ 1.,  2.,  3.,  4.,  5.])
+        >>> torch.kthvalue(x, 4)
+        torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3))
+    
+        >>> x=torch.arange(1.,7.).resize_(2,3)
+        >>> x
+        tensor([[ 1.,  2.,  3.],
+                [ 4.,  5.,  6.]])
+        >>> torch.kthvalue(x, 2, 0, True)
+        torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]]))
+    """
+    ...
+def layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enable: _bool = True) -> Tensor: ...
+def lcm(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lcm(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise least common multiple (LCM) of :attr:`input` and :attr:`other`.
+    
+    Both :attr:`input` and :attr:`other` must have integer types.
+    
+    .. note::
+        This defines :math:`lcm(0, 0) = 0` and :math:`lcm(0, a) = 0`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([5, 10, 15])
+        >>> b = torch.tensor([3, 4, 5])
+        >>> torch.lcm(a, b)
+        tensor([15, 20, 15])
+        >>> c = torch.tensor([3])
+        >>> torch.lcm(a, c)
+        tensor([15, 30, 15])
+    """
+    ...
+def lcm_(input: Tensor, other: Tensor) -> Tensor: ...
+def ldexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ldexp(input, other, *, out=None) -> Tensor
+    
+    Multiplies :attr:`input` by 2 ** :attr:`other`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i * 2^{\text{other}_i}
+    
+    
+    Typically this function is used to construct floating point numbers by multiplying
+    mantissas in :attr:`input` with integral powers of two created from the exponents
+    in :attr:`other`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): a tensor of exponents, typically integers.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.ldexp(torch.tensor([1.]), torch.tensor([1]))
+        tensor([2.])
+        >>> torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4]))
+        tensor([ 2.,  4.,  8., 16.])
+    """
+    ...
+def ldexp_(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def le(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    le(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} \leq \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or Scalar): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is less than or equal to
+        :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[True, False], [True, True]])
+    """
+    ...
+@overload
+def le(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    le(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} \leq \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or Scalar): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is less than or equal to
+        :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[True, False], [True, True]])
+    """
+    ...
+@overload
+def lerp(input: Tensor, end: Tensor, weight: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lerp(input, end, weight, *, out=None) -> Tensor
+    
+    Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based
+    on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor.
+    
+    .. math::
+        \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i)
+    
+    The shapes of :attr:`start` and :attr:`end` must be
+    :ref:`broadcastable <broadcasting-semantics>`. If :attr:`weight` is a tensor, then
+    the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the tensor with the starting points
+        end (Tensor): the tensor with the ending points
+        weight (float or tensor): the weight for the interpolation formula
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> start = torch.arange(1., 5.)
+        >>> end = torch.empty(4).fill_(10)
+        >>> start
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> end
+        tensor([ 10.,  10.,  10.,  10.])
+        >>> torch.lerp(start, end, 0.5)
+        tensor([ 5.5000,  6.0000,  6.5000,  7.0000])
+        >>> torch.lerp(start, end, torch.full_like(start, 0.5))
+        tensor([ 5.5000,  6.0000,  6.5000,  7.0000])
+    """
+    ...
+@overload
+def lerp(input: Tensor, end: Tensor, weight: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lerp(input, end, weight, *, out=None) -> Tensor
+    
+    Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based
+    on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor.
+    
+    .. math::
+        \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i)
+    
+    The shapes of :attr:`start` and :attr:`end` must be
+    :ref:`broadcastable <broadcasting-semantics>`. If :attr:`weight` is a tensor, then
+    the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the tensor with the starting points
+        end (Tensor): the tensor with the ending points
+        weight (float or tensor): the weight for the interpolation formula
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> start = torch.arange(1., 5.)
+        >>> end = torch.empty(4).fill_(10)
+        >>> start
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> end
+        tensor([ 10.,  10.,  10.,  10.])
+        >>> torch.lerp(start, end, 0.5)
+        tensor([ 5.5000,  6.0000,  6.5000,  7.0000])
+        >>> torch.lerp(start, end, torch.full_like(start, 0.5))
+        tensor([ 5.5000,  6.0000,  6.5000,  7.0000])
+    """
+    ...
+@overload
+def less(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    less(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.lt`.
+    """
+    ...
+@overload
+def less(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    less(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.lt`.
+    """
+    ...
+@overload
+def less_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    less_equal(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.le`.
+    """
+    ...
+@overload
+def less_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    less_equal(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.le`.
+    """
+    ...
+def lgamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lgamma(input, *, out=None) -> Tensor
+    
+    Computes the natural logarithm of the absolute value of the gamma function on :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \ln |\Gamma(\text{input}_{i})|
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.arange(0.5, 2, 0.5)
+        >>> torch.lgamma(a)
+        tensor([ 0.5724,  0.0000, -0.1208])
+    """
+    ...
+@overload
+def linspace(start: Number, end: Number, steps: Optional[_int] = None, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :attr:`start` to :attr:`end`, inclusive. That is, the values are:
+    
+    .. math::
+        (\text{start},
+        \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \ldots,
+        \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \text{end})
+    
+    
+    From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    
+    Example::
+    
+        >>> torch.linspace(3, 10, steps=5)
+        tensor([  3.0000,   4.7500,   6.5000,   8.2500,  10.0000])
+        >>> torch.linspace(-10, 10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=1)
+        tensor([-10.])
+    """
+    ...
+@overload
+def linspace(start: Tensor, end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :attr:`start` to :attr:`end`, inclusive. That is, the values are:
+    
+    .. math::
+        (\text{start},
+        \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \ldots,
+        \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \text{end})
+    
+    
+    From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    
+    Example::
+    
+        >>> torch.linspace(3, 10, steps=5)
+        tensor([  3.0000,   4.7500,   6.5000,   8.2500,  10.0000])
+        >>> torch.linspace(-10, 10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=1)
+        tensor([-10.])
+    """
+    ...
+@overload
+def linspace(start: Union[Number, _complex], end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :attr:`start` to :attr:`end`, inclusive. That is, the values are:
+    
+    .. math::
+        (\text{start},
+        \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \ldots,
+        \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \text{end})
+    
+    
+    From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    
+    Example::
+    
+        >>> torch.linspace(3, 10, steps=5)
+        tensor([  3.0000,   4.7500,   6.5000,   8.2500,  10.0000])
+        >>> torch.linspace(-10, 10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=1)
+        tensor([-10.])
+    """
+    ...
+@overload
+def linspace(start: Tensor, end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :attr:`start` to :attr:`end`, inclusive. That is, the values are:
+    
+    .. math::
+        (\text{start},
+        \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \ldots,
+        \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \text{end})
+    
+    
+    From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    
+    Example::
+    
+        >>> torch.linspace(3, 10, steps=5)
+        tensor([  3.0000,   4.7500,   6.5000,   8.2500,  10.0000])
+        >>> torch.linspace(-10, 10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=1)
+        tensor([-10.])
+    """
+    ...
+@overload
+def linspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :attr:`start` to :attr:`end`, inclusive. That is, the values are:
+    
+    .. math::
+        (\text{start},
+        \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \ldots,
+        \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1},
+        \text{end})
+    
+    
+    From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    
+    Example::
+    
+        >>> torch.linspace(3, 10, steps=5)
+        tensor([  3.0000,   4.7500,   6.5000,   8.2500,  10.0000])
+        >>> torch.linspace(-10, 10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=5)
+        tensor([-10.,  -5.,   0.,   5.,  10.])
+        >>> torch.linspace(start=-10, end=10, steps=1)
+        tensor([-10.])
+    """
+    ...
+def log(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    log(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the natural logarithm of the elements
+    of :attr:`input`.
+    
+    .. math::
+        y_{i} = \log_{e} (x_{i})
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(5) * 5
+        >>> a
+        tensor([4.7767, 4.3234, 1.2156, 0.2411, 4.5739])
+        >>> torch.log(a)
+        tensor([ 1.5637,  1.4640,  0.1952, -1.4226,  1.5204])
+    """
+    ...
+def log10(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    log10(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the logarithm to the base 10 of the elements
+    of :attr:`input`.
+    
+    .. math::
+        y_{i} = \log_{10} (x_{i})
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(5)
+        >>> a
+        tensor([ 0.5224,  0.9354,  0.7257,  0.1301,  0.2251])
+    
+    
+        >>> torch.log10(a)
+        tensor([-0.2820, -0.0290, -0.1392, -0.8857, -0.6476])
+    """
+    ...
+def log10_(input: Tensor) -> Tensor: ...
+def log1p(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    log1p(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the natural logarithm of (1 + :attr:`input`).
+    
+    .. math::
+        y_i = \log_{e} (x_i + 1)
+    
+    .. note:: This function is more accurate than :func:`torch.log` for small
+              values of :attr:`input`
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(5)
+        >>> a
+        tensor([-1.0090, -0.9923,  1.0249, -0.5372,  0.2492])
+        >>> torch.log1p(a)
+        tensor([    nan, -4.8653,  0.7055, -0.7705,  0.2225])
+    """
+    ...
+def log1p_(input: Tensor) -> Tensor: ...
+def log2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    log2(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the logarithm to the base 2 of the elements
+    of :attr:`input`.
+    
+    .. math::
+        y_{i} = \log_{2} (x_{i})
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.rand(5)
+        >>> a
+        tensor([ 0.8419,  0.8003,  0.9971,  0.5287,  0.0490])
+    
+    
+        >>> torch.log2(a)
+        tensor([-0.2483, -0.3213, -0.0042, -0.9196, -4.3504])
+    """
+    ...
+def log2_(input: Tensor) -> Tensor: ...
+def log_(input: Tensor) -> Tensor: ...
+@overload
+def log_softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def log_softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: ...
+def logaddexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logaddexp(input, other, *, out=None) -> Tensor
+    
+    Logarithm of the sum of exponentiations of the inputs.
+    
+    Calculates pointwise :math:`\log\left(e^x + e^y\right)`. This function is useful
+    in statistics where the calculated probabilities of events may be so small as to
+    exceed the range of normal floating point numbers. In such cases the logarithm
+    of the calculated probability is stored. This function allows adding
+    probabilities stored in such a fashion.
+    
+    This op should be disambiguated with :func:`torch.logsumexp` which performs a
+    reduction on a single tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1.0, -2, -3]))
+        tensor([-0.3069, -0.6867, -0.8731])
+        >>> torch.logaddexp(torch.tensor([-100.0, -200, -300]), torch.tensor([-1.0, -2, -3]))
+        tensor([-1., -2., -3.])
+        >>> torch.logaddexp(torch.tensor([1.0, 2000, 30000]), torch.tensor([-1.0, -2, -3]))
+        tensor([1.1269e+00, 2.0000e+03, 3.0000e+04])
+    """
+    ...
+def logaddexp2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logaddexp2(input, other, *, out=None) -> Tensor
+    
+    Logarithm of the sum of exponentiations of the inputs in base-2.
+    
+    Calculates pointwise :math:`\log_2\left(2^x + 2^y\right)`. See
+    :func:`torch.logaddexp` for more details.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    """
+    ...
+@overload
+def logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logcumsumexp(input, dim, *, out=None) -> Tensor
+    Returns the logarithm of the cumulative summation of the exponentiation of
+    elements of :attr:`input` in the dimension :attr:`dim`.
+    
+    For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is
+    
+        .. math::
+            \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{k=0}^{j} \exp(x_{ik})
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> torch.logcumsumexp(a, dim=0)
+        tensor([-0.42296738, -0.04462666,  0.86278635,  0.94622083,  1.05277811,
+                 1.39202815,  1.83525007,  1.84492621,  2.06084887,  2.06844475])
+    """
+    ...
+@overload
+def logcumsumexp(input: Tensor, dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logcumsumexp(input, dim, *, out=None) -> Tensor
+    Returns the logarithm of the cumulative summation of the exponentiation of
+    elements of :attr:`input` in the dimension :attr:`dim`.
+    
+    For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is
+    
+        .. math::
+            \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{k=0}^{j} \exp(x_{ik})
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim  (int): the dimension to do the operation over
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(10)
+        >>> torch.logcumsumexp(a, dim=0)
+        tensor([-0.42296738, -0.04462666,  0.86278635,  0.94622083,  1.05277811,
+                 1.39202815,  1.83525007,  1.84492621,  2.06084887,  2.06844475])
+    """
+    ...
+def logdet(input: Tensor) -> Tensor: 
+    r"""
+    logdet(input) -> Tensor
+    
+    Calculates log determinant of a square matrix or batches of square matrices.
+    
+    It returns ``-inf`` if the input has a determinant of zero, and ``NaN`` if it has
+    a negative determinant.
+    
+    .. note::
+        Backward through :meth:`logdet` internally uses SVD results when :attr:`input`
+        is not invertible. In this case, double backward through :meth:`logdet` will
+        be unstable when :attr:`input` doesn't have distinct singular values. See
+        :func:`torch.linalg.svd` for details.
+    
+    .. seealso::
+    
+            :func:`torch.linalg.slogdet` computes the sign (resp. angle) and natural logarithm of the
+            absolute value of the determinant of real-valued (resp. complex) square matrices.
+    
+    Arguments:
+        input (Tensor): the input tensor of size ``(*, n, n)`` where ``*`` is zero or more
+                    batch dimensions.
+    
+    Example::
+    
+        >>> A = torch.randn(3, 3)
+        >>> torch.det(A)
+        tensor(0.2611)
+        >>> torch.logdet(A)
+        tensor(-1.3430)
+        >>> A = torch.randn(3, 2, 2); A
+        tensor([[[ 0.9254, -0.6213],
+                 [-0.5787,  1.6843]],
+    
+                [[ 0.3242, -0.9665],
+                 [ 0.4539, -0.0887]],
+    
+                [[ 1.1336, -0.4025],
+                 [-0.7089,  0.9032]]])
+        >>> A.det()
+        tensor([1.1990, 0.4099, 0.7386])
+        >>> A.det().log()
+        tensor([ 0.1815, -0.8917, -0.3031])
+    """
+    ...
+def logical_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logical_and(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise logical AND of the given input tensors. Zeros are treated as ``False`` and nonzeros are
+    treated as ``True``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the tensor to compute AND with
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.logical_and(torch.tensor([True, False, True]), torch.tensor([True, False, False]))
+        tensor([ True, False, False])
+        >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8)
+        >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8)
+        >>> torch.logical_and(a, b)
+        tensor([False, False,  True, False])
+        >>> torch.logical_and(a.double(), b.double())
+        tensor([False, False,  True, False])
+        >>> torch.logical_and(a.double(), b)
+        tensor([False, False,  True, False])
+        >>> torch.logical_and(a, b, out=torch.empty(4, dtype=torch.bool))
+        tensor([False, False,  True, False])
+    """
+    ...
+def logical_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logical_not(input, *, out=None) -> Tensor
+    
+    Computes the element-wise logical NOT of the given input tensor. If not specified, the output tensor will have the bool
+    dtype. If the input tensor is not a bool tensor, zeros are treated as ``False`` and non-zeros are treated as ``True``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.logical_not(torch.tensor([True, False]))
+        tensor([False,  True])
+        >>> torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8))
+        tensor([ True, False, False])
+        >>> torch.logical_not(torch.tensor([0., 1.5, -10.], dtype=torch.double))
+        tensor([ True, False, False])
+        >>> torch.logical_not(torch.tensor([0., 1., -10.], dtype=torch.double), out=torch.empty(3, dtype=torch.int16))
+        tensor([1, 0, 0], dtype=torch.int16)
+    """
+    ...
+def logical_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logical_or(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise logical OR of the given input tensors. Zeros are treated as ``False`` and nonzeros are
+    treated as ``True``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the tensor to compute OR with
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.logical_or(torch.tensor([True, False, True]), torch.tensor([True, False, False]))
+        tensor([ True, False,  True])
+        >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8)
+        >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8)
+        >>> torch.logical_or(a, b)
+        tensor([ True,  True,  True, False])
+        >>> torch.logical_or(a.double(), b.double())
+        tensor([ True,  True,  True, False])
+        >>> torch.logical_or(a.double(), b)
+        tensor([ True,  True,  True, False])
+        >>> torch.logical_or(a, b, out=torch.empty(4, dtype=torch.bool))
+        tensor([ True,  True,  True, False])
+    """
+    ...
+def logical_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logical_xor(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise logical XOR of the given input tensors. Zeros are treated as ``False`` and nonzeros are
+    treated as ``True``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the tensor to compute XOR with
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.logical_xor(torch.tensor([True, False, True]), torch.tensor([True, False, False]))
+        tensor([False, False,  True])
+        >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8)
+        >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8)
+        >>> torch.logical_xor(a, b)
+        tensor([ True,  True, False, False])
+        >>> torch.logical_xor(a.double(), b.double())
+        tensor([ True,  True, False, False])
+        >>> torch.logical_xor(a.double(), b)
+        tensor([ True,  True, False, False])
+        >>> torch.logical_xor(a, b, out=torch.empty(4, dtype=torch.bool))
+        tensor([ True,  True, False, False])
+    """
+    ...
+def logit(input: Tensor, eps: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logit(input, eps=None, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.logit`.
+    """
+    ...
+def logit_(input: Tensor, eps: Optional[_float] = None) -> Tensor: ...
+@overload
+def logspace(start: Number, end: Number, steps: Optional[_int] = None, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :math:`\text{base}^{\text{start}}` to
+    :math:`\text{base}^{\text{end}}`, inclusive, on a logarithmic scale
+    with base :attr:`base`. That is, the values are:
+    
+    .. math::
+        (\text{base}^{\text{start}},
+        \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \ldots,
+        \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \text{base}^{\text{end}})
+    
+    
+    
+    From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+        base (float, optional): base of the logarithm function. Default: ``10.0``.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.logspace(start=-10, end=10, steps=5)
+        tensor([ 1.0000e-10,  1.0000e-05,  1.0000e+00,  1.0000e+05,  1.0000e+10])
+        >>> torch.logspace(start=0.1, end=1.0, steps=5)
+        tensor([  1.2589,   2.1135,   3.5481,   5.9566,  10.0000])
+        >>> torch.logspace(start=0.1, end=1.0, steps=1)
+        tensor([1.2589])
+        >>> torch.logspace(start=2, end=2, steps=1, base=2)
+        tensor([4.0])
+    """
+    ...
+@overload
+def logspace(start: Tensor, end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to
+    :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale
+    with base :attr:`base`. That is, the values are:
+    
+    .. math::
+        (\text{base}^{\text{start}},
+        \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \ldots,
+        \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \text{base}^{\text{end}})
+    
+    
+    
+    From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+        base (float, optional): base of the logarithm function. Default: ``10.0``.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if None, uses the global default dtype (see torch.get_default_dtype())
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.logspace(start=-10, end=10, steps=5)
+        tensor([ 1.0000e-10,  1.0000e-05,  1.0000e+00,  1.0000e+05,  1.0000e+10])
+        >>> torch.logspace(start=0.1, end=1.0, steps=5)
+        tensor([  1.2589,   2.1135,   3.5481,   5.9566,  10.0000])
+        >>> torch.logspace(start=0.1, end=1.0, steps=1)
+        tensor([1.2589])
+        >>> torch.logspace(start=2, end=2, steps=1, base=2)
+        tensor([4.0])
+    """
+    ...
+@overload
+def logspace(start: Union[Number, _complex], end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, pin_memory=False, requires_grad=False) -> Tensor
+    
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :math:`{\text{base}}^{\text{start}}` to
+    :math:`{\text{base}}^{\text{end}}`, inclusive, on a logarithmic scale
+    with base :attr:`base`. That is, the values are:
+    
+    .. math::
+        (\text{base}^{\text{start}},
+        \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \ldots,
+        \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \text{base}^{\text{end}})
+    
+    
+    
+    From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+        base (float, optional): base of the logarithm function. Default: ``10.0``.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if ``None``, uses the global default dtype (see :func:`torch.get_default_dtype`)
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.logspace(start=-10, end=10, steps=5)
+        tensor([ 1.0000e-10,  1.0000e-05,  1.0000e+00,  1.0000e+05,  1.0000e+10])
+        >>> torch.logspace(start=0.1, end=1.0, steps=5)
+        tensor([  1.2589,   2.1135,   3.5481,   5.9566,  10.0000])
+        >>> torch.logspace(start=0.1, end=1.0, steps=1)
+        tensor([1.2589])
+        >>> torch.logspace(start=2, end=2, steps=1, base=2)
+        tensor([4.0])
+    """
+    ...
+@overload
+def logspace(start: Tensor, end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, pin_memory=False, requires_grad=False) -> Tensor
+    
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :math:`{\text{base}}^{\text{start}}` to
+    :math:`{\text{base}}^{\text{end}}`, inclusive, on a logarithmic scale
+    with base :attr:`base`. That is, the values are:
+    
+    .. math::
+        (\text{base}^{\text{start}},
+        \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \ldots,
+        \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \text{base}^{\text{end}})
+    
+    
+    
+    From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+        base (float, optional): base of the logarithm function. Default: ``10.0``.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if ``None``, uses the global default dtype (see :func:`torch.get_default_dtype`)
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.logspace(start=-10, end=10, steps=5)
+        tensor([ 1.0000e-10,  1.0000e-05,  1.0000e+00,  1.0000e+05,  1.0000e+10])
+        >>> torch.logspace(start=0.1, end=1.0, steps=5)
+        tensor([  1.2589,   2.1135,   3.5481,   5.9566,  10.0000])
+        >>> torch.logspace(start=0.1, end=1.0, steps=1)
+        tensor([1.2589])
+        >>> torch.logspace(start=2, end=2, steps=1, base=2)
+        tensor([4.0])
+    """
+    ...
+@overload
+def logspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, pin_memory=False, requires_grad=False) -> Tensor
+    
+    
+    Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly
+    spaced from :math:`{\text{base}}^{\text{start}}` to
+    :math:`{\text{base}}^{\text{end}}`, inclusive, on a logarithmic scale
+    with base :attr:`base`. That is, the values are:
+    
+    .. math::
+        (\text{base}^{\text{start}},
+        \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \ldots,
+        \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})},
+        \text{base}^{\text{end}})
+    
+    
+    
+    From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior.
+    
+    Args:
+        start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional
+        end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional
+        steps (int): size of the constructed tensor
+        base (float, optional): base of the logarithm function. Default: ``10.0``.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (torch.dtype, optional): the data type to perform the computation in.
+            Default: if ``None``, uses the global default dtype (see :func:`torch.get_default_dtype`)
+            when both :attr:`start` and :attr:`end` are real,
+            and corresponding complex dtype when either is complex.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.logspace(start=-10, end=10, steps=5)
+        tensor([ 1.0000e-10,  1.0000e-05,  1.0000e+00,  1.0000e+05,  1.0000e+10])
+        >>> torch.logspace(start=0.1, end=1.0, steps=5)
+        tensor([  1.2589,   2.1135,   3.5481,   5.9566,  10.0000])
+        >>> torch.logspace(start=0.1, end=1.0, steps=1)
+        tensor([1.2589])
+        >>> torch.logspace(start=2, end=2, steps=1, base=2)
+        tensor([4.0])
+    """
+    ...
+@overload
+def logsumexp(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logsumexp(input, dim, keepdim=False, *, out=None) -> Tensor
+    
+    Returns the log of summed exponentials of each row of the :attr:`input`
+    tensor in the given dimension :attr:`dim`. The computation is numerically
+    stabilized.
+    
+    For summation index :math:`j` given by :attr:`dim` and other indices :math:`i`, the result is
+    
+        .. math::
+            \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij})
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+            Required: this signature declares no default for :attr:`dim`.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(3, 3)
+        >>> torch.logsumexp(a, 1)
+        tensor([1.4907, 1.0593, 1.5696])
+        >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1)))
+        tensor(1.6859e-07)
+    """
+    ...
+@overload
+def logsumexp(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    logsumexp(input, dim, keepdim=False, *, out=None) -> Tensor
+    
+    Returns the log of summed exponentials of each row of the :attr:`input`
+    tensor in the given dimension :attr:`dim`. The computation is numerically
+    stabilized.
+    
+    For summation index :math:`j` given by :attr:`dim` and other indices :math:`i`, the result is
+    
+        .. math::
+            \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij})
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (sequence): the dimension or dimensions to reduce. This overload types it as a
+            sequence of ``str``, ``Ellipsis`` or ``None`` entries, and declares no default.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(3, 3)
+        >>> torch.logsumexp(a, 1)
+        tensor([1.4907, 1.0593, 1.5696])
+        >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1)))
+        tensor(1.6859e-07)
+    """
+    ...
+@overload
+def lstm(data: Tensor, batch_sizes: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor, Tensor]: ...  # overload taking `data` plus `batch_sizes`; it has no `batch_first` flag
+@overload
+def lstm(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor]: ...  # overload taking `input` plus `batch_first`; it has no `batch_sizes`
+def lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: ...  # `b_ih` and `b_hh` are optional and default to None
+@overload
+def lt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lt(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} < \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is ``True`` where :attr:`input` is less than :attr:`other` and ``False`` elsewhere
+    
+    Example::
+    
+        >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[False, False], [True, False]])
+    """
+    ...
+@overload
+def lt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lt(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} < \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is ``True`` where :attr:`input` is less than :attr:`other` and ``False`` elsewhere
+    
+    Example::
+    
+        >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[False, False], [True, False]])
+    """
+    ...
+def lu_solve(input: Tensor, LU_data: Tensor, LU_pivots: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    lu_solve(input, LU_data, LU_pivots, *, out=None) -> Tensor
+    
+    Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted
+    LU factorization of A from :func:`~linalg.lu_factor`.
+    
+    This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`.
+    
+    .. warning::
+    
+        :func:`torch.lu_solve` is deprecated in favor of :func:`torch.linalg.lu_solve`.
+        :func:`torch.lu_solve` will be removed in a future PyTorch release.
+        ``X = torch.lu_solve(B, LU, pivots)`` should be replaced with
+    
+        .. code:: python
+    
+            X = torch.linalg.lu_solve(LU, pivots, B)
+    
+    Arguments:
+        input (Tensor): the RHS tensor :math:`b` of size :math:`(*, m, k)`, where :math:`*`
+                        is zero or more batch dimensions.
+        LU_data (Tensor): the pivoted LU factorization of A from :meth:`~linalg.lu_factor` of size :math:`(*, m, m)`,
+                           where :math:`*` is zero or more batch dimensions.
+        LU_pivots (IntTensor): the pivots of the LU factorization from :meth:`~linalg.lu_factor` of size :math:`(*, m)`,
+                               where :math:`*` is zero or more batch dimensions.
+                               The batch dimensions of :attr:`LU_pivots` must be equal to the batch dimensions of
+                               :attr:`LU_data`.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> A = torch.randn(2, 3, 3)
+        >>> b = torch.randn(2, 3, 1)
+        >>> LU, pivots = torch.linalg.lu_factor(A)
+        >>> x = torch.lu_solve(b, LU, pivots)
+        >>> torch.dist(A @ x, b)
+        tensor(1.00000e-07 *
+               2.8312)
+    """
+    ...
+def lu_unpack(LU_data: Tensor, LU_pivots: Tensor, unpack_data: _bool = True, unpack_pivots: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.lu_unpack: 
+    r"""
+    lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True, *, out=None) -> (Tensor, Tensor, Tensor)
+    
+    Unpacks the LU decomposition returned by :func:`~linalg.lu_factor` into the `P, L, U` matrices.
+    
+    .. seealso::
+    
+        :func:`~linalg.lu` returns the matrices from the LU decomposition. Its gradient formula is more efficient
+        than that of doing :func:`~linalg.lu_factor` followed by :func:`~linalg.lu_unpack`.
+    
+    Args:
+        LU_data (Tensor): the packed LU factorization data
+        LU_pivots (Tensor): the packed LU factorization pivots
+        unpack_data (bool): flag indicating if the data should be unpacked.
+                            If ``False``, then the returned ``L`` and ``U`` are empty tensors.
+                            Default: ``True``
+        unpack_pivots (bool): flag indicating if the pivots should be unpacked into a permutation matrix ``P``.
+                              If ``False``, then the returned ``P`` is an empty tensor.
+                              Default: ``True``
+    
+    Keyword args:
+        out (tuple, optional): output tuple of three tensors. Ignored if ``None``.
+    
+    Returns:
+        A namedtuple ``(P, L, U)``
+    
+    Examples::
+    
+        >>> A = torch.randn(2, 3, 3)
+        >>> LU, pivots = torch.linalg.lu_factor(A)
+        >>> P, L, U = torch.lu_unpack(LU, pivots)
+        >>> # We can recover A from the factorization
+        >>> A_ = P @ L @ U
+        >>> torch.allclose(A, A_)
+        True
+    
+        >>> # LU factorization of a rectangular matrix:
+        >>> A = torch.randn(2, 3, 2)
+        >>> LU, pivots = torch.linalg.lu_factor(A)
+        >>> P, L, U = torch.lu_unpack(LU, pivots)
+        >>> # P, L, U are the same as returned by linalg.lu
+        >>> P_, L_, U_ = torch.linalg.lu(A)
+        >>> torch.allclose(P, P_) and torch.allclose(L, L_) and torch.allclose(U, U_)
+        True
+    """
+    ...
+def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ...  # NOTE(review): `reduction` is an integer code (default 1); the code-to-mode mapping is not visible here - confirm
+@overload
+def masked_fill(input: Tensor, mask: Tensor, value: Tensor) -> Tensor: ...  # overload: fill `value` supplied as a Tensor
+@overload
+def masked_fill(input: Tensor, mask: Tensor, value: Union[Number, _complex]) -> Tensor: ...  # overload: fill `value` supplied as a number (Number or complex)
+def masked_scatter(input: Tensor, mask: Tensor, source: Tensor) -> Tensor: ...
+def masked_select(input: Tensor, mask: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    masked_select(input, mask, *, out=None) -> Tensor
+    
+    Returns a new 1-D tensor which indexes the :attr:`input` tensor according to
+    the boolean mask :attr:`mask` which is a `BoolTensor`.
+    
+    The shapes of the :attr:`mask` tensor and the :attr:`input` tensor don't need
+    to match, but they must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    .. note:: The returned tensor does **not** use the same storage
+              as the original tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+        mask (BoolTensor): the tensor containing the binary mask to index with
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> x
+        tensor([[ 0.3552, -2.3825, -0.8297,  0.3477],
+                [-1.2035,  1.2252,  0.5002,  0.6248],
+                [ 0.1307, -2.0608,  0.1244,  2.0139]])
+        >>> mask = x.ge(0.5)
+        >>> mask
+        tensor([[False, False, False, False],
+                [False, True, True, True],
+                [False, False, False, True]])
+        >>> torch.masked_select(x, mask)
+        tensor([ 1.2252,  0.5002,  0.6248,  2.0139])
+    """
+    ...
+def matmul(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    matmul(input, other, *, out=None) -> Tensor
+    
+    Matrix product of two tensors.
+    
+    The behavior depends on the dimensionality of the tensors as follows:
+    
+    - If both tensors are 1-dimensional, the dot product (scalar) is returned.
+    - If both arguments are 2-dimensional, the matrix-matrix product is returned.
+    - If the first argument is 1-dimensional and the second argument is 2-dimensional,
+      a 1 is prepended to its dimension for the purpose of the matrix multiply.
+      After the matrix multiply, the prepended dimension is removed.
+    - If the first argument is 2-dimensional and the second argument is 1-dimensional,
+      the matrix-vector product is returned.
+    - If both arguments are at least 1-dimensional and at least one argument is
+      N-dimensional (where N > 2), then a batched matrix multiply is returned.  If the first
+      argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the
+      batched matrix multiply and removed after.  If the second argument is 1-dimensional, a
+      1 is appended to its dimension for the purpose of the batched matrix multiply and removed after.
+      The non-matrix (i.e. batch) dimensions are :ref:`broadcasted <broadcasting-semantics>` (and thus
+      must be broadcastable).  For example, if :attr:`input` is a
+      :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)`
+      tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor.
+    
+      Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs
+      are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a
+      :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)`
+      tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the
+      matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`. In particular the
+    matrix-matrix (both arguments 2-dimensional) supports sparse arguments with the same restrictions
+    as :func:`torch.mm`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    .. note::
+    
+        The 1-dimensional dot product version of this function does not support an :attr:`out` parameter.
+    
+    Arguments:
+        input (Tensor): the first tensor to be multiplied
+        other (Tensor): the second tensor to be multiplied
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> # vector x vector
+        >>> tensor1 = torch.randn(3)
+        >>> tensor2 = torch.randn(3)
+        >>> torch.matmul(tensor1, tensor2).size()
+        torch.Size([])
+        >>> # matrix x vector
+        >>> tensor1 = torch.randn(3, 4)
+        >>> tensor2 = torch.randn(4)
+        >>> torch.matmul(tensor1, tensor2).size()
+        torch.Size([3])
+        >>> # batched matrix x broadcasted vector
+        >>> tensor1 = torch.randn(10, 3, 4)
+        >>> tensor2 = torch.randn(4)
+        >>> torch.matmul(tensor1, tensor2).size()
+        torch.Size([10, 3])
+        >>> # batched matrix x batched matrix
+        >>> tensor1 = torch.randn(10, 3, 4)
+        >>> tensor2 = torch.randn(10, 4, 5)
+        >>> torch.matmul(tensor1, tensor2).size()
+        torch.Size([10, 3, 5])
+        >>> # batched matrix x broadcasted matrix
+        >>> tensor1 = torch.randn(10, 3, 4)
+        >>> tensor2 = torch.randn(4, 5)
+        >>> torch.matmul(tensor1, tensor2).size()
+        torch.Size([10, 3, 5])
+    """
+    ...
+def matrix_exp(input: Tensor) -> Tensor: 
+    r"""
+    matrix_exp(input) -> Tensor
+    
+    Alias for :func:`torch.linalg.matrix_exp`.
+    """
+    ...
+def matrix_power(input: Tensor, n: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    matrix_power(input, n, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.linalg.matrix_power`.
+    """
+    ...
+@overload
+def max(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    max(input, *, out=None) -> Tensor
+    
+    Returns the maximum value of all elements in the ``input`` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``max(dim=0)``
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6763,  0.7445, -2.2369]])
+        >>> torch.max(a)
+        tensor(0.7445)
+    
+    .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`. And ``indices`` is the index location of each maximum value found
+    (argmax).
+    
+    If ``keepdim`` is ``True``, the output tensors are of the same size
+    as ``input`` except in the dimension ``dim`` where they are of size 1.
+    Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting
+    in the output tensors having 1 fewer dimension than ``input``.
+    
+    .. note:: If there are multiple maximal values in a reduced row then
+              the indices of the first maximal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``.
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (max, max_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-1.2360, -0.2942, -0.1222,  0.8475],
+                [ 1.1949, -1.1127, -2.2379, -0.6702],
+                [ 1.5717, -0.9207,  0.1297, -1.8768],
+                [-0.6172,  1.0036, -0.6060, -0.2432]])
+        >>> torch.max(a, 1)
+        torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1]))
+    
+    .. function:: max(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.maximum`.
+    """
+    ...
+@overload
+def max(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    max(input, *, out=None) -> Tensor
+    
+    Returns the maximum value of all elements in the ``input`` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``max(dim=0)``
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6763,  0.7445, -2.2369]])
+        >>> torch.max(a)
+        tensor(0.7445)
+    
+    .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`. And ``indices`` is the index location of each maximum value found
+    (argmax).
+    
+    If ``keepdim`` is ``True``, the output tensors are of the same size
+    as ``input`` except in the dimension ``dim`` where they are of size 1.
+    Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting
+    in the output tensors having 1 fewer dimension than ``input``.
+    
+    .. note:: If there are multiple maximal values in a reduced row then
+              the indices of the first maximal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``.
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (max, max_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-1.2360, -0.2942, -0.1222,  0.8475],
+                [ 1.1949, -1.1127, -2.2379, -0.6702],
+                [ 1.5717, -0.9207,  0.1297, -1.8768],
+                [-0.6172,  1.0036, -0.6060, -0.2432]])
+        >>> torch.max(a, 1)
+        torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1]))
+    
+    .. function:: max(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.maximum`.
+    """
+    ...
+@overload
+def max(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max:  # overload for the ``max(input, dim, keepdim=False, *, out=None)`` form: returns the (values, indices) namedtuple
+    r"""
+    max(input) -> Tensor
+    
+    Returns the maximum value of all elements in the :attr:`input` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``max(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6763,  0.7445, -2.2369]])
+        >>> torch.max(a)
+        tensor(0.7445)
+    
+    .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, and ``indices`` is the index location of each maximum value found
+    (argmax).
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting
+    in the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: If there are multiple maximal values in a reduced row then
+              the indices of the first maximal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``.
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (max, max_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-1.2360, -0.2942, -0.1222,  0.8475],
+                [ 1.1949, -1.1127, -2.2379, -0.6702],
+                [ 1.5717, -0.9207,  0.1297, -1.8768],
+                [-0.6172,  1.0036, -0.6060, -0.2432]])
+        >>> torch.max(a, 1)
+        torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1]))
+    
+    .. function:: max(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.maximum`.
+    """
+    ...
+@overload
+def max(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max:  # overload of the dim-reduction form with dim typed as str / ellipsis / None (presumably a dimension name -- TODO confirm)
+    r"""
+    max(input) -> Tensor
+    
+    Returns the maximum value of all elements in the :attr:`input` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``max(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6763,  0.7445, -2.2369]])
+        >>> torch.max(a)
+        tensor(0.7445)
+    
+    .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, and ``indices`` is the index location of each maximum value found
+    (argmax).
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting
+    in the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: If there are multiple maximal values in a reduced row then
+              the indices of the first maximal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``.
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (max, max_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-1.2360, -0.2942, -0.1222,  0.8475],
+                [ 1.1949, -1.1127, -2.2379, -0.6702],
+                [ 1.5717, -0.9207,  0.1297, -1.8768],
+                [-0.6172,  1.0036, -0.6060, -0.2432]])
+        >>> torch.max(a, 1)
+        torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1]))
+    
+    .. function:: max(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.maximum`.
+    """
+    ...
+def max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ...  # stub without docstring; stride defaults to an empty tuple (presumably "use kernel_size" -- TODO confirm against the torch docs)
+def max_pool1d_with_indices(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tuple[Tensor, Tensor]: ...  # stub without docstring; returns a pair of tensors (presumably pooled output and indices, going by the name -- TODO confirm)
+def max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ...  # stub without docstring; same parameter list and defaults as max_pool1d
+def max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ...  # stub without docstring; same parameter list and defaults as max_pool1d
+def maximum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    maximum(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise maximum of :attr:`input` and :attr:`other`.
+    
+    .. note::
+        If one of the elements being compared is a NaN, then that element is returned.
+        :func:`maximum` is not supported for tensors with complex dtypes.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor((1, 2, -1))
+        >>> b = torch.tensor((3, 0, 4))
+        >>> torch.maximum(a, b)
+        tensor([3, 2, 4])
+    """
+    ...
+@overload
+def mean(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor:  # overload for the ``mean(input, *, dtype=None)`` form: mean over all elements
+    r"""
+    mean(input, *, dtype=None) -> Tensor
+    
+    Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex.
+    
+    Args:
+        input (Tensor):
+          the input tensor, either of floating point or complex dtype
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.2294, -0.5481,  1.3288]])
+        >>> torch.mean(a)
+        tensor(0.3367)
+    
+    .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor
+       :noindex:
+    
+    Returns the mean value of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    .. seealso::
+    
+        :func:`torch.nanmean` computes the mean value of `non-NaN` elements.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.3841,  0.6320,  0.4254, -0.7384],
+                [-0.9644,  1.0131, -0.6549, -1.4279],
+                [-0.2951, -1.3350, -0.7694,  0.5600],
+                [ 1.0842, -0.9580,  0.3623,  0.2343]])
+        >>> torch.mean(a, 1)
+        tensor([-0.0163, -0.5085, -0.4599,  0.1807])
+        >>> torch.mean(a, 1, True)
+        tensor([[-0.0163],
+                [-0.5085],
+                [-0.4599],
+                [ 0.1807]])
+    """
+    ...
+@overload
+def mean(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor:  # overload for the ``mean(input, dim, keepdim=False, *, dtype=None, out=None)`` form; dim is required here but may be None
+    r"""
+    mean(input, *, dtype=None) -> Tensor
+    
+    Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex.
+    
+    Args:
+        input (Tensor):
+          the input tensor, either of floating point or complex dtype
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.2294, -0.5481,  1.3288]])
+        >>> torch.mean(a)
+        tensor(0.3367)
+    
+    .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor
+       :noindex:
+    
+    Returns the mean value of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    .. seealso::
+    
+        :func:`torch.nanmean` computes the mean value of `non-NaN` elements.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.3841,  0.6320,  0.4254, -0.7384],
+                [-0.9644,  1.0131, -0.6549, -1.4279],
+                [-0.2951, -1.3350, -0.7694,  0.5600],
+                [ 1.0842, -0.9580,  0.3623,  0.2343]])
+        >>> torch.mean(a, 1)
+        tensor([-0.0163, -0.5085, -0.4599,  0.1807])
+        >>> torch.mean(a, 1, True)
+        tensor([[-0.0163],
+                [-0.5085],
+                [-0.4599],
+                [ 0.1807]])
+    """
+    ...
+@overload
+def mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor:  # overload of the dim-reduction form with dim as a sequence of str / ellipsis / None entries (presumably dimension names -- TODO confirm)
+    r"""
+    mean(input, *, dtype=None) -> Tensor
+    
+    Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex.
+    
+    Args:
+        input (Tensor):
+          the input tensor, either of floating point or complex dtype
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.2294, -0.5481,  1.3288]])
+        >>> torch.mean(a)
+        tensor(0.3367)
+    
+    .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor
+       :noindex:
+    
+    Returns the mean value of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    .. seealso::
+    
+        :func:`torch.nanmean` computes the mean value of `non-NaN` elements.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.3841,  0.6320,  0.4254, -0.7384],
+                [-0.9644,  1.0131, -0.6549, -1.4279],
+                [-0.2951, -1.3350, -0.7694,  0.5600],
+                [ 1.0842, -0.9580,  0.3623,  0.2343]])
+        >>> torch.mean(a, 1)
+        tensor([-0.0163, -0.5085, -0.4599,  0.1807])
+        >>> torch.mean(a, 1, True)
+        tensor([[-0.0163],
+                [-0.5085],
+                [-0.4599],
+                [ 0.1807]])
+    """
+    ...
+@overload
+def median(input: Tensor) -> Tensor:  # overload for the ``median(input)`` form: median over all values, returned as a Tensor
+    r"""
+    median(input) -> Tensor
+    
+    Returns the median of the values in :attr:`input`.
+    
+    .. note::
+        The median is not unique for :attr:`input` tensors with an even number
+        of elements. In this case the lower of the two medians is returned. To
+        compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``median(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 1.5219, -1.5212,  0.2202]])
+        >>> torch.median(a)
+        tensor(0.2202)
+    
+    .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input`
+    in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`.
+    
+    By default, :attr:`dim` is the last dimension of the :attr:`input` tensor.
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note::
+        The median is not unique for :attr:`input` tensors with an even number
+        of elements in the dimension :attr:`dim`. In this case the lower of the
+        two medians is returned. To compute the mean of both medians in
+        :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead.
+    
+    .. warning::
+        ``indices`` does not necessarily contain the first occurrence of each
+        median value found, unless it is unique.
+        The exact implementation details are device-specific.
+        Do not expect the same result when run on CPU and GPU in general.
+        For the same reason do not expect the gradients to be deterministic.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second
+                                          tensor, which must have dtype long, with their indices in the dimension
+                                          :attr:`dim` of :attr:`input`.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 5)
+        >>> a
+        tensor([[ 0.2505, -0.3982, -0.9948,  0.3518, -1.3131],
+                [ 0.3180, -0.6993,  1.0436,  0.0438,  0.2270],
+                [-0.2751,  0.7303,  0.2192,  0.3321,  0.2488],
+                [ 1.0778, -1.9510,  0.7048,  0.4742, -0.7125]])
+        >>> torch.median(a, 1)
+        torch.return_types.median(values=tensor([-0.3982,  0.2270,  0.2488,  0.4742]), indices=tensor([1, 4, 4, 3]))
+    """
+    ...
+@overload
+def median(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median:  # overload for the dim-reduction form: returns the (values, indices) namedtuple; dim has no default in this stub signature
+    r"""
+    median(input) -> Tensor
+    
+    Returns the median of the values in :attr:`input`.
+    
+    .. note::
+        The median is not unique for :attr:`input` tensors with an even number
+        of elements. In this case the lower of the two medians is returned. To
+        compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``median(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 1.5219, -1.5212,  0.2202]])
+        >>> torch.median(a)
+        tensor(0.2202)
+    
+    .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input`
+    in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`.
+    
+    By default, :attr:`dim` is the last dimension of the :attr:`input` tensor.
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note::
+        The median is not unique for :attr:`input` tensors with an even number
+        of elements in the dimension :attr:`dim`. In this case the lower of the
+        two medians is returned. To compute the mean of both medians in
+        :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead.
+    
+    .. warning::
+        ``indices`` does not necessarily contain the first occurrence of each
+        median value found, unless it is unique.
+        The exact implementation details are device-specific.
+        Do not expect the same result when run on CPU and GPU in general.
+        For the same reason do not expect the gradients to be deterministic.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second
+                                          tensor, which must have dtype long, with their indices in the dimension
+                                          :attr:`dim` of :attr:`input`.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 5)
+        >>> a
+        tensor([[ 0.2505, -0.3982, -0.9948,  0.3518, -1.3131],
+                [ 0.3180, -0.6993,  1.0436,  0.0438,  0.2270],
+                [-0.2751,  0.7303,  0.2192,  0.3321,  0.2488],
+                [ 1.0778, -1.9510,  0.7048,  0.4742, -0.7125]])
+        >>> torch.median(a, 1)
+        torch.return_types.median(values=tensor([-0.3982,  0.2270,  0.2488,  0.4742]), indices=tensor([1, 4, 4, 3]))
+    """
+    ...
+@overload
+def median(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median:  # overload of the dim-reduction form with dim typed as str / ellipsis / None (presumably a dimension name -- TODO confirm)
+    r"""
+    median(input) -> Tensor
+    
+    Returns the median of the values in :attr:`input`.
+    
+    .. note::
+        The median is not unique for :attr:`input` tensors with an even number
+        of elements. In this case the lower of the two medians is returned. To
+        compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``median(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 1.5219, -1.5212,  0.2202]])
+        >>> torch.median(a)
+        tensor(0.2202)
+    
+    .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input`
+    in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`.
+    
+    By default, :attr:`dim` is the last dimension of the :attr:`input` tensor.
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note::
+        The median is not unique for :attr:`input` tensors with an even number
+        of elements in the dimension :attr:`dim`. In this case the lower of the
+        two medians is returned. To compute the mean of both medians in
+        :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead.
+    
+    .. warning::
+        ``indices`` does not necessarily contain the first occurrence of each
+        median value found, unless it is unique.
+        The exact implementation details are device-specific.
+        Do not expect the same result when run on CPU and GPU in general.
+        For the same reason do not expect the gradients to be deterministic.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second
+                                          tensor, which must have dtype long, with their indices in the dimension
+                                          :attr:`dim` of :attr:`input`.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 5)
+        >>> a
+        tensor([[ 0.2505, -0.3982, -0.9948,  0.3518, -1.3131],
+                [ 0.3180, -0.6993,  1.0436,  0.0438,  0.2270],
+                [-0.2751,  0.7303,  0.2192,  0.3321,  0.2488],
+                [ 1.0778, -1.9510,  0.7048,  0.4742, -0.7125]])
+        >>> torch.median(a, 1)
+        torch.return_types.median(values=tensor([-0.3982,  0.2270,  0.2488,  0.4742]), indices=tensor([1, 4, 4, 3]))
+    """
+    ...
+@overload
+def min(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor:  # overload for the ``min(input)`` form: minimum over all elements; this stub signature also accepts a keyword-only ``out``
+    r"""
+    min(input) -> Tensor
+    
+    Returns the minimum value of all elements in the :attr:`input` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``min(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6750,  1.0857,  1.7197]])
+        >>> torch.min(a)
+        tensor(0.6750)
+    
+    .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, and ``indices`` is the index location of each minimum value found
+    (argmin).
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size as
+    :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: If there are multiple minimal values in a reduced row then
+              the indices of the first minimal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the tuple of two output tensors (min, min_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.6248,  1.1334, -1.1899, -0.2803],
+                [-1.4644, -0.2635, -0.3651,  0.6134],
+                [ 0.2457,  0.0384,  1.0128,  0.7015],
+                [-0.1153,  2.9849,  2.1458,  0.5788]])
+        >>> torch.min(a, 1)
+        torch.return_types.min(values=tensor([-1.1899, -1.4644,  0.0384, -0.1153]), indices=tensor([2, 0, 1, 0]))
+    
+    .. function:: min(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.minimum`.
+    """
+    ...
+@overload
+def min(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor:  # overload for the ``min(input, other, *, out=None)`` form at the end of the docstring (see torch.minimum)
+    r"""
+    min(input) -> Tensor
+    
+    Returns the minimum value of all elements in the :attr:`input` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``min(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6750,  1.0857,  1.7197]])
+        >>> torch.min(a)
+        tensor(0.6750)
+    
+    .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, and ``indices`` is the index location of each minimum value found
+    (argmin).
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size as
+    :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: If there are multiple minimal values in a reduced row then
+              the indices of the first minimal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the tuple of two output tensors (min, min_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.6248,  1.1334, -1.1899, -0.2803],
+                [-1.4644, -0.2635, -0.3651,  0.6134],
+                [ 0.2457,  0.0384,  1.0128,  0.7015],
+                [-0.1153,  2.9849,  2.1458,  0.5788]])
+        >>> torch.min(a, 1)
+        torch.return_types.min(values=tensor([-1.1899, -1.4644,  0.0384, -0.1153]), indices=tensor([2, 0, 1, 0]))
+    
+    .. function:: min(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.minimum`.
+    """
+    ...
+@overload
+def min(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min:  # overload for the ``min(input, dim, keepdim=False, *, out=None)`` form: returns the (values, indices) namedtuple
+    r"""
+    min(input) -> Tensor
+    
+    Returns the minimum value of all elements in the :attr:`input` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``min(dim=0)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6750,  1.0857,  1.7197]])
+        >>> torch.min(a)
+        tensor(0.6750)
+    
+    .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, and ``indices`` is the index location of each minimum value found
+    (argmin).
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size as
+    :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: If there are multiple minimal values in a reduced row then
+              the indices of the first minimal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the tuple of two output tensors (min, min_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.6248,  1.1334, -1.1899, -0.2803],
+                [-1.4644, -0.2635, -0.3651,  0.6134],
+                [ 0.2457,  0.0384,  1.0128,  0.7015],
+                [-0.1153,  2.9849,  2.1458,  0.5788]])
+        >>> torch.min(a, 1)
+        torch.return_types.min(values=tensor([-1.1899, -1.4644,  0.0384, -0.1153]), indices=tensor([2, 0, 1, 0]))
+    
+    .. function:: min(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.minimum`.
+    """
+    ...
+@overload
+def min(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: 
+    r"""
+    min(input) -> Tensor
+    
+    Returns the minimum value of all elements in the :attr:`input` tensor.
+    
+    .. warning::
+        This function produces deterministic (sub)gradients unlike ``min(dim=0)``
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.6750,  1.0857,  1.7197]])
+        >>> torch.min(a)
+        tensor(0.6750)
+    
+    .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`. And ``indices`` is the index location of each minimum value found
+    (argmin).
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size as
+    :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: If there are multiple minimal values in a reduced row then
+              the indices of the first minimal value are returned.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the tuple of two output tensors (min, min_indices)
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[-0.6248,  1.1334, -1.1899, -0.2803],
+                [-1.4644, -0.2635, -0.3651,  0.6134],
+                [ 0.2457,  0.0384,  1.0128,  0.7015],
+                [-0.1153,  2.9849,  2.1458,  0.5788]])
+        >>> torch.min(a, 1)
+        torch.return_types.min(values=tensor([-1.1899, -1.4644,  0.0384, -0.1153]), indices=tensor([2, 0, 1, 0]))
+    
+    .. function:: min(input, other, *, out=None) -> Tensor
+       :noindex:
+    
+    See :func:`torch.minimum`.
+    """
+    ...
+def minimum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    minimum(input, other, *, out=None) -> Tensor
+    
+    Computes the element-wise minimum of :attr:`input` and :attr:`other`.
+    
+    .. note::
+        If one of the elements being compared is a NaN, then that element is returned.
+        :func:`minimum` is not supported for tensors with complex dtypes.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor((1, 2, -1))
+        >>> b = torch.tensor((3, 0, 4))
+        >>> torch.minimum(a, b)
+        tensor([1, 0, -1])
+    """
+    ...
+def miopen_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor]: ...
+def miopen_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ...
+def miopen_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def miopen_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def miopen_convolution_transpose(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ...
+def miopen_depthwise_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ...
+def miopen_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: _int, num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: _size, dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ...
+def mkldnn_adaptive_avg_pool2d(input: Tensor, output_size: Union[_int, _size], *, out: Optional[Tensor] = None) -> Tensor: ...
+def mkldnn_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ...
+def mkldnn_linear_backward_weights(grad_output: Tensor, input: Tensor, weight: Tensor, bias_defined: _bool) -> Tuple[Tensor, Tensor]: ...
+def mkldnn_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ...
+def mkldnn_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ...
+def mkldnn_rnn_layer(input: Tensor, weight0: Tensor, weight1: Tensor, weight2: Tensor, weight3: Tensor, hx_: Tensor, cx_: Tensor, reverse: _bool, batch_sizes: _size, mode: _int, hidden_size: _int, num_layers: _int, has_biases: _bool, bidirectional: _bool, batch_first: _bool, train: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ...
+def mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    mm(input, mat2, *, out=None) -> Tensor
+    
+    Performs a matrix multiplication of the matrices :attr:`input` and :attr:`mat2`.
+    
+    If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
+    :math:`(m \times p)` tensor, :attr:`out` will be a :math:`(n \times p)` tensor.
+    
+    .. note:: This function does not :ref:`broadcast <broadcasting-semantics>`.
+              For broadcasting matrix products, see :func:`torch.matmul`.
+    
+    Supports strided and sparse 2-D tensors as inputs, autograd with
+    respect to strided inputs.
+    
+    This operation has support for arguments with :ref:`sparse layouts<sparse-docs>`.
+    If :attr:`out` is provided its layout will be used. Otherwise, the result
+    layout will be deduced from that of :attr:`input`.
+    
+    
+    .. warning::
+        Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported,
+        or may not have autograd support. If you notice missing functionality please
+        open a feature request.
+    
+    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
+    
+    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
+    
+    Args:
+        input (Tensor): the first matrix to be matrix multiplied
+        mat2 (Tensor): the second matrix to be matrix multiplied
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> mat1 = torch.randn(2, 3)
+        >>> mat2 = torch.randn(3, 3)
+        >>> torch.mm(mat1, mat2)
+        tensor([[ 0.4851,  0.5037, -0.3633],
+                [-0.0760, -3.6705,  2.4784]])
+    """
+    ...
+@overload
+def mode(input: Tensor, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: 
+    r"""
+    mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the mode
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, i.e. a value which appears most often
+    in that row, and ``indices`` is the index location of each mode value found.
+    
+    By default, :attr:`dim` is the last dimension of the :attr:`input` tensor.
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size as
+    :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting
+    in the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: This function is not defined for ``torch.cuda.Tensor`` yet.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (values, indices)
+    
+    Example::
+    
+        >>> b = torch.tensor(
+               [[0, 0, 0, 2, 0, 0, 2],
+                [0, 3, 0, 0, 2, 0, 1],
+                [2, 2, 2, 0, 0, 0, 3],
+                [2, 2, 3, 0, 1, 1, 0],
+                [1, 1, 0, 0, 2, 0, 2]])
+        >>> torch.mode(b, 0)
+        torch.return_types.mode(
+        values=tensor([0, 2, 0, 0, 0, 0, 2]),
+        indices=tensor([1, 3, 4, 4, 2, 4, 4]))
+    """
+    ...
+@overload
+def mode(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: 
+    r"""
+    mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` is the mode
+    value of each row of the :attr:`input` tensor in the given dimension
+    :attr:`dim`, i.e. a value which appears most often
+    in that row, and ``indices`` is the index location of each mode value found.
+    
+    By default, :attr:`dim` is the last dimension of the :attr:`input` tensor.
+    
+    If :attr:`keepdim` is ``True``, the output tensors are of the same size as
+    :attr:`input` except in the dimension :attr:`dim` where they are of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting
+    in the output tensors having 1 fewer dimension than :attr:`input`.
+    
+    .. note:: This function is not defined for ``torch.cuda.Tensor`` yet.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out (tuple, optional): the result tuple of two output tensors (values, indices)
+    
+    Example::
+    
+        >>> b = torch.tensor(
+               [[0, 0, 0, 2, 0, 0, 2],
+                [0, 3, 0, 0, 2, 0, 1],
+                [2, 2, 2, 0, 0, 0, 3],
+                [2, 2, 3, 0, 1, 1, 0],
+                [1, 1, 0, 0, 2, 0, 2]])
+        >>> torch.mode(b, 0)
+        torch.return_types.mode(
+        values=tensor([0, 2, 0, 0, 0, 0, 2]),
+        indices=tensor([1, 3, 4, 4, 2, 4, 4]))
+    """
+    ...
+@overload
+def moveaxis(input: Tensor, source: _int, destination: _int) -> Tensor: 
+    r"""
+    moveaxis(input, source, destination) -> Tensor
+    
+    Alias for :func:`torch.movedim`.
+    
+    This function is equivalent to NumPy's moveaxis function.
+    
+    Examples::
+    
+        >>> t = torch.randn(3,2,1)
+        >>> t
+        tensor([[[-0.3362],
+                [-0.8437]],
+    
+                [[-0.9627],
+                [ 0.1727]],
+    
+                [[ 0.5173],
+                [-0.1398]]])
+        >>> torch.moveaxis(t, 1, 0).shape
+        torch.Size([2, 3, 1])
+        >>> torch.moveaxis(t, 1, 0)
+        tensor([[[-0.3362],
+                [-0.9627],
+                [ 0.5173]],
+    
+                [[-0.8437],
+                [ 0.1727],
+                [-0.1398]]])
+        >>> torch.moveaxis(t, (1, 2), (0, 1)).shape
+        torch.Size([2, 1, 3])
+        >>> torch.moveaxis(t, (1, 2), (0, 1))
+        tensor([[[-0.3362, -0.9627,  0.5173]],
+    
+                [[-0.8437,  0.1727, -0.1398]]])
+    """
+    ...
+@overload
+def moveaxis(input: Tensor, source: _size, destination: _size) -> Tensor: 
+    r"""
+    moveaxis(input, source, destination) -> Tensor
+    
+    Alias for :func:`torch.movedim`.
+    
+    This function is equivalent to NumPy's moveaxis function.
+    
+    Examples::
+    
+        >>> t = torch.randn(3,2,1)
+        >>> t
+        tensor([[[-0.3362],
+                [-0.8437]],
+    
+                [[-0.9627],
+                [ 0.1727]],
+    
+                [[ 0.5173],
+                [-0.1398]]])
+        >>> torch.moveaxis(t, 1, 0).shape
+        torch.Size([2, 3, 1])
+        >>> torch.moveaxis(t, 1, 0)
+        tensor([[[-0.3362],
+                [-0.9627],
+                [ 0.5173]],
+    
+                [[-0.8437],
+                [ 0.1727],
+                [-0.1398]]])
+        >>> torch.moveaxis(t, (1, 2), (0, 1)).shape
+        torch.Size([2, 1, 3])
+        >>> torch.moveaxis(t, (1, 2), (0, 1))
+        tensor([[[-0.3362, -0.9627,  0.5173]],
+    
+                [[-0.8437,  0.1727, -0.1398]]])
+    """
+    ...
+@overload
+def movedim(input: Tensor, source: _int, destination: _int) -> Tensor: 
+    r"""
+    movedim(input, source, destination) -> Tensor
+    
+    Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source`
+    to the position(s) in :attr:`destination`.
+    
+    Other dimensions of :attr:`input` that are not explicitly moved remain in
+    their original order and appear at the positions not specified in :attr:`destination`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        source (int or tuple of ints): Original positions of the dims to move. These must be unique.
+        destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique.
+    
+    Examples::
+    
+        >>> t = torch.randn(3,2,1)
+        >>> t
+        tensor([[[-0.3362],
+                [-0.8437]],
+    
+                [[-0.9627],
+                [ 0.1727]],
+    
+                [[ 0.5173],
+                [-0.1398]]])
+        >>> torch.movedim(t, 1, 0).shape
+        torch.Size([2, 3, 1])
+        >>> torch.movedim(t, 1, 0)
+        tensor([[[-0.3362],
+                [-0.9627],
+                [ 0.5173]],
+    
+                [[-0.8437],
+                [ 0.1727],
+                [-0.1398]]])
+        >>> torch.movedim(t, (1, 2), (0, 1)).shape
+        torch.Size([2, 1, 3])
+        >>> torch.movedim(t, (1, 2), (0, 1))
+        tensor([[[-0.3362, -0.9627,  0.5173]],
+    
+                [[-0.8437,  0.1727, -0.1398]]])
+    """
+    ...
+@overload
+def movedim(input: Tensor, source: _size, destination: _size) -> Tensor: 
+    r"""
+    movedim(input, source, destination) -> Tensor
+    
+    Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source`
+    to the position(s) in :attr:`destination`.
+    
+    Other dimensions of :attr:`input` that are not explicitly moved remain in
+    their original order and appear at the positions not specified in :attr:`destination`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        source (int or tuple of ints): Original positions of the dims to move. These must be unique.
+        destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique.
+    
+    Examples::
+    
+        >>> t = torch.randn(3,2,1)
+        >>> t
+        tensor([[[-0.3362],
+                [-0.8437]],
+    
+                [[-0.9627],
+                [ 0.1727]],
+    
+                [[ 0.5173],
+                [-0.1398]]])
+        >>> torch.movedim(t, 1, 0).shape
+        torch.Size([2, 3, 1])
+        >>> torch.movedim(t, 1, 0)
+        tensor([[[-0.3362],
+                [-0.9627],
+                [ 0.5173]],
+    
+                [[-0.8437],
+                [ 0.1727],
+                [-0.1398]]])
+        >>> torch.movedim(t, (1, 2), (0, 1)).shape
+        torch.Size([2, 1, 3])
+        >>> torch.movedim(t, (1, 2), (0, 1))
+        tensor([[[-0.3362, -0.9627,  0.5173]],
+    
+                [[-0.8437,  0.1727, -0.1398]]])
+    """
+    ...
+def msort(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    msort(input, *, out=None) -> Tensor
+    
+    Sorts the elements of the :attr:`input` tensor along its first dimension
+    in ascending order by value.
+    
+    .. note:: ``torch.msort(t)`` is equivalent to ``torch.sort(t, dim=0)[0]``.
+              See also :func:`torch.sort`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(3, 4)
+        >>> t
+        tensor([[-0.1321,  0.4370, -1.2631, -1.1289],
+                [-2.0527, -1.1250,  0.2275,  0.3077],
+                [-0.0881, -0.1259, -0.5495,  1.0284]])
+        >>> torch.msort(t)
+        tensor([[-2.0527, -1.1250, -1.2631, -1.1289],
+                [-0.1321, -0.1259, -0.5495,  0.3077],
+                [-0.0881,  0.4370,  0.2275,  1.0284]])
+    """
+    ...
+def mul(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    mul(input, other, *, out=None) -> Tensor
+    
+    Multiplies :attr:`input` by :attr:`other`.
+    
+    
+    .. math::
+        \text{out}_i = \text{input}_i \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to multiply input by.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Examples::
+    
+        >>> a = torch.randn(3)
+        >>> a
+        tensor([ 0.2015, -0.4255,  2.6087])
+        >>> torch.mul(a, 100)
+        tensor([  20.1494,  -42.5491,  260.8663])
+    
+        >>> b = torch.randn(4, 1)
+        >>> b
+        tensor([[ 1.1207],
+                [-0.3137],
+                [ 0.0700],
+                [ 0.8378]])
+        >>> c = torch.randn(1, 4)
+        >>> c
+        tensor([[ 0.5146,  0.1216, -0.5244,  2.2382]])
+        >>> torch.mul(b, c)
+        tensor([[ 0.5767,  0.1363, -0.5877,  2.5083],
+                [-0.1614, -0.0382,  0.1645, -0.7021],
+                [ 0.0360,  0.0085, -0.0367,  0.1567],
+                [ 0.4312,  0.1019, -0.4394,  1.8753]])
+    """
+    ...
+def multinomial(input: Tensor, num_samples: _int, replacement: _bool = False, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    multinomial(input, num_samples, replacement=False, *, generator=None, out=None) -> LongTensor
+    
+    Returns a tensor where each row contains :attr:`num_samples` indices sampled
+    from the multinomial (a stricter definition would be multivariate,
+    refer to torch.distributions.multinomial.Multinomial for more details)
+    probability distribution located in the corresponding row
+    of tensor :attr:`input`.
+    
+    .. note::
+        The rows of :attr:`input` do not need to sum to one (in which case we use
+        the values as weights), but must be non-negative, finite and have
+        a non-zero sum.
+    
+    Indices are ordered from left to right according to when each was sampled
+    (first samples are placed in first column).
+    
+    If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`.
+    
+    If :attr:`input` is a matrix with `m` rows, :attr:`out` is a matrix of shape
+    :math:`(m \times \text{num\_samples})`.
+    
+    If replacement is ``True``, samples are drawn with replacement.
+    
+    If not, they are drawn without replacement, which means that when a
+    sample index is drawn for a row, it cannot be drawn again for that row.
+    
+    .. note::
+        When drawn without replacement, :attr:`num_samples` must not exceed the
+        number of non-zero elements in :attr:`input` (or the minimum number of non-zero
+        elements across the rows of :attr:`input` if it is a matrix).
+    
+    Args:
+        input (Tensor): the input tensor containing probabilities
+        num_samples (int): number of samples to draw
+        replacement (bool, optional): whether to draw with replacement or not
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) # create a tensor of weights
+        >>> torch.multinomial(weights, 2)
+        tensor([1, 2])
+        >>> torch.multinomial(weights, 4) # ERROR!
+        RuntimeError: invalid argument 2: invalid multinomial distribution (with replacement=False,
+        not enough non-negative category to sample) at ../aten/src/TH/generic/THTensorRandom.cpp:320
+        >>> torch.multinomial(weights, 4, replacement=True)
+        tensor([ 2,  1,  1,  1])
+    """
+    ...
+@overload
+def multiply(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    multiply(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.mul`.
+    """
+    ...
+@overload
+def multiply(input: Tensor, other: Union[Number, _complex]) -> Tensor: 
+    r"""
+    multiply(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.mul`.
+    """
+    ...
+def mv(input: Tensor, vec: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    mv(input, vec, *, out=None) -> Tensor
+    
+    Performs a matrix-vector product of the matrix :attr:`input` and the vector
+    :attr:`vec`.
+    
+    If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
+    size :math:`m`, :attr:`out` will be 1-D of size :math:`n`.
+    
+    .. note:: This function does not :ref:`broadcast <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): matrix to be multiplied
+        vec (Tensor): vector to be multiplied
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> mat = torch.randn(2, 3)
+        >>> vec = torch.randn(3)
+        >>> torch.mv(mat, vec)
+        tensor([ 1.0404, -0.6361])
+    """
+    ...
+def mvlgamma(input: Tensor, p: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    mvlgamma(input, p, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.multigammaln`.
+    """
+    ...
+def nan_to_num(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor
+    
+    Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input`
+    with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively.
+    By default, :literal:`NaN`\ s are replaced with zero, positive infinity is replaced with the
+    greatest finite value representable by :attr:`input`'s dtype, and negative infinity
+    is replaced with the least finite value representable by :attr:`input`'s dtype.
+    
+    Args:
+        input (Tensor): the input tensor.
+        nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero.
+        posinf (Number, optional): if a Number, the value to replace positive infinity values with.
+            If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype.
+            Default is None.
+        neginf (Number, optional): if a Number, the value to replace negative infinity values with.
+            If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype.
+            Default is None.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14])
+        >>> torch.nan_to_num(x)
+        tensor([ 0.0000e+00,  3.4028e+38, -3.4028e+38,  3.1400e+00])
+        >>> torch.nan_to_num(x, nan=2.0)
+        tensor([ 2.0000e+00,  3.4028e+38, -3.4028e+38,  3.1400e+00])
+        >>> torch.nan_to_num(x, nan=2.0, posinf=1.0)
+        tensor([ 2.0000e+00,  1.0000e+00, -3.4028e+38,  3.1400e+00])
+    """
+    ...
+def nan_to_num_(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None) -> Tensor: ...
+def nanmean(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nanmean(input, dim=None, keepdim=False, *, dtype=None, out=None) -> Tensor
+    
+    Computes the mean of all `non-NaN` elements along the specified dimensions.
+    
+    This function is identical to :func:`torch.mean` when there are no `NaN` values
+    in the :attr:`input` tensor. In the presence of `NaN`, :func:`torch.mean` will
+    propagate the `NaN` to the output whereas :func:`torch.nanmean` will ignore the
+    `NaN` values (`torch.nanmean(a)` is equivalent to `torch.mean(a[~a.isnan()])`).
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+        out (Tensor, optional): the output tensor.
+    
+    .. seealso::
+    
+        :func:`torch.mean` computes the mean value, propagating `NaN`.
+    
+    Example::
+    
+        >>> x = torch.tensor([[torch.nan, 1, 2], [1, 2, 3]])
+        >>> x.mean()
+        tensor(nan)
+        >>> x.nanmean()
+        tensor(1.8000)
+        >>> x.mean(dim=0)
+        tensor([   nan, 1.5000, 2.5000])
+        >>> x.nanmean(dim=0)
+        tensor([1.0000, 1.5000, 2.5000])
+    
+        # If all elements in the reduced dimensions are NaN then the result is NaN
+        >>> torch.tensor([torch.nan]).nanmean()
+        tensor(nan)
+    """
+    ...
+@overload
+def nanmedian(input: Tensor) -> Tensor: 
+    r"""
+    nanmedian(input) -> Tensor
+    
+    Returns the median of the values in :attr:`input`, ignoring ``NaN`` values.
+    
+    This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`.
+    When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``,
+    while this function will return the median of the non-``NaN`` elements in :attr:`input`.
+    If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, float('nan'), 3, 2])
+        >>> a.median()
+        tensor(nan)
+        >>> a.nanmedian()
+        tensor(2.)
+    
+    .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input`
+    in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values
+    found in the dimension :attr:`dim`.
+    
+    This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has
+    one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the
+    median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second
+                                          tensor, which must have dtype long, with their indices in the dimension
+                                          :attr:`dim` of :attr:`input`.
+    
+    Example::
+    
+        >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]])
+        >>> a
+        tensor([[2., 3., 1.],
+                [nan, 1., nan]])
+        >>> a.median(0)
+        torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1]))
+        >>> a.nanmedian(0)
+        torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0]))
+    """
+    ...
+@overload
+def nanmedian(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: 
+    r"""
+    nanmedian(input) -> Tensor
+    
+    Returns the median of the values in :attr:`input`, ignoring ``NaN`` values.
+    
+    This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`.
+    When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``,
+    while this function will return the median of the non-``NaN`` elements in :attr:`input`.
+    If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, float('nan'), 3, 2])
+        >>> a.median()
+        tensor(nan)
+        >>> a.nanmedian()
+        tensor(2.)
+    
+    .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input`
+    in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values
+    found in the dimension :attr:`dim`.
+    
+    This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has
+    one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the
+    median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second
+                                          tensor, which must have dtype long, with their indices in the dimension
+                                          :attr:`dim` of :attr:`input`.
+    
+    Example::
+    
+        >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]])
+        >>> a
+        tensor([[2., 3., 1.],
+                [nan, 1., nan]])
+        >>> a.median(0)
+        torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1]))
+        >>> a.nanmedian(0)
+        torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0]))
+    """
+    ...
+@overload
+def nanmedian(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: 
+    r"""
+    nanmedian(input) -> Tensor
+    
+    Returns the median of the values in :attr:`input`, ignoring ``NaN`` values.
+    
+    This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`.
+    When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``,
+    while this function will return the median of the non-``NaN`` elements in :attr:`input`.
+    If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, float('nan'), 3, 2])
+        >>> a.median()
+        tensor(nan)
+        >>> a.nanmedian()
+        tensor(2.)
+    
+    .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor)
+       :noindex:
+    
+    Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input`
+    in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values
+    found in the dimension :attr:`dim`.
+    
+    This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has
+    one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the
+    median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second
+                                          tensor, which must have dtype long, with their indices in the dimension
+                                          :attr:`dim` of :attr:`input`.
+    
+    Example::
+    
+        >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]])
+        >>> a
+        tensor([[2., 3., 1.],
+                [nan, 1., nan]])
+        >>> a.median(0)
+        torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1]))
+        >>> a.nanmedian(0)
+        torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0]))
+    """
+    ...
+@overload
+def nanquantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor
+    
+    This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values,
+    computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did
+    not exist. If all values in a reduced row are ``NaN`` then the quantiles for
+    that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1]
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword arguments:
+        interpolation (str): interpolation method to use when the desired quantile lies between two data points.
+                                Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``.
+                                Default is ``linear``.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.tensor([float('nan'), 1, 2])
+        >>> t.quantile(0.5)
+        tensor(nan)
+        >>> t.nanquantile(0.5)
+        tensor(1.5000)
+        >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]])
+        >>> t
+        tensor([[nan, nan],
+                [1., 2.]])
+        >>> t.nanquantile(0.5, dim=0)
+        tensor([1., 2.])
+        >>> t.nanquantile(0.5, dim=1)
+        tensor([   nan, 1.5000])
+    """
+    ...
+@overload
+def nanquantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor
+    
+    This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values,
+    computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did
+    not exist. If all values in a reduced row are ``NaN`` then the quantiles for
+    that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1]
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword arguments:
+        interpolation (str): interpolation method to use when the desired quantile lies between two data points.
+                                Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``.
+                                Default is ``linear``.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.tensor([float('nan'), 1, 2])
+        >>> t.quantile(0.5)
+        tensor(nan)
+        >>> t.nanquantile(0.5)
+        tensor(1.5000)
+        >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]])
+        >>> t
+        tensor([[nan, nan],
+                [1., 2.]])
+        >>> t.nanquantile(0.5, dim=0)
+        tensor([1., 2.])
+        >>> t.nanquantile(0.5, dim=1)
+        tensor([   nan, 1.5000])
+    """
+    ...
+def nansum(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nansum(input, *, dtype=None) -> Tensor
+    
+    Returns the sum of all elements, treating Not a Numbers (NaNs) as zero.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.tensor([1., 2., float('nan'), 4.])
+        >>> torch.nansum(a)
+        tensor(7.)
+    
+    .. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the sum of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero.
+    If :attr:`dim` is a list of dimensions, reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> torch.nansum(torch.tensor([1., float("nan")]))
+        tensor(1.)
+        >>> a = torch.tensor([[1, 2], [3., float("nan")]])
+        >>> torch.nansum(a)
+        tensor(6.)
+        >>> torch.nansum(a, dim=0)
+        tensor([4., 2.])
+        >>> torch.nansum(a, dim=1)
+        tensor([3., 3.])
+    """
+    ...
+@overload
+def narrow(input: Tensor, dim: _int, start: Tensor, length: Union[_int, SymInt]) -> Tensor: 
+    r"""
+    narrow(input, dim, start, length) -> Tensor
+    
+    Returns a new tensor that is a narrowed version of :attr:`input` tensor. The
+    dimension :attr:`dim` is narrowed to the range from :attr:`start` (inclusive) to ``start + length`` (exclusive). The
+    returned tensor and :attr:`input` tensor share the same underlying storage.
+    
+    Args:
+        input (Tensor): the tensor to narrow
+        dim (int): the dimension along which to narrow
+        start (int or Tensor): index of the element to start the narrowed dimension
+            from. Can be negative, which means indexing from the end of `dim`. If
+            `Tensor`, it must be a 0-dim integral `Tensor` (bools not allowed)
+        length (int): length of the narrowed dimension, must be weakly positive
+    
+    Example::
+    
+        >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+        >>> torch.narrow(x, 0, 0, 2)
+        tensor([[ 1,  2,  3],
+                [ 4,  5,  6]])
+        >>> torch.narrow(x, 1, 1, 2)
+        tensor([[ 2,  3],
+                [ 5,  6],
+                [ 8,  9]])
+        >>> torch.narrow(x, -1, torch.tensor(-1), 1)
+        tensor([[3],
+                [6],
+                [9]])
+    """
+    ...
+@overload
+def narrow(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt]) -> Tensor: 
+    r"""
+    narrow(input, dim, start, length) -> Tensor
+    
+    Returns a new tensor that is a narrowed version of :attr:`input` tensor. The
+    dimension :attr:`dim` is narrowed to the range from :attr:`start` (inclusive) to ``start + length`` (exclusive). The
+    returned tensor and :attr:`input` tensor share the same underlying storage.
+    
+    Args:
+        input (Tensor): the tensor to narrow
+        dim (int): the dimension along which to narrow
+        start (int or Tensor): index of the element to start the narrowed dimension
+            from. Can be negative, which means indexing from the end of `dim`. If
+            `Tensor`, it must be a 0-dim integral `Tensor` (bools not allowed)
+        length (int): length of the narrowed dimension, must be weakly positive
+    
+    Example::
+    
+        >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+        >>> torch.narrow(x, 0, 0, 2)
+        tensor([[ 1,  2,  3],
+                [ 4,  5,  6]])
+        >>> torch.narrow(x, 1, 1, 2)
+        tensor([[ 2,  3],
+                [ 5,  6],
+                [ 8,  9]])
+        >>> torch.narrow(x, -1, torch.tensor(-1), 1)
+        tensor([[3],
+                [6],
+                [9]])
+    """
+    ...
+def narrow_copy(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    narrow_copy(input, dim, start, length, *, out=None) -> Tensor
+    
+    Same as :meth:`Tensor.narrow` except this returns a copy rather
+    than shared storage. This is primarily for sparse tensors, which
+    do not have a shared-storage narrow method.
+    
+    Args:
+        input (Tensor): the tensor to narrow
+        dim (int): the dimension along which to narrow
+        start (int): index of the element to start the narrowed dimension from. Can
+            be negative, which means indexing from the end of `dim`
+        length (int): length of the narrowed dimension, must be weakly positive
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+        >>> torch.narrow_copy(x, 0, 0, 2)
+        tensor([[ 1,  2,  3],
+                [ 4,  5,  6]])
+        >>> torch.narrow_copy(x, 1, 1, 2)
+        tensor([[ 2,  3],
+                [ 5,  6],
+                [ 8,  9]])
+        >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2)
+        >>> torch.narrow_copy(s, 0, 0, 1)
+        tensor(indices=tensor([[0, 0],
+                               [0, 1]]),
+               values=tensor([[[0, 1],
+                               [2, 3]],
+    
+                              [[4, 5],
+                               [6, 7]]]),
+               size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
+    
+    .. seealso::
+    
+            :func:`torch.narrow` for a non-copy variant
+    """
+    ...
+def native_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ...
+def native_channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ...
+def native_dropout(input: Tensor, p: _float, train: Optional[_bool]) -> Tuple[Tensor, Tensor]: ...
+def native_group_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], N: Union[_int, SymInt], C: Union[_int, SymInt], HxW: Union[_int, SymInt], group: _int, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ...
+def native_layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor], bias: Optional[Tensor], eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ...
+@overload
+def native_norm(input: Tensor, p: Optional[Union[Number, _complex]], dim: Union[_int, _size], keepdim: _bool, dtype: Optional[_dtype]) -> Tensor: ...
+@overload
+def native_norm(input: Tensor, p: Union[Number, _complex] = 2) -> Tensor: ...
+@overload
+def ne(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ne(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} \neq \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[False, True], [True, False]])
+    """
+    ...
+@overload
+def ne(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ne(input, other, *, out=None) -> Tensor
+    
+    Computes :math:`\text{input} \neq \text{other}` element-wise.
+    
+    
+    The second argument can be a number or a tensor whose shape is
+    :ref:`broadcastable <broadcasting-semantics>` with the first argument.
+    
+    Args:
+        input (Tensor): the tensor to compare
+        other (Tensor or float): the tensor or value to compare
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere
+    
+    Example::
+    
+        >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+        tensor([[False, True], [True, False]])
+    """
+    ...
+def neg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    neg(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the negative of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out} = -1 \times \text{input}
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(5)
+        >>> a
+        tensor([ 0.0090, -0.2262, -0.0682, -0.2866,  0.3940])
+        >>> torch.neg(a)
+        tensor([-0.0090,  0.2262,  0.0682,  0.2866, -0.3940])
+    """
+    ...
+def neg_(input: Tensor) -> Tensor: ...
+def negative(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    negative(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.neg`
+    """
+    ...
+def negative_(input: Tensor) -> Tensor: ...
+def nextafter(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nextafter(input, other, *, out=None) -> Tensor
+    
+    Return the next floating-point value after :attr:`input` towards :attr:`other`, elementwise.
+    
+    The shapes of ``input`` and ``other`` must be
+    :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the first input tensor
+        other (Tensor): the second input tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> eps = torch.finfo(torch.float32).eps
+        >>> torch.nextafter(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 1.0])) == torch.tensor([eps + 1, 2 - eps])
+        tensor([True, True])
+    """
+    ...
+@overload
+def nonzero(input: Tensor, *, as_tuple: Literal[False] = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors
+    
+    .. note::
+        :func:`torch.nonzero(..., as_tuple=False) <torch.nonzero>` (default) returns a
+        2-D tensor where each row is the index for a nonzero value.
+    
+        :func:`torch.nonzero(..., as_tuple=True) <torch.nonzero>` returns a tuple of 1-D
+        index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]``
+        gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor
+        contains nonzero indices for a certain dimension.
+    
+        See below for more details on the two behaviors.
+    
+        When :attr:`input` is on CUDA, :func:`torch.nonzero() <torch.nonzero>` causes
+        host-device synchronization.
+    
+    **When** :attr:`as_tuple` **is** ``False`` **(default)**:
+    
+    Returns a tensor containing the indices of all non-zero elements of
+    :attr:`input`.  Each row in the result contains the indices of a non-zero
+    element in :attr:`input`. The result is sorted lexicographically, with
+    the last index changing the fastest (C-style).
+    
+    If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor
+    :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of
+    non-zero elements in the :attr:`input` tensor.
+    
+    **When** :attr:`as_tuple` **is** ``True``:
+    
+    Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`,
+    each containing the indices (in that dimension) of all non-zero elements of
+    :attr:`input`.
+    
+    If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n`
+    tensors of size :math:`z`, where :math:`z` is the total number of
+    non-zero elements in the :attr:`input` tensor.
+    
+    As a special case, when :attr:`input` has zero dimensions and a nonzero scalar
+    value, it is treated as a one-dimensional tensor with one element.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (LongTensor, optional): the output tensor containing indices
+    
+    Returns:
+        LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output
+        tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for
+        each dimension, containing the indices of each nonzero element along that
+        dimension.
+    
+    Example::
+    
+        >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]))
+        tensor([[ 0],
+                [ 1],
+                [ 2],
+                [ 4]])
+        >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0],
+        ...                             [0.0, 0.4, 0.0, 0.0],
+        ...                             [0.0, 0.0, 1.2, 0.0],
+        ...                             [0.0, 0.0, 0.0,-0.4]]))
+        tensor([[ 0,  0],
+                [ 1,  1],
+                [ 2,  2],
+                [ 3,  3]])
+        >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True)
+        (tensor([0, 1, 2, 4]),)
+        >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0],
+        ...                             [0.0, 0.4, 0.0, 0.0],
+        ...                             [0.0, 0.0, 1.2, 0.0],
+        ...                             [0.0, 0.0, 0.0,-0.4]]), as_tuple=True)
+        (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3]))
+        >>> torch.nonzero(torch.tensor(5), as_tuple=True)
+        (tensor([0]),)
+    """
+    ...
+@overload
+def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: 
+    r"""
+    nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors
+    
+    .. note::
+        :func:`torch.nonzero(..., as_tuple=False) <torch.nonzero>` (default) returns a
+        2-D tensor where each row is the index for a nonzero value.
+    
+        :func:`torch.nonzero(..., as_tuple=True) <torch.nonzero>` returns a tuple of 1-D
+        index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]``
+        gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor
+        contains nonzero indices for a certain dimension.
+    
+        See below for more details on the two behaviors.
+    
+        When :attr:`input` is on CUDA, :func:`torch.nonzero() <torch.nonzero>` causes
+        host-device synchronization.
+    
+    **When** :attr:`as_tuple` **is** ``False`` **(default)**:
+    
+    Returns a tensor containing the indices of all non-zero elements of
+    :attr:`input`.  Each row in the result contains the indices of a non-zero
+    element in :attr:`input`. The result is sorted lexicographically, with
+    the last index changing the fastest (C-style).
+    
+    If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor
+    :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of
+    non-zero elements in the :attr:`input` tensor.
+    
+    **When** :attr:`as_tuple` **is** ``True``:
+    
+    Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`,
+    each containing the indices (in that dimension) of all non-zero elements of
+    :attr:`input`.
+    
+    If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n`
+    tensors of size :math:`z`, where :math:`z` is the total number of
+    non-zero elements in the :attr:`input` tensor.
+    
+    As a special case, when :attr:`input` has zero dimensions and a nonzero scalar
+    value, it is treated as a one-dimensional tensor with one element.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (LongTensor, optional): the output tensor containing indices
+    
+    Returns:
+        LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output
+        tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for
+        each dimension, containing the indices of each nonzero element along that
+        dimension.
+    
+    Example::
+    
+        >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]))
+        tensor([[ 0],
+                [ 1],
+                [ 2],
+                [ 4]])
+        >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0],
+        ...                             [0.0, 0.4, 0.0, 0.0],
+        ...                             [0.0, 0.0, 1.2, 0.0],
+        ...                             [0.0, 0.0, 0.0,-0.4]]))
+        tensor([[ 0,  0],
+                [ 1,  1],
+                [ 2,  2],
+                [ 3,  3]])
+        >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True)
+        (tensor([0, 1, 2, 4]),)
+        >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0],
+        ...                             [0.0, 0.4, 0.0, 0.0],
+        ...                             [0.0, 0.0, 1.2, 0.0],
+        ...                             [0.0, 0.0, 0.0,-0.4]]), as_tuple=True)
+        (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3]))
+        >>> torch.nonzero(torch.tensor(5), as_tuple=True)
+        (tensor([0]),)
+    """
+    ...
+def nonzero_static(input: Tensor, *, size: _int, fill_value: _int = -1, out: Optional[Tensor] = None) -> Tensor: ...
+def norm_except_dim(v: Tensor, pow: _int = 2, dim: _int = 0) -> Tensor: ...
+@overload
+def normal(mean: Tensor, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    normal(mean, std, *, generator=None, out=None) -> Tensor
+    
+    Returns a tensor of random numbers drawn from separate normal distributions
+    whose mean and standard deviation are given.
+    
+    The :attr:`mean` is a tensor with the mean of
+    each output element's normal distribution
+    
+    The :attr:`std` is a tensor with the standard deviation of
+    each output element's normal distribution
+    
+    The shapes of :attr:`mean` and :attr:`std` don't need to match, but the
+    total number of elements in each tensor needs to be the same.
+    
+    .. note:: When the shapes do not match, the shape of :attr:`mean`
+              is used as the shape for the returned output tensor
+    
+    .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes
+              its device with the CPU.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1))
+        tensor([  1.0425,   3.5672,   2.7969,   4.2925,   4.7229,   6.2134,
+                  8.0505,   8.1408,   9.0563,  10.0566])
+    
+    .. function:: normal(mean=0.0, std, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means are shared among all drawn
+    elements.
+    
+    Args:
+        mean (float, optional): the mean for all distributions
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=0.5, std=torch.arange(1., 6.))
+        tensor([-1.2793, -1.0732, -2.0687,  5.1177, -1.2303])
+    
+    .. function:: normal(mean, std=1.0, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the standard deviations are shared among
+    all drawn elements.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (float, optional): the standard deviation for all distributions
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 6.))
+        tensor([ 1.1552,  2.6148,  2.6535,  5.8318,  4.2361])
+    
+    .. function:: normal(mean, std, size, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means and standard deviations are shared
+    among all drawn elements. The resulting tensor has size given by :attr:`size`.
+    
+    Args:
+        mean (float): the mean for all distributions
+        std (float): the standard deviation for all distributions
+        size (int...): a sequence of integers defining the shape of the output tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(2, 3, size=(1, 4))
+        tensor([[-1.3987, -1.9544,  3.6048,  0.7909]])
+    """
+    ...
+@overload
+def normal(mean: Tensor, std: _float = 1, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    normal(mean, std, *, generator=None, out=None) -> Tensor
+    
+    Returns a tensor of random numbers drawn from separate normal distributions
+    whose mean and standard deviation are given.
+    
+    The :attr:`mean` is a tensor with the mean of
+    each output element's normal distribution
+    
+    The :attr:`std` is a tensor with the standard deviation of
+    each output element's normal distribution
+    
+    The shapes of :attr:`mean` and :attr:`std` don't need to match, but the
+    total number of elements in each tensor needs to be the same.
+    
+    .. note:: When the shapes do not match, the shape of :attr:`mean`
+              is used as the shape for the returned output tensor
+    
+    .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes
+              its device with the CPU.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1))
+        tensor([  1.0425,   3.5672,   2.7969,   4.2925,   4.7229,   6.2134,
+                  8.0505,   8.1408,   9.0563,  10.0566])
+    
+    .. function:: normal(mean=0.0, std, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means are shared among all drawn
+    elements.
+    
+    Args:
+        mean (float, optional): the mean for all distributions
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=0.5, std=torch.arange(1., 6.))
+        tensor([-1.2793, -1.0732, -2.0687,  5.1177, -1.2303])
+    
+    .. function:: normal(mean, std=1.0, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the standard deviations are shared among
+    all drawn elements.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (float, optional): the standard deviation for all distributions
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 6.))
+        tensor([ 1.1552,  2.6148,  2.6535,  5.8318,  4.2361])
+    
+    .. function:: normal(mean, std, size, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means and standard deviations are shared
+    among all drawn elements. The resulting tensor has size given by :attr:`size`.
+    
+    Args:
+        mean (float): the mean for all distributions
+        std (float): the standard deviation for all distributions
+        size (int...): a sequence of integers defining the shape of the output tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(2, 3, size=(1, 4))
+        tensor([[-1.3987, -1.9544,  3.6048,  0.7909]])
+    """
+    ...
+@overload
+def normal(mean: _float, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    normal(mean, std, *, generator=None, out=None) -> Tensor
+    
+    Returns a tensor of random numbers drawn from separate normal distributions
+    whose mean and standard deviation are given.
+    
+    The :attr:`mean` is a tensor with the mean of
+    each output element's normal distribution
+    
+    The :attr:`std` is a tensor with the standard deviation of
+    each output element's normal distribution
+    
+    The shapes of :attr:`mean` and :attr:`std` don't need to match, but the
+    total number of elements in each tensor needs to be the same.
+    
+    .. note:: When the shapes do not match, the shape of :attr:`mean`
+              is used as the shape for the returned output tensor
+    
+    .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes
+              its device with the CPU.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1))
+        tensor([  1.0425,   3.5672,   2.7969,   4.2925,   4.7229,   6.2134,
+                  8.0505,   8.1408,   9.0563,  10.0566])
+    
+    .. function:: normal(mean=0.0, std, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means are shared among all drawn
+    elements.
+    
+    Args:
+        mean (float, optional): the mean for all distributions
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=0.5, std=torch.arange(1., 6.))
+        tensor([-1.2793, -1.0732, -2.0687,  5.1177, -1.2303])
+    
+    .. function:: normal(mean, std=1.0, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the standard deviations are shared among
+    all drawn elements.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (float, optional): the standard deviation for all distributions
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 6.))
+        tensor([ 1.1552,  2.6148,  2.6535,  5.8318,  4.2361])
+    
+    .. function:: normal(mean, std, size, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means and standard deviations are shared
+    among all drawn elements. The resulting tensor has size given by :attr:`size`.
+    
+    Args:
+        mean (float): the mean for all distributions
+        std (float): the standard deviation for all distributions
+        size (int...): a sequence of integers defining the shape of the output tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(2, 3, size=(1, 4))
+        tensor([[-1.3987, -1.9544,  3.6048,  0.7909]])
+    """
+    ...
+@overload
+def normal(mean: _float, std: _float, size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    normal(mean, std, *, generator=None, out=None) -> Tensor
+    
+    Returns a tensor of random numbers drawn from separate normal distributions
+    whose mean and standard deviation are given.
+    
+    The :attr:`mean` is a tensor with the mean of
+    each output element's normal distribution
+    
+    The :attr:`std` is a tensor with the standard deviation of
+    each output element's normal distribution
+    
+    The shapes of :attr:`mean` and :attr:`std` don't need to match, but the
+    total number of elements in each tensor needs to be the same.
+    
+    .. note:: When the shapes do not match, the shape of :attr:`mean`
+              is used as the shape for the returned output tensor
+    
+    .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes
+              its device with the CPU.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1))
+        tensor([  1.0425,   3.5672,   2.7969,   4.2925,   4.7229,   6.2134,
+                  8.0505,   8.1408,   9.0563,  10.0566])
+    
+    .. function:: normal(mean=0.0, std, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means are shared among all drawn
+    elements.
+    
+    Args:
+        mean (float, optional): the mean for all distributions
+        std (Tensor): the tensor of per-element standard deviations
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(mean=0.5, std=torch.arange(1., 6.))
+        tensor([-1.2793, -1.0732, -2.0687,  5.1177, -1.2303])
+    
+    .. function:: normal(mean, std=1.0, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the standard deviations are shared among
+    all drawn elements.
+    
+    Args:
+        mean (Tensor): the tensor of per-element means
+        std (float, optional): the standard deviation for all distributions
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor
+    
+    Example::
+    
+        >>> torch.normal(mean=torch.arange(1., 6.))
+        tensor([ 1.1552,  2.6148,  2.6535,  5.8318,  4.2361])
+    
+    .. function:: normal(mean, std, size, *, out=None) -> Tensor
+       :noindex:
+    
+    Similar to the function above, but the means and standard deviations are shared
+    among all drawn elements. The resulting tensor has size given by :attr:`size`.
+    
+    Args:
+        mean (float): the mean for all distributions
+        std (float): the standard deviation for all distributions
+        size (int...): a sequence of integers defining the shape of the output tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.normal(2, 3, size=(1, 4))
+        tensor([[-1.3987, -1.9544,  3.6048,  0.7909]])
+    """
+    ...
+@overload
+def not_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    not_equal(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.ne`.
+    """
+    ...
+@overload
+def not_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    not_equal(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.ne`.
+    """
+    ...
+@overload
+def nuclear_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def nuclear_norm(input: Tensor, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ...
+def numel(self: Tensor) -> _int: 
+    r"""
+    numel(input) -> int
+    
+    Returns the total number of elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 2, 3, 4, 5)
+        >>> torch.numel(a)
+        120
+        >>> a = torch.zeros(4,4)
+        >>> torch.numel(a)
+        16
+    """
+    ...
+@overload
+def ones(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `1`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.ones(2, 3)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+    
+        >>> torch.ones(5)
+        tensor([ 1.,  1.,  1.,  1.,  1.])
+    """
+    ...
+@overload
+def ones(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `1`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.ones(2, 3)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+    
+        >>> torch.ones(5)
+        tensor([ 1.,  1.,  1.,  1.,  1.])
+    """
+    ...
+@overload
+def ones(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `1`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.ones(2, 3)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+    
+        >>> torch.ones(5)
+        tensor([ 1.,  1.,  1.,  1.,  1.])
+    """
+    ...
+@overload
+def ones(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `1`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.ones(2, 3)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+    
+        >>> torch.ones(5)
+        tensor([ 1.,  1.,  1.,  1.,  1.])
+    """
+    ...
+def ones_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor filled with the scalar value `1`, with the same size as
+    :attr:`input`. ``torch.ones_like(input)`` is equivalent to
+    ``torch.ones(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``.
+    
+    .. warning::
+        As of 0.4, this function does not support an :attr:`out` keyword. As an alternative,
+        the old ``torch.ones_like(input, out=output)`` is equivalent to
+        ``torch.ones(input.size(), out=output)``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+    
+    Keyword arguments:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    
+    Example::
+    
+        >>> input = torch.empty(2, 3)
+        >>> torch.ones_like(input)
+        tensor([[ 1.,  1.,  1.],
+                [ 1.,  1.,  1.]])
+    """
+    ...
+def orgqr(input: Tensor, input2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    orgqr(input, tau) -> Tensor
+    
+    Alias for :func:`torch.linalg.householder_product`.
+    """
+    ...
+def ormqr(input: Tensor, input2: Tensor, input3: Tensor, left: _bool = True, transpose: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    ormqr(input, tau, other, left=True, transpose=False, *, out=None) -> Tensor
+    
+    Computes the matrix-matrix multiplication of a product of Householder matrices with a general matrix.
+    
+    Multiplies a :math:`m \times n` matrix `C` (given by :attr:`other`) with a matrix `Q`,
+    where `Q` is represented using Householder reflectors `(input, tau)`.
+    See `Representation of Orthogonal or Unitary Matrices`_ for further details.
+    
+    If :attr:`left` is `True` then `op(Q)` times `C` is computed, otherwise the result is `C` times `op(Q)`.
+    When :attr:`left` is `True`, the implicit matrix `Q` has size :math:`m \times m`.
+    It has size :math:`n \times n` otherwise.
+    If :attr:`transpose` is `True` then `op` is the conjugate transpose operation, otherwise it's a no-op.
+    
+    Supports inputs of float, double, cfloat and cdouble dtypes.
+    Also supports batched inputs, and, if the input is batched, the output is batched with the same dimensions.
+    
+    .. seealso::
+            :func:`torch.geqrf` can be used to form the Householder representation `(input, tau)` of matrix `Q`
+            from the QR decomposition.
+    
+    .. note::
+            This function supports backward, but it is only fast when
+            ``(input, tau)`` do not require gradients and/or
+            ``tau.size(-1)`` is very small.
+    
+    Args:
+        input (Tensor): tensor of shape `(*, mn, k)` where `*` is zero or more batch dimensions
+                        and `mn` equals `m` or `n` depending on :attr:`left`.
+        tau (Tensor): tensor of shape `(*, min(mn, k))` where `*` is zero or more batch dimensions.
+        other (Tensor): tensor of shape `(*, m, n)` where `*` is zero or more batch dimensions.
+        left (bool): controls the order of multiplication.
+        transpose (bool): controls whether the matrix `Q` is conjugate transposed or not.
+    
+    Keyword args:
+        out (Tensor, optional): the output Tensor. Ignored if `None`. Default: `None`.
+    
+    .. _Representation of Orthogonal or Unitary Matrices:
+        https://www.netlib.org/lapack/lug/node128.html
+    """
+    ...
+def outer(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    outer(input, vec2, *, out=None) -> Tensor
+    
+    Outer product of :attr:`input` and :attr:`vec2`.
+    If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of
+    size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`.
+    
+    .. note:: This function does not :ref:`broadcast <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): 1-D input vector
+        vec2 (Tensor): 1-D input vector
+    
+    Keyword args:
+        out (Tensor, optional): optional output matrix
+    
+    Example::
+    
+        >>> v1 = torch.arange(1., 5.)
+        >>> v2 = torch.arange(1., 4.)
+        >>> torch.outer(v1, v2)
+        tensor([[  1.,   2.,   3.],
+                [  2.,   4.,   6.],
+                [  3.,   6.,   9.],
+                [  4.,   8.,  12.]])
+    """
+    ...
+def pairwise_distance(x1: Tensor, x2: Tensor, p: _float = 2, eps: _float = 1e-06, keepdim: _bool = False) -> Tensor: ...
+def pdist(input: Tensor, p: _float = 2) -> Tensor: ...
+def permute(input: Tensor, dims: _size) -> Tensor: 
+    r"""
+    permute(input, dims) -> Tensor
+    
+    Returns a view of the original tensor :attr:`input` with its dimensions permuted.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dims (tuple of int): The desired ordering of dimensions
+    
+    Example:
+        >>> x = torch.randn(2, 3, 5)
+        >>> x.size()
+        torch.Size([2, 3, 5])
+        >>> torch.permute(x, (2, 0, 1)).size()
+        torch.Size([5, 2, 3])
+    """
+    ...
+def permute_copy(input: Tensor, dims: _size, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.permute`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def pinverse(input: Tensor, rcond: _float = 1e-15) -> Tensor: 
+    r"""
+    pinverse(input, rcond=1e-15) -> Tensor
+    
+    Alias for :func:`torch.linalg.pinv`.
+    """
+    ...
+def pixel_shuffle(input: Tensor, upscale_factor: _int) -> Tensor: ...
+def pixel_unshuffle(input: Tensor, downscale_factor: _int) -> Tensor: ...
+def poisson(input: Tensor, generator: Optional[Generator] = None) -> Tensor: 
+    r"""
+    poisson(input, generator=None) -> Tensor
+    
+    Returns a tensor of the same size as :attr:`input` with each element
+    sampled from a Poisson distribution with rate parameter given by the corresponding
+    element in :attr:`input` i.e.,
+    
+    .. math::
+        \text{out}_i \sim \text{Poisson}(\text{input}_i)
+    
+    :attr:`input` must be non-negative.
+    
+    Args:
+        input (Tensor): the input tensor containing the rates of the Poisson distribution
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+    
+    Example::
+    
+        >>> rates = torch.rand(4, 4) * 5  # rate parameter between 0 and 5
+        >>> torch.poisson(rates)
+        tensor([[9., 1., 3., 5.],
+                [8., 6., 6., 0.],
+                [0., 4., 5., 3.],
+                [2., 1., 4., 2.]])
+    """
+    ...
+def poisson_nll_loss(input: Tensor, target: Tensor, log_input: _bool, full: _bool, eps: _float, reduction: _int) -> Tensor: ...
+def polar(abs: Tensor, angle: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    polar(abs, angle, *, out=None) -> Tensor
+    
+    Constructs a complex tensor whose elements are Cartesian coordinates
+    corresponding to the polar coordinates with absolute value :attr:`abs` and angle
+    :attr:`angle`.
+    
+    .. math::
+        \text{out} = \text{abs} \cdot \cos(\text{angle}) + \text{abs} \cdot \sin(\text{angle}) \cdot j
+    
+    .. note::
+        `torch.polar` is similar to
+        `std::polar <https://en.cppreference.com/w/cpp/numeric/complex/polar>`_
+        and does not compute the polar decomposition
+        of a complex tensor like Python's `cmath.polar` and SciPy's `linalg.polar` do.
+        The behavior of this function is undefined if `abs` is negative or NaN, or if `angle` is
+        infinite.
+    
+    
+    Args:
+        abs (Tensor): The absolute value of the complex tensor. Must be float or double.
+        angle (Tensor): The angle of the complex tensor. Must be same dtype as
+            :attr:`abs`.
+    
+    Keyword args:
+        out (Tensor): If the inputs are ``torch.float32``, must be
+            ``torch.complex64``. If the inputs are ``torch.float64``, must be
+            ``torch.complex128``.
+    
+    Example::
+    
+        >>> import numpy as np
+        >>> abs = torch.tensor([1, 2], dtype=torch.float64)
+        >>> angle = torch.tensor([np.pi / 2, 5 * np.pi / 4], dtype=torch.float64)
+        >>> z = torch.polar(abs, angle)
+        >>> z
+        tensor([(0.0000+1.0000j), (-1.4142-1.4142j)], dtype=torch.complex128)
+    """
+    ...
+def polygamma(n: _int, input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    polygamma(n, input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.polygamma`.
+    """
+    ...
+def positive(input: Tensor) -> Tensor: 
+    r"""
+    positive(input) -> Tensor
+    
+    Returns :attr:`input`.
+    Throws a runtime error if :attr:`input` is a bool tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> t = torch.randn(5)
+        >>> t
+        tensor([ 0.0090, -0.2262, -0.0682, -0.2866,  0.3940])
+        >>> torch.positive(t)
+        tensor([ 0.0090, -0.2262, -0.0682, -0.2866,  0.3940])
+    """
+    ...
+@overload
+def pow(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    pow(input, exponent, *, out=None) -> Tensor
+    
+    Takes the power of each element in :attr:`input` with :attr:`exponent` and
+    returns a tensor with the result.
+    
+    :attr:`exponent` can be either a single ``float`` number or a `Tensor`
+    with the same number of elements as :attr:`input`.
+    
+    When :attr:`exponent` is a scalar value, the operation applied is:
+    
+    .. math::
+        \text{out}_i = x_i ^ \text{exponent}
+    
+    When :attr:`exponent` is a tensor, the operation applied is:
+    
+    .. math::
+        \text{out}_i = x_i ^ {\text{exponent}_i}
+    
+    When :attr:`exponent` is a tensor, the shapes of :attr:`input`
+    and :attr:`exponent` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        exponent (float or tensor): the exponent value
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.4331,  1.2475,  0.6834, -0.2791])
+        >>> torch.pow(a, 2)
+        tensor([ 0.1875,  1.5561,  0.4670,  0.0779])
+        >>> exp = torch.arange(1., 5.)
+    
+        >>> a = torch.arange(1., 5.)
+        >>> a
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> exp
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> torch.pow(a, exp)
+        tensor([   1.,    4.,   27.,  256.])
+    
+    .. function:: pow(self, exponent, *, out=None) -> Tensor
+       :noindex:
+    
+    :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor.
+    The returned tensor :attr:`out` is of the same shape as :attr:`exponent`.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{self} ^ {\text{exponent}_i}
+    
+    Args:
+        self (float): the scalar base value for the power operation
+        exponent (Tensor): the exponent tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> exp = torch.arange(1., 5.)
+        >>> base = 2
+        >>> torch.pow(base, exp)
+        tensor([  2.,   4.,   8.,  16.])
+    """
+    ...
+@overload
+def pow(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    pow(input, exponent, *, out=None) -> Tensor
+    
+    Takes the power of each element in :attr:`input` with :attr:`exponent` and
+    returns a tensor with the result.
+    
+    :attr:`exponent` can be either a single ``float`` number or a `Tensor`
+    with the same number of elements as :attr:`input`.
+    
+    When :attr:`exponent` is a scalar value, the operation applied is:
+    
+    .. math::
+        \text{out}_i = x_i ^ \text{exponent}
+    
+    When :attr:`exponent` is a tensor, the operation applied is:
+    
+    .. math::
+        \text{out}_i = x_i ^ {\text{exponent}_i}
+    
+    When :attr:`exponent` is a tensor, the shapes of :attr:`input`
+    and :attr:`exponent` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        exponent (float or tensor): the exponent value
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.4331,  1.2475,  0.6834, -0.2791])
+        >>> torch.pow(a, 2)
+        tensor([ 0.1875,  1.5561,  0.4670,  0.0779])
+        >>> exp = torch.arange(1., 5.)
+    
+        >>> a = torch.arange(1., 5.)
+        >>> a
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> exp
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> torch.pow(a, exp)
+        tensor([   1.,    4.,   27.,  256.])
+    
+    .. function:: pow(self, exponent, *, out=None) -> Tensor
+       :noindex:
+    
+    :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor.
+    The returned tensor :attr:`out` is of the same shape as :attr:`exponent`.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{self} ^ {\text{exponent}_i}
+    
+    Args:
+        self (float): the scalar base value for the power operation
+        exponent (Tensor): the exponent tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> exp = torch.arange(1., 5.)
+        >>> base = 2
+        >>> torch.pow(base, exp)
+        tensor([  2.,   4.,   8.,  16.])
+    """
+    ...
+@overload
+def pow(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    pow(input, exponent, *, out=None) -> Tensor
+    
+    Takes the power of each element in :attr:`input` with :attr:`exponent` and
+    returns a tensor with the result.
+    
+    :attr:`exponent` can be either a single ``float`` number or a `Tensor`
+    with the same number of elements as :attr:`input`.
+    
+    When :attr:`exponent` is a scalar value, the operation applied is:
+    
+    .. math::
+        \text{out}_i = x_i ^ \text{exponent}
+    
+    When :attr:`exponent` is a tensor, the operation applied is:
+    
+    .. math::
+        \text{out}_i = x_i ^ {\text{exponent}_i}
+    
+    When :attr:`exponent` is a tensor, the shapes of :attr:`input`
+    and :attr:`exponent` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        exponent (float or tensor): the exponent value
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.4331,  1.2475,  0.6834, -0.2791])
+        >>> torch.pow(a, 2)
+        tensor([ 0.1875,  1.5561,  0.4670,  0.0779])
+        >>> exp = torch.arange(1., 5.)
+    
+        >>> a = torch.arange(1., 5.)
+        >>> a
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> exp
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> torch.pow(a, exp)
+        tensor([   1.,    4.,   27.,  256.])
+    
+    .. function:: pow(self, exponent, *, out=None) -> Tensor
+       :noindex:
+    
+    :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor.
+    The returned tensor :attr:`out` is of the same shape as :attr:`exponent`.
+    
+    The operation applied is:
+    
+    .. math::
+        \text{out}_i = \text{self} ^ {\text{exponent}_i}
+    
+    Args:
+        self (float): the scalar base value for the power operation
+        exponent (Tensor): the exponent tensor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> exp = torch.arange(1., 5.)
+        >>> base = 2
+        >>> torch.pow(base, exp)
+        tensor([  2.,   4.,   8.,  16.])
+    """
+    ...
+def prelu(input: Tensor, weight: Tensor) -> Tensor: ...
+@overload
+def prod(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: 
+    r"""
+    prod(input, *, dtype=None) -> Tensor
+    
+    Returns the product of all elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[-0.8020,  0.5428, -1.5854]])
+        >>> torch.prod(a)
+        tensor(0.6902)
+    
+    .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the product of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`.
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensor having 1 fewer dimension than :attr:`input`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2)
+        >>> a
+        tensor([[ 0.5261, -0.3837],
+                [ 1.1857, -0.2498],
+                [-1.1646,  0.0705],
+                [ 1.1131, -1.0629]])
+        >>> torch.prod(a, 1)
+        tensor([-0.2018, -0.2962, -0.0821, -1.1831])
+    """
+    ...
+@overload
+def prod(input: Tensor, dim: _int, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    prod(input, *, dtype=None) -> Tensor
+    
+    Returns the product of all elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[-0.8020,  0.5428, -1.5854]])
+        >>> torch.prod(a)
+        tensor(0.6902)
+    
+    .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the product of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`.
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensor having 1 fewer dimension than :attr:`input`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2)
+        >>> a
+        tensor([[ 0.5261, -0.3837],
+                [ 1.1857, -0.2498],
+                [-1.1646,  0.0705],
+                [ 1.1131, -1.0629]])
+        >>> torch.prod(a, 1)
+        tensor([-0.2018, -0.2962, -0.0821, -1.1831])
+    """
+    ...
+@overload
+def prod(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    prod(input, *, dtype=None) -> Tensor
+    
+    Returns the product of all elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[-0.8020,  0.5428, -1.5854]])
+        >>> torch.prod(a)
+        tensor(0.6902)
+    
+    .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the product of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`.
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in
+    the output tensor having 1 fewer dimension than :attr:`input`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 2)
+        >>> a
+        tensor([[ 0.5261, -0.3837],
+                [ 1.1857, -0.2498],
+                [-1.1646,  0.0705],
+                [ 1.1131, -1.0629]])
+        >>> torch.prod(a, 1)
+        tensor([-0.2018, -0.2962, -0.0821, -1.1831])
+    """
+    ...
+def promote_types(type1: _dtype, type2: _dtype) -> _dtype: 
+    r"""
+    promote_types(type1, type2) -> dtype
+    
+    Returns the :class:`torch.dtype` with the smallest size and scalar kind that is
+    not smaller nor of lower kind than either `type1` or `type2`. See type promotion
+    :ref:`documentation <type-promotion-doc>` for more information on the type
+    promotion logic.
+    
+    Args:
+        type1 (:class:`torch.dtype`)
+        type2 (:class:`torch.dtype`)
+    
+    Example::
+    
+        >>> torch.promote_types(torch.int32, torch.float32)
+        torch.float32
+        >>> torch.promote_types(torch.uint8, torch.long)
+        torch.long
+    """
+    ...
+def put(input: Tensor, index: Tensor, source: Tensor, accumulate: _bool = False) -> Tensor: ...
+def q_per_channel_axis(input: Tensor) -> _int: ...
+def q_per_channel_scales(input: Tensor) -> Tensor: ...
+def q_per_channel_zero_points(input: Tensor) -> Tensor: ...
+def q_scale(input: Tensor) -> _float: ...
+def q_zero_point(input: Tensor) -> _int: ...
+def qr(input: Tensor, some: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.qr: 
+    r"""
+    qr(input, some=True, *, out=None) -> (Tensor, Tensor)
+    
+    Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`,
+    and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R`
+    with :math:`Q` being an orthogonal matrix or batch of orthogonal matrices and
+    :math:`R` being an upper triangular matrix or batch of upper triangular matrices.
+    
+    If :attr:`some` is ``True``, then this function returns the thin (reduced) QR factorization.
+    Otherwise, if :attr:`some` is ``False``, this function returns the complete QR factorization.
+    
+    .. warning::
+    
+        :func:`torch.qr` is deprecated in favor of :func:`torch.linalg.qr`
+        and will be removed in a future PyTorch release. The boolean parameter :attr:`some` has been
+        replaced with a string parameter :attr:`mode`.
+    
+        ``Q, R = torch.qr(A)`` should be replaced with
+    
+        .. code:: python
+    
+            Q, R = torch.linalg.qr(A)
+    
+        ``Q, R = torch.qr(A, some=False)`` should be replaced with
+    
+        .. code:: python
+    
+            Q, R = torch.linalg.qr(A, mode="complete")
+    
+    .. warning::
+              If you plan to backpropagate through QR, note that the current backward implementation
+              is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))`
+              columns of :attr:`input` are linearly independent.
+              This behavior will probably change once QR supports pivoting.
+    
+    .. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs,
+              and may produce different (valid) decompositions on different device types
+              or different platforms.
+    
+    Args:
+        input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more
+                    batch dimensions consisting of matrices of dimension :math:`m \times n`.
+        some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for
+                    complete QR decomposition. If `k = min(m, n)` then:
+    
+                      * ``some=True`` : returns `(Q, R)` with dimensions (m, k), (k, n) (default)
+    
+                      * ``some=False``: returns `(Q, R)` with dimensions (m, m), (m, n)
+    
+    Keyword args:
+        out (tuple, optional): tuple of `Q` and `R` tensors.
+                    The dimensions of `Q` and `R` are detailed in the description of :attr:`some` above.
+    
+    Example::
+    
+        >>> a = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]])
+        >>> q, r = torch.qr(a)
+        >>> q
+        tensor([[-0.8571,  0.3943,  0.3314],
+                [-0.4286, -0.9029, -0.0343],
+                [ 0.2857, -0.1714,  0.9429]])
+        >>> r
+        tensor([[ -14.0000,  -21.0000,   14.0000],
+                [   0.0000, -175.0000,   70.0000],
+                [   0.0000,    0.0000,  -35.0000]])
+        >>> torch.mm(q, r).round()
+        tensor([[  12.,  -51.,    4.],
+                [   6.,  167.,  -68.],
+                [  -4.,   24.,  -41.]])
+        >>> torch.mm(q.t(), q).round()
+        tensor([[ 1.,  0.,  0.],
+                [ 0.,  1., -0.],
+                [ 0., -0.,  1.]])
+        >>> a = torch.randn(3, 4, 5)
+        >>> q, r = torch.qr(a, some=False)
+        >>> torch.allclose(torch.matmul(q, r), a)
+        True
+        >>> torch.allclose(torch.matmul(q.mT, q), torch.eye(4))
+        True
+    """
+    ...
+@overload
+def quantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor
+    
+    Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`.
+    
+    To compute the quantile, we map q in [0, 1] to the range of indices [0, n - 1] (where n is the
+    number of elements being reduced) to find the location of the quantile in the sorted input.
+    If the quantile lies between two data points ``a < b`` with indices ``i`` and ``j`` in the
+    sorted order, the result is computed according to the given :attr:`interpolation` method as follows:
+    
+    - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index.
+    - ``lower``: ``a``.
+    - ``higher``: ``b``.
+    - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions).
+    - ``midpoint``: ``(a + b) / 2``.
+    
+    If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size
+    equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction.
+    
+    .. note::
+        By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation.
+    
+    Args:
+        input (Tensor): the input tensor.
+        q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1].
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword arguments:
+        interpolation (str): interpolation method to use when the desired quantile lies between two data points.
+                                Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``.
+                                Default is ``linear``.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(2, 3)
+        >>> a
+        tensor([[ 0.0795, -1.2117,  0.9765],
+                [ 1.1707,  0.6706,  0.4884]])
+        >>> q = torch.tensor([0.25, 0.5, 0.75])
+        >>> torch.quantile(a, q, dim=1, keepdim=True)
+        tensor([[[-0.5661],
+                [ 0.5795]],
+    
+                [[ 0.0795],
+                [ 0.6706]],
+    
+                [[ 0.5280],
+                [ 0.9206]]])
+        >>> torch.quantile(a, q, dim=1, keepdim=True).shape
+        torch.Size([3, 2, 1])
+        >>> a = torch.arange(4.)
+        >>> a
+        tensor([0., 1., 2., 3.])
+        >>> torch.quantile(a, 0.6, interpolation='linear')
+        tensor(1.8000)
+        >>> torch.quantile(a, 0.6, interpolation='lower')
+        tensor(1.)
+        >>> torch.quantile(a, 0.6, interpolation='higher')
+        tensor(2.)
+        >>> torch.quantile(a, 0.6, interpolation='midpoint')
+        tensor(1.5000)
+        >>> torch.quantile(a, 0.6, interpolation='nearest')
+        tensor(2.)
+        >>> torch.quantile(a, 0.4, interpolation='nearest')
+        tensor(1.)
+    """
+    ...
+@overload
+def quantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor
+    
+    Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`.
+    
+    To compute the quantile, we map q in [0, 1] to the range of indices [0, n - 1] (where n is the
+    number of elements being reduced) to find the location of the quantile in the sorted input.
+    If the quantile lies between two data points ``a < b`` with indices ``i`` and ``j`` in the
+    sorted order, the result is computed according to the given :attr:`interpolation` method as follows:
+    
+    - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index.
+    - ``lower``: ``a``.
+    - ``higher``: ``b``.
+    - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions).
+    - ``midpoint``: ``(a + b) / 2``.
+    
+    If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size
+    equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction.
+    
+    .. note::
+        By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation.
+    
+    Args:
+        input (Tensor): the input tensor.
+        q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1].
+        dim (int): the dimension to reduce.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword arguments:
+        interpolation (str): interpolation method to use when the desired quantile lies between two data points.
+                                Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``.
+                                Default is ``linear``.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(2, 3)
+        >>> a
+        tensor([[ 0.0795, -1.2117,  0.9765],
+                [ 1.1707,  0.6706,  0.4884]])
+        >>> q = torch.tensor([0.25, 0.5, 0.75])
+        >>> torch.quantile(a, q, dim=1, keepdim=True)
+        tensor([[[-0.5661],
+                [ 0.5795]],
+    
+                [[ 0.0795],
+                [ 0.6706]],
+    
+                [[ 0.5280],
+                [ 0.9206]]])
+        >>> torch.quantile(a, q, dim=1, keepdim=True).shape
+        torch.Size([3, 2, 1])
+        >>> a = torch.arange(4.)
+        >>> a
+        tensor([0., 1., 2., 3.])
+        >>> torch.quantile(a, 0.6, interpolation='linear')
+        tensor(1.8000)
+        >>> torch.quantile(a, 0.6, interpolation='lower')
+        tensor(1.)
+        >>> torch.quantile(a, 0.6, interpolation='higher')
+        tensor(2.)
+        >>> torch.quantile(a, 0.6, interpolation='midpoint')
+        tensor(1.5000)
+        >>> torch.quantile(a, 0.6, interpolation='nearest')
+        tensor(2.)
+        >>> torch.quantile(a, 0.4, interpolation='nearest')
+        tensor(1.)
+    """
+    ...
+def quantize_per_channel(input: Tensor, scales: Tensor, zero_points: Tensor, axis: _int, dtype: _dtype) -> Tensor: 
+    r"""
+    quantize_per_channel(input, scales, zero_points, axis, dtype) -> Tensor
+    
+    Converts a float tensor to a per-channel quantized tensor with given scales and zero points.
+    
+    Arguments:
+        input (Tensor): float tensor to quantize
+        scales (Tensor): float 1D tensor of scales to use, size should match ``input.size(axis)``
+        zero_points (Tensor): integer 1D tensor of offsets to use, size should match ``input.size(axis)``
+        axis (int): dimension on which to apply per-channel quantization
+        dtype (:class:`torch.dtype`): the desired data type of returned tensor.
+            Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32``
+    
+    Returns:
+        Tensor: A newly quantized tensor
+    
+    Example::
+    
+        >>> x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]])
+        >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8)
+        tensor([[-1.,  0.],
+                [ 1.,  2.]], size=(2, 2), dtype=torch.quint8,
+               quantization_scheme=torch.per_channel_affine,
+               scale=tensor([0.1000, 0.0100], dtype=torch.float64),
+               zero_point=tensor([10,  0]), axis=0)
+        >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8).int_repr()
+        tensor([[  0,  10],
+                [100, 200]], dtype=torch.uint8)
+    """
+    ...
+@overload
+def quantize_per_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, dtype: _dtype) -> Tensor: 
+    r"""
+    quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor
+    
+    Converts a float tensor to a quantized tensor with given scale and zero point.
+    
+    Arguments:
+        input (Tensor): float tensor or list of tensors to quantize
+        scale (float or Tensor): scale to apply in quantization formula
+        zero_point (int or Tensor): offset in integer value that maps to float zero
+        dtype (:class:`torch.dtype`): the desired data type of returned tensor.
+            Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32``
+    
+    Returns:
+        Tensor: A newly quantized tensor or list of quantized tensors.
+    
+    Example::
+    
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+               quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10)
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr()
+        tensor([ 0, 10, 20, 30], dtype=torch.uint8)
+        >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])],
+        ...     torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8)
+        (tensor([-1.,  0.], size=(2,), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10),
+            tensor([-2.,  2.], size=(2,), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20))
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+           quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10)
+    """
+    ...
+@overload
+def quantize_per_tensor(input: Tensor, scale: _float, zero_point: _int, dtype: _dtype) -> Tensor: 
+    r"""
+    quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor
+    
+    Converts a float tensor to a quantized tensor with given scale and zero point.
+    
+    Arguments:
+        input (Tensor): float tensor or list of tensors to quantize
+        scale (float or Tensor): scale to apply in quantization formula
+        zero_point (int or Tensor): offset in integer value that maps to float zero
+        dtype (:class:`torch.dtype`): the desired data type of returned tensor.
+            Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32``
+    
+    Returns:
+        Tensor: A newly quantized tensor or list of quantized tensors.
+    
+    Example::
+    
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+               quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10)
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr()
+        tensor([ 0, 10, 20, 30], dtype=torch.uint8)
+        >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])],
+        ...     torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8)
+        (tensor([-1.,  0.], size=(2,), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10),
+            tensor([-2.,  2.], size=(2,), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20))
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+           quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10)
+    """
+    ...
+@overload
+def quantize_per_tensor(tensors: Union[Tuple[Tensor, ...], List[Tensor]], scales: Tensor, zero_points: Tensor, dtype: _dtype) -> Tuple[Tensor, ...]: 
+    r"""
+    quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor
+    
+    Converts a float tensor to a quantized tensor with given scale and zero point.
+    
+    Arguments:
+        input (Tensor): float tensor or list of tensors to quantize
+        scale (float or Tensor): scale to apply in quantization formula
+        zero_point (int or Tensor): offset in integer value that maps to float zero
+        dtype (:class:`torch.dtype`): the desired data type of returned tensor.
+            Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32``
+    
+    Returns:
+        Tensor: A newly quantized tensor or list of quantized tensors.
+    
+    Example::
+    
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+               quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10)
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr()
+        tensor([ 0, 10, 20, 30], dtype=torch.uint8)
+        >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])],
+        ...     torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8)
+        (tensor([-1.,  0.], size=(2,), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10),
+            tensor([-2.,  2.], size=(2,), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20))
+        >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+           quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10)
+    """
+    ...
+def quantize_per_tensor_dynamic(input: Tensor, dtype: _dtype, reduce_range: _bool) -> Tensor: 
+    r"""
+    quantize_per_tensor_dynamic(input, dtype, reduce_range) -> Tensor
+    
+    Converts a float tensor to a quantized tensor with scale and zero_point calculated
+    dynamically based on the input.
+    
+    Arguments:
+        input (Tensor): float tensor or list of tensors to quantize
+        dtype (:class:`torch.dtype`): the desired data type of returned tensor.
+            Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``
+        reduce_range (bool): a flag to indicate whether to reduce the range of quantized
+            data by 1 bit; this is required to avoid instruction overflow on some hardware
+    
+    Returns:
+        Tensor: A newly (dynamically) quantized tensor
+    
+    Example::
+    
+        >>> t = torch.quantize_per_tensor_dynamic(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.quint8, False)
+        >>> print(t)
+        tensor([-1.,  0.,  1.,  2.], size=(4,), dtype=torch.quint8,
+               quantization_scheme=torch.per_tensor_affine, scale=0.011764705882352941,
+               zero_point=85)
+        >>> t.int_repr()
+        tensor([  0,  85, 170, 255], dtype=torch.uint8)
+    """
+    ...
+def quantized_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, var: Tensor, eps: _float, output_scale: _float, output_zero_point: _int) -> Tensor: 
+    r"""
+    quantized_batch_norm(input, weight=None, bias=None, mean, var, eps, output_scale, output_zero_point) -> Tensor
+    
+    Applies batch normalization on a 4D (NCHW) quantized tensor.
+    
+    .. math::
+    
+            y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+    
+    Arguments:
+        input (Tensor): quantized tensor
+        weight (Tensor): float tensor that corresponds to the gamma, size C
+        bias (Tensor):  float tensor that corresponds to the beta, size C
+        mean (Tensor): float mean value in batch normalization, size C
+        var (Tensor): float tensor for variance, size C
+        eps (float): a value added to the denominator for numerical stability.
+        output_scale (float): output quantized tensor scale
+        output_zero_point (int): output quantized tensor zero_point
+    
+    Returns:
+        Tensor: A quantized tensor with batch normalization applied.
+    
+    Example::
+    
+        >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8)
+        >>> torch.quantized_batch_norm(qx, torch.ones(2), torch.zeros(2), torch.rand(2), torch.rand(2), 0.00001, 0.2, 2)
+        tensor([[[[-0.2000, -0.2000],
+              [ 1.6000, -0.2000]],
+    
+             [[-0.4000, -0.4000],
+              [-0.4000,  0.6000]]],
+    
+    
+            [[[-0.2000, -0.2000],
+              [-0.2000, -0.2000]],
+    
+             [[ 0.6000, -0.4000],
+              [ 0.6000, -0.4000]]]], size=(2, 2, 2, 2), dtype=torch.quint8,
+           quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=2)
+    """
+    ...
+def quantized_gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ...
+def quantized_lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tuple[Tensor, Tensor]: ...
+def quantized_max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: 
+    r"""
+    quantized_max_pool1d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor
+    
+    Applies a 1D max pooling over an input quantized tensor composed of several input planes.
+    
+    Arguments:
+        input (Tensor): quantized tensor
+        kernel_size (``list of int``): the size of the sliding window
+        stride (``list of int``, optional): the stride of the sliding window
+        padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2
+        dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1
+        ceil_mode (bool, optional):  If True, will use ceil instead of floor to compute the output shape.
+            Defaults to False.
+    
+    
+    Returns:
+        Tensor: A quantized tensor with max_pool1d applied.
+    
+    Example::
+    
+        >>> qx = torch.quantize_per_tensor(torch.rand(2, 2), 1.5, 3, torch.quint8)
+        >>> torch.quantized_max_pool1d(qx, [2])
+        tensor([[0.0000],
+                [1.5000]], size=(2, 1), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3)
+    """
+    ...
+def quantized_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: 
+    r"""
+    quantized_max_pool2d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor
+    
+    Applies a 2D max pooling over an input quantized tensor composed of several input planes.
+    
+    Arguments:
+        input (Tensor): quantized tensor
+        kernel_size (``list of int``): the size of the sliding window
+        stride (``list of int``, optional): the stride of the sliding window
+        padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2
+        dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1
+        ceil_mode (bool, optional):  If True, will use ceil instead of floor to compute the output shape.
+            Defaults to False.
+    
+    
+    Returns:
+        Tensor: A quantized tensor with max_pool2d applied.
+    
+    Example::
+    
+        >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8)
+        >>> torch.quantized_max_pool2d(qx, [2,2])
+        tensor([[[[1.5000]],
+    
+                [[1.5000]]],
+    
+    
+                [[[0.0000]],
+    
+                [[0.0000]]]], size=(2, 2, 1, 1), dtype=torch.quint8,
+            quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3)
+    """
+    ...
+def quantized_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ...
+def quantized_rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ...
+def quantized_rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ...
+def rad2deg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    rad2deg(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with each of the elements of :attr:`input`
+    converted from angles in radians to degrees.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword arguments:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]])
+        >>> torch.rad2deg(a)
+        tensor([[ 180.0233, -180.0233],
+                [ 359.9894, -359.9894],
+                [  89.9544,  -89.9544]])
+    """
+    ...
+def rad2deg_(input: Tensor) -> Tensor: ...
+@overload
+def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+@overload
+def rand(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a tensor filled with random numbers from a uniform distribution
+    on the interval :math:`[0, 1)`
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.rand(4)
+        tensor([ 0.5204,  0.2503,  0.3525,  0.5673])
+        >>> torch.rand(2, 3)
+        tensor([[ 0.8237,  0.5781,  0.6879],
+                [ 0.3816,  0.7249,  0.0998]])
+    """
+    ...
+def rand_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor with the same size as :attr:`input` that is filled with
+    random numbers from a uniform distribution on the interval :math:`[0, 1)`.
+    ``torch.rand_like(input)`` is equivalent to
+    ``torch.rand(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    """
+    ...
+@overload
+def randint(low: _int, high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with random integers generated uniformly
+    between :attr:`low` (inclusive) and :attr:`high` (exclusive).
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (tuple): a tuple defining the shape of the output tensor.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``,
+            this function returns a tensor with dtype ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randint(3, 5, (3,))
+        tensor([4, 3, 4])
+    
+    
+        >>> torch.randint(10, (2, 2))
+        tensor([[0, 2],
+                [5, 5]])
+    
+    
+        >>> torch.randint(3, 10, (2, 2))
+        tensor([[4, 5],
+                [6, 7]])
+    """
+    ...
+@overload
+def randint(high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with random integers generated uniformly
+    between :attr:`low` (inclusive) and :attr:`high` (exclusive).
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (tuple): a tuple defining the shape of the output tensor.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``,
+            this function returns a tensor with dtype ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randint(3, 5, (3,))
+        tensor([4, 3, 4])
+    
+    
+        >>> torch.randint(10, (2, 2))
+        tensor([[0, 2],
+                [5, 5]])
+    
+    
+        >>> torch.randint(3, 10, (2, 2))
+        tensor([[4, 5],
+                [6, 7]])
+    """
+    ...
+@overload
+def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with random integers generated uniformly
+    between :attr:`low` (inclusive) and :attr:`high` (exclusive).
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (tuple): a tuple defining the shape of the output tensor.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``,
+            this function returns a tensor with dtype ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randint(3, 5, (3,))
+        tensor([4, 3, 4])
+    
+    
+        >>> torch.randint(10, (2, 2))
+        tensor([[0, 2],
+                [5, 5]])
+    
+    
+        >>> torch.randint(3, 10, (2, 2))
+        tensor([[4, 5],
+                [6, 7]])
+    """
+    ...
+@overload
+def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with random integers generated uniformly
+    between :attr:`low` (inclusive) and :attr:`high` (exclusive).
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (tuple): a tuple defining the shape of the output tensor.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``,
+            this function returns a tensor with dtype ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randint(3, 5, (3,))
+        tensor([4, 3, 4])
+    
+    
+        >>> torch.randint(10, (2, 2))
+        tensor([[0, 2],
+                [5, 5]])
+    
+    
+        >>> torch.randint(3, 10, (2, 2))
+        tensor([[4, 5],
+                [6, 7]])
+    """
+    ...
+@overload
+def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with random integers generated uniformly
+    between :attr:`low` (inclusive) and :attr:`high` (exclusive).
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (tuple): a tuple defining the shape of the output tensor.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``,
+            this function returns a tensor with dtype ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randint(3, 5, (3,))
+        tensor([4, 3, 4])
+    
+    
+        >>> torch.randint(10, (2, 2))
+        tensor([[0, 2],
+                [5, 5]])
+    
+    
+        >>> torch.randint(3, 10, (2, 2))
+        tensor([[4, 5],
+                [6, 7]])
+    """
+    ...
+@overload
+def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with random integers generated uniformly
+    between :attr:`low` (inclusive) and :attr:`high` (exclusive).
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (tuple): a tuple defining the shape of the output tensor.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``,
+            this function returns a tensor with dtype ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randint(3, 5, (3,))
+        tensor([4, 3, 4])
+    
+    
+        >>> torch.randint(10, (2, 2))
+        tensor([[0, 2],
+                [5, 5]])
+    
+    
+        >>> torch.randint(3, 10, (2, 2))
+        tensor([[4, 5],
+                [6, 7]])
+    """
+    ...
+@overload
+def randint_like(input: Tensor, high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor with the same shape as Tensor :attr:`input` filled with
+    random integers generated uniformly between :attr:`low` (inclusive) and
+    :attr:`high` (exclusive).
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    """
+    ...
+@overload
+def randint_like(input: Tensor, low: Union[_int, SymInt], high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor with the same shape as Tensor :attr:`input` filled with
+    random integers generated uniformly between :attr:`low` (inclusive) and
+    :attr:`high` (exclusive).
+    
+    .. note::
+        With the global dtype default (``torch.float32``), this function returns
+        a tensor with dtype ``torch.int64``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+        low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+        high (int): One above the highest integer to be drawn from the distribution.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    """
+    ...
+@overload
+def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+@overload
+def randn(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    
+    Returns a tensor filled with random numbers from a normal distribution
+    with mean `0` and variance `1` (also called the standard normal
+    distribution).
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{N}(0, 1)
+    
+    For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and
+    unit variance as
+    
+    .. math::
+        \text{out}_{i} \sim \mathcal{CN}(0, 1)
+    
+    This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary
+    :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as
+    
+    .. math::
+        \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad
+        \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2})
+    
+    The shape of the tensor is defined by the variable argument :attr:`size`.
+    
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randn(4)
+        tensor([-2.1436,  0.9966,  2.3426, -0.6366])
+        >>> torch.randn(2, 3)
+        tensor([[ 1.5954,  2.8929, -1.0923],
+                [ 1.1719, -0.4709, -0.1996]])
+    
+    .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution
+    """
+    ...
+def randn_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor with the same size as :attr:`input` that is filled with
+    random numbers from a normal distribution with mean 0 and variance 1. Please refer to :func:`torch.randn` for the
+    sampling process of complex dtypes. ``torch.randn_like(input)`` is equivalent to
+    ``torch.randn(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    """
+    ...
+@overload
+def randperm(n: Union[_int, SymInt], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randperm(n, *, generator=None, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a random permutation of integers from ``0`` to ``n - 1``.
+    
+    Args:
+        n (int): the upper bound (exclusive)
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randperm(4)
+        tensor([2, 1, 0, 3])
+    """
+    ...
+@overload
+def randperm(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    randperm(n, *, generator=None, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Returns a random permutation of integers from ``0`` to ``n - 1``.
+    
+    Args:
+        n (int): the upper bound (exclusive)
+    
+    Keyword args:
+        generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: ``torch.int64``.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.randperm(4)
+        tensor([2, 1, 0, 3])
+    """
+    ...
+def range(start: Number, end: Number, step: Number = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1`
+    with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is
+    the gap between two values in the tensor.
+    
+    .. math::
+        \text{out}_{i+1} = \text{out}_i + \text{step}.
+    
+    .. warning::
+        This function is deprecated and will be removed in a future release because its behavior is inconsistent with
+        Python's range builtin. Instead, use :func:`torch.arange`, which produces values in [start, end).
+    
+    Args:
+        start (float): the starting value for the set of points. Default: ``0``.
+        end (float): the ending value for the set of points
+        step (float): the gap between each pair of adjacent points. Default: ``1``.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input
+            arguments. If any of `start`, `end`, or `step` are floating-point, the
+            `dtype` is inferred to be the default dtype, see
+            :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to
+            be `torch.int64`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.range(1, 4)
+        tensor([ 1.,  2.,  3.,  4.])
+        >>> torch.range(1, 4, 0.5)
+        tensor([ 1.0000,  1.5000,  2.0000,  2.5000,  3.0000,  3.5000,  4.0000])
+    """
+    ...
+def ravel(input: Tensor) -> Tensor: 
+    r"""
+    ravel(input) -> Tensor
+    
+    Return a contiguous flattened tensor. A copy is made only if needed.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> t = torch.tensor([[[1, 2],
+        ...                    [3, 4]],
+        ...                   [[5, 6],
+        ...                    [7, 8]]])
+        >>> torch.ravel(t)
+        tensor([1, 2, 3, 4, 5, 6, 7, 8])
+    """
+    ...
+def real(input: Tensor) -> Tensor: 
+    r"""
+    real(input) -> Tensor
+    
+    Returns a new tensor containing real values of the :attr:`input` tensor.
+    The returned tensor and :attr:`input` share the same underlying storage.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x=torch.randn(4, dtype=torch.cfloat)
+        >>> x
+        tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)])
+        >>> x.real
+        tensor([ 0.3100, -0.5445, -1.6492, -0.0638])
+    """
+    ...
+def reciprocal(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    reciprocal(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the reciprocal of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \frac{1}{\text{input}_{i}}
+    
+    .. note::
+        Unlike NumPy's reciprocal, torch.reciprocal supports integral inputs. Integral
+        inputs to reciprocal are automatically :ref:`promoted <type-promotion-doc>` to
+        the default scalar type.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-0.4595, -2.1219, -1.4314,  0.7298])
+        >>> torch.reciprocal(a)
+        tensor([-2.1763, -0.4713, -0.6986,  1.3702])
+    """
+    ...
+def reciprocal_(input: Tensor) -> Tensor: ...
+def relu(input: Tensor) -> Tensor: ...
+def relu_(input: Tensor) -> Tensor: ...
+@overload
+def remainder(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    remainder(input, other, *, out=None) -> Tensor
+    
+    Computes
+    `Python's modulus operation <https://docs.python.org/3/reference/expressions.html#binary-arithmetic-operations>`_
+    entrywise.  The result has the same sign as the divisor :attr:`other` and its absolute value
+    is less than that of :attr:`other`.
+    
+    It may also be defined in terms of :func:`torch.div` as
+    
+    .. code:: python
+    
+        torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and float inputs.
+    
+    .. note::
+        Complex inputs are not supported. In some cases, it is not mathematically
+        possible to satisfy the definition of a modulo operation with complex numbers.
+        See :func:`torch.fmod` for how division by zero is handled.
+    
+    .. seealso::
+    
+        :func:`torch.fmod` which implements C++'s `std::fmod <https://en.cppreference.com/w/cpp/numeric/math/fmod>`_.
+        Unlike :func:`torch.remainder`, :func:`torch.fmod` is defined in terms of division rounding towards zero.
+    
+    Args:
+        input (Tensor or Scalar): the dividend
+        other (Tensor or Scalar): the divisor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2)
+        tensor([ 1.,  0.,  1.,  1.,  0.,  1.])
+        >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5)
+        tensor([ -0.5000, -1.0000,  0.0000, -0.5000, -1.0000 ])
+    """
+    ...
+@overload
+def remainder(self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    remainder(input, other, *, out=None) -> Tensor
+    
+    Computes
+    `Python's modulus operation <https://docs.python.org/3/reference/expressions.html#binary-arithmetic-operations>`_
+    entrywise.  The result has the same sign as the divisor :attr:`other` and its absolute value
+    is less than that of :attr:`other`.
+    
+    It may also be defined in terms of :func:`torch.div` as
+    
+    .. code:: python
+    
+        torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and float inputs.
+    
+    .. note::
+        Complex inputs are not supported. In some cases, it is not mathematically
+        possible to satisfy the definition of a modulo operation with complex numbers.
+        See :func:`torch.fmod` for how division by zero is handled.
+    
+    .. seealso::
+    
+        :func:`torch.fmod` which implements C++'s `std::fmod <https://en.cppreference.com/w/cpp/numeric/math/fmod>`_.
+        Unlike :func:`torch.remainder`, :func:`torch.fmod` is defined in terms of division rounding towards zero.
+    
+    Args:
+        input (Tensor or Scalar): the dividend
+        other (Tensor or Scalar): the divisor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2)
+        tensor([ 1.,  0.,  1.,  1.,  0.,  1.])
+        >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5)
+        tensor([ -0.5000, -1.0000,  0.0000, -0.5000, -1.0000 ])
+    """
+    ...
+@overload
+def remainder(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    remainder(input, other, *, out=None) -> Tensor
+    
+    Computes
+    `Python's modulus operation <https://docs.python.org/3/reference/expressions.html#binary-arithmetic-operations>`_
+    entrywise.  The result has the same sign as the divisor :attr:`other` and its absolute value
+    is less than that of :attr:`other`.
+    
+    It may also be defined in terms of :func:`torch.div` as
+    
+    .. code:: python
+    
+        torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer and float inputs.
+    
+    .. note::
+        Complex inputs are not supported. In some cases, it is not mathematically
+        possible to satisfy the definition of a modulo operation with complex numbers.
+        See :func:`torch.fmod` for how division by zero is handled.
+    
+    .. seealso::
+    
+        :func:`torch.fmod` which implements C++'s `std::fmod <https://en.cppreference.com/w/cpp/numeric/math/fmod>`_.
+        Unlike :func:`torch.remainder`, :func:`torch.fmod` is defined in terms of division rounding towards zero.
+    
+    Args:
+        input (Tensor or Scalar): the dividend
+        other (Tensor or Scalar): the divisor
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2)
+        tensor([ 1.,  0.,  1.,  1.,  0.,  1.])
+        >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5)
+        tensor([ -0.5000, -1.0000,  0.0000, -0.5000, -1.0000 ])
+    """
+    ...
+def renorm(input: Tensor, p: Union[Number, _complex], dim: _int, maxnorm: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    renorm(input, p, dim, maxnorm, *, out=None) -> Tensor
+    
+    Returns a tensor where each sub-tensor of :attr:`input` along dimension
+    :attr:`dim` is normalized such that the `p`-norm of the sub-tensor does not
+    exceed the value :attr:`maxnorm`.
+    
+    .. note:: If the norm of a row is lower than `maxnorm`, the row is unchanged
+    
+    Args:
+        input (Tensor): the input tensor.
+        p (float): the power for the norm computation
+        dim (int): the dimension to slice over to get the sub-tensors
+        maxnorm (float): the maximum norm to keep each sub-tensor under
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.ones(3, 3)
+        >>> x[1].fill_(2)
+        tensor([ 2.,  2.,  2.])
+        >>> x[2].fill_(3)
+        tensor([ 3.,  3.,  3.])
+        >>> x
+        tensor([[ 1.,  1.,  1.],
+                [ 2.,  2.,  2.],
+                [ 3.,  3.,  3.]])
+        >>> torch.renorm(x, 1, 0, 5)
+        tensor([[ 1.0000,  1.0000,  1.0000],
+                [ 1.6667,  1.6667,  1.6667],
+                [ 1.6667,  1.6667,  1.6667]])
+    """
+    ...
+@overload
+def repeat_interleave(input: Tensor, repeats: Tensor, dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: 
+    r"""
+    repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor
+    
+    Repeat elements of a tensor.
+    
+    .. warning::
+    
+        This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        repeats (Tensor or int): The number of repetitions for each element.
+            repeats is broadcasted to fit the shape of the given axis.
+        dim (int, optional): The dimension along which to repeat values.
+            By default, use the flattened input array, and return a flat output
+            array.
+    
+    Keyword args:
+        output_size (int, optional): Total output size for the given axis
+            (e.g. sum of repeats). If given, it will avoid stream synchronization
+            needed to calculate output shape of the tensor.
+    
+    Returns:
+        Tensor: Repeated tensor which has the same shape as input, except along the given axis.
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3])
+        >>> x.repeat_interleave(2)
+        tensor([1, 1, 2, 2, 3, 3])
+        >>> y = torch.tensor([[1, 2], [3, 4]])
+        >>> torch.repeat_interleave(y, 2)
+        tensor([1, 1, 2, 2, 3, 3, 4, 4])
+        >>> torch.repeat_interleave(y, 3, dim=1)
+        tensor([[1, 1, 1, 2, 2, 2],
+                [3, 3, 3, 4, 4, 4]])
+        >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0)
+        tensor([[1, 2],
+                [3, 4],
+                [3, 4]])
+        >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3)
+        tensor([[1, 2],
+                [3, 4],
+                [3, 4]])
+    
+    If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be
+    `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times,
+    `1` appears `n2` times, `2` appears `n3` times, etc.
+    
+    .. function:: repeat_interleave(repeats, *) -> Tensor
+       :noindex:
+    
+    Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc.
+    
+    Args:
+        repeats (Tensor): The number of repetitions for each element.
+    
+    Returns:
+        Tensor: Repeated tensor of size `sum(repeats)`.
+    
+    Example::
+    
+        >>> torch.repeat_interleave(torch.tensor([1, 2, 3]))
+        tensor([0, 1, 1, 2, 2, 2])
+    """
+    ...
+@overload
+def repeat_interleave(repeats: Tensor, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: 
+    r"""
+    repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor
+    
+    Repeat elements of a tensor.
+    
+    .. warning::
+    
+        This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        repeats (Tensor or int): The number of repetitions for each element.
+            repeats is broadcasted to fit the shape of the given axis.
+        dim (int, optional): The dimension along which to repeat values.
+            By default, use the flattened input array, and return a flat output
+            array.
+    
+    Keyword args:
+        output_size (int, optional): Total output size for the given axis
+            (e.g. sum of repeats). If given, it will avoid stream synchronization
+            needed to calculate output shape of the tensor.
+    
+    Returns:
+        Tensor: Repeated tensor which has the same shape as input, except along the given axis.
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3])
+        >>> x.repeat_interleave(2)
+        tensor([1, 1, 2, 2, 3, 3])
+        >>> y = torch.tensor([[1, 2], [3, 4]])
+        >>> torch.repeat_interleave(y, 2)
+        tensor([1, 1, 2, 2, 3, 3, 4, 4])
+        >>> torch.repeat_interleave(y, 3, dim=1)
+        tensor([[1, 1, 1, 2, 2, 2],
+                [3, 3, 3, 4, 4, 4]])
+        >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0)
+        tensor([[1, 2],
+                [3, 4],
+                [3, 4]])
+        >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3)
+        tensor([[1, 2],
+                [3, 4],
+                [3, 4]])
+    
+    If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be
+    `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times,
+    `1` appears `n2` times, `2` appears `n3` times, etc.
+    
+    .. function:: repeat_interleave(repeats, *) -> Tensor
+       :noindex:
+    
+    Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc.
+    
+    Args:
+        repeats (Tensor): The number of repetitions for each element.
+    
+    Returns:
+        Tensor: Repeated tensor of size `sum(repeats)`.
+    
+    Example::
+    
+        >>> torch.repeat_interleave(torch.tensor([1, 2, 3]))
+        tensor([0, 1, 1, 2, 2, 2])
+    """
+    ...
+@overload
+def repeat_interleave(input: Tensor, repeats: Union[_int, SymInt], dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: 
+    r"""
+    repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor
+    
+    Repeat elements of a tensor.
+    
+    .. warning::
+    
+        This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        repeats (Tensor or int): The number of repetitions for each element.
+            repeats is broadcasted to fit the shape of the given axis.
+        dim (int, optional): The dimension along which to repeat values.
+            By default, use the flattened input array, and return a flat output
+            array.
+    
+    Keyword args:
+        output_size (int, optional): Total output size for the given axis
+            (e.g. sum of repeats). If given, it will avoid stream synchronization
+            needed to calculate output shape of the tensor.
+    
+    Returns:
+        Tensor: Repeated tensor which has the same shape as input, except along the given axis.
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3])
+        >>> x.repeat_interleave(2)
+        tensor([1, 1, 2, 2, 3, 3])
+        >>> y = torch.tensor([[1, 2], [3, 4]])
+        >>> torch.repeat_interleave(y, 2)
+        tensor([1, 1, 2, 2, 3, 3, 4, 4])
+        >>> torch.repeat_interleave(y, 3, dim=1)
+        tensor([[1, 1, 1, 2, 2, 2],
+                [3, 3, 3, 4, 4, 4]])
+        >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0)
+        tensor([[1, 2],
+                [3, 4],
+                [3, 4]])
+        >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3)
+        tensor([[1, 2],
+                [3, 4],
+                [3, 4]])
+    
+    If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be
+    `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times,
+    `1` appears `n2` times, `2` appears `n3` times, etc.
+    
+    .. function:: repeat_interleave(repeats, *) -> Tensor
+       :noindex:
+    
+    Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc.
+    
+    Args:
+        repeats (Tensor): The number of repetitions for each element.
+    
+    Returns:
+        Tensor: Repeated tensor of size `sum(repeats)`.
+    
+    Example::
+    
+        >>> torch.repeat_interleave(torch.tensor([1, 2, 3]))
+        tensor([0, 1, 1, 2, 2, 2])
+    """
+    ...
+def reshape(input: Tensor, shape: Sequence[Union[_int, SymInt]]) -> Tensor: 
+    r"""
+    reshape(input, shape) -> Tensor
+    
+    Returns a tensor with the same data and number of elements as :attr:`input`,
+    but with the specified shape. When possible, the returned tensor will be a view
+    of :attr:`input`. Otherwise, it will be a copy. Contiguous inputs and inputs
+    with compatible strides can be reshaped without copying, but you should not
+    depend on the copying vs. viewing behavior.
+    
+    See :meth:`torch.Tensor.view` on when it is possible to return a view.
+    
+    A single dimension may be -1, in which case it's inferred from the remaining
+    dimensions and the number of elements in :attr:`input`.
+    
+    Args:
+        input (Tensor): the tensor to be reshaped
+        shape (tuple of int): the new shape
+    
+    Example::
+    
+        >>> a = torch.arange(4.)
+        >>> torch.reshape(a, (2, 2))
+        tensor([[ 0.,  1.],
+                [ 2.,  3.]])
+        >>> b = torch.tensor([[0, 1], [2, 3]])
+        >>> torch.reshape(b, (-1,))
+        tensor([ 0,  1,  2,  3])
+    """
+    ...
+def resize_as_(input: Tensor, the_template: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: ...
+def resize_as_sparse_(input: Tensor, the_template: Tensor) -> Tensor: ...
+def resolve_conj(input: Tensor) -> Tensor: 
+    r"""
+    resolve_conj(input) -> Tensor
+    
+    Returns a new tensor with materialized conjugation if :attr:`input`'s conjugate bit is set to `True`,
+    else returns :attr:`input`. The output tensor will always have its conjugate bit set to `False`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])
+        >>> y = x.conj()
+        >>> y.is_conj()
+        True
+        >>> z = y.resolve_conj()
+        >>> z
+        tensor([-1 - 1j, -2 - 2j, 3 + 3j])
+        >>> z.is_conj()
+        False
+    """
+    ...
+def resolve_neg(input: Tensor) -> Tensor: 
+    r"""
+    resolve_neg(input) -> Tensor
+    
+    Returns a new tensor with materialized negation if :attr:`input`'s negative bit is set to `True`,
+    else returns :attr:`input`. The output tensor will always have its negative bit set to `False`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])
+        >>> y = x.conj()
+        >>> z = y.imag
+        >>> z.is_neg()
+        True
+        >>> out = z.resolve_neg()
+        >>> out
+        tensor([-1., -2., 3.])
+        >>> out.is_neg()
+        False
+    """
+    ...
+@overload
+def result_type(tensor: Tensor, other: Tensor) -> _dtype: 
+    r"""
+    result_type(tensor1, tensor2) -> dtype
+    
+    Returns the :class:`torch.dtype` that would result from performing an arithmetic
+    operation on the provided inputs (tensors or numbers). See type promotion :ref:`documentation <type-promotion-doc>`
+    for more information on the type promotion logic.
+    
+    Args:
+        tensor1 (Tensor or Number): an input tensor or number
+        tensor2 (Tensor or Number): an input tensor or number
+    
+    Example::
+    
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0)
+        torch.float32
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1))
+        torch.uint8
+    """
+    ...
+@overload
+def result_type(scalar: Union[Number, _complex], tensor: Tensor) -> _dtype: 
+    r"""
+    result_type(tensor1, tensor2) -> dtype
+    
+    Returns the :class:`torch.dtype` that would result from performing an arithmetic
+    operation on the provided inputs (tensors or numbers). See type promotion :ref:`documentation <type-promotion-doc>`
+    for more information on the type promotion logic.
+    
+    Args:
+        tensor1 (Tensor or Number): an input tensor or number
+        tensor2 (Tensor or Number): an input tensor or number
+    
+    Example::
+    
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0)
+        torch.float32
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1))
+        torch.uint8
+    """
+    ...
+@overload
+def result_type(tensor: Tensor, other: Union[Number, _complex]) -> _dtype: 
+    r"""
+    result_type(tensor1, tensor2) -> dtype
+    
+    Returns the :class:`torch.dtype` that would result from performing an arithmetic
+    operation on the provided inputs (tensors or numbers). See type promotion :ref:`documentation <type-promotion-doc>`
+    for more information on the type promotion logic.
+    
+    Args:
+        tensor1 (Tensor or Number): an input tensor or number
+        tensor2 (Tensor or Number): an input tensor or number
+    
+    Example::
+    
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0)
+        torch.float32
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1))
+        torch.uint8
+    """
+    ...
+@overload
+def result_type(scalar1: Union[Number, _complex], scalar2: Union[Number, _complex]) -> _dtype: 
+    r"""
+    result_type(tensor1, tensor2) -> dtype
+    
+    Returns the :class:`torch.dtype` that would result from performing an arithmetic
+    operation on the provided inputs (tensors or numbers). See type promotion :ref:`documentation <type-promotion-doc>`
+    for more information on the type promotion logic.
+    
+    Args:
+        tensor1 (Tensor or Number): an input tensor or number
+        tensor2 (Tensor or Number): an input tensor or number
+    
+    Example::
+    
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0)
+        torch.float32
+        >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1))
+        torch.uint8
+    """
+    ...
+@overload
+def rnn_relu(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ...
+@overload
+def rnn_relu(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ...
+def rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ...
+@overload
+def rnn_tanh(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ...
+@overload
+def rnn_tanh(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ...
+def rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ...
+def roll(input: Tensor, shifts: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], dims: Union[_int, _size] = ()) -> Tensor: 
+    r"""
+    roll(input, shifts, dims=None) -> Tensor
+    
+    Roll the tensor :attr:`input` along the given dimension(s). Elements that are
+    shifted beyond the last position are re-introduced at the first position. If
+    :attr:`dims` is `None`, the tensor will be flattened before rolling and then
+    restored to the original shape.
+    
+    Args:
+        input (Tensor): the input tensor.
+        shifts (int or tuple of ints): The number of places by which the elements
+            of the tensor are shifted. If shifts is a tuple, dims must be a tuple of
+            the same size, and each dimension will be rolled by the corresponding
+            value
+        dims (int or tuple of ints): Axis along which to roll
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2)
+        >>> x
+        tensor([[1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8]])
+        >>> torch.roll(x, 1)
+        tensor([[8, 1],
+                [2, 3],
+                [4, 5],
+                [6, 7]])
+        >>> torch.roll(x, 1, 0)
+        tensor([[7, 8],
+                [1, 2],
+                [3, 4],
+                [5, 6]])
+        >>> torch.roll(x, -1, 0)
+        tensor([[3, 4],
+                [5, 6],
+                [7, 8],
+                [1, 2]])
+        >>> torch.roll(x, shifts=(2, 1), dims=(0, 1))
+        tensor([[6, 5],
+                [8, 7],
+                [2, 1],
+                [4, 3]])
+    """
+    ...
+def rot90(input: Tensor, k: _int = 1, dims: _size = (0,1)) -> Tensor: 
+    r"""
+    rot90(input, k=1, dims=[0,1]) -> Tensor
+    
+    Rotate an n-D tensor by 90 degrees in the plane specified by the :attr:`dims` axes.
+    Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0.
+    
+    Args:
+        input (Tensor): the input tensor.
+        k (int): number of times to rotate. Default value is 1
+        dims (a list or tuple): the two axes that define the plane of rotation. Default value is [0, 1]
+    
+    Example::
+    
+        >>> x = torch.arange(4).view(2, 2)
+        >>> x
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.rot90(x, 1, [0, 1])
+        tensor([[1, 3],
+                [0, 2]])
+    
+        >>> x = torch.arange(8).view(2, 2, 2)
+        >>> x
+        tensor([[[0, 1],
+                 [2, 3]],
+    
+                [[4, 5],
+                 [6, 7]]])
+        >>> torch.rot90(x, 1, [1, 2])
+        tensor([[[1, 3],
+                 [0, 2]],
+    
+                [[5, 7],
+                 [4, 6]]])
+    """
+    ...
+@overload
+def round(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    round(input, *, decimals=0, out=None) -> Tensor
+    
+    Rounds elements of :attr:`input` to the nearest integer.
+    
+    For integer inputs, follows the array-api convention of returning a
+    copy of the input tensor.
+    The return type of output is same as that of input's dtype.
+    
+    .. note::
+        This function implements the "round half to even" to
+        break ties when a number is equidistant from two
+        integers (e.g. `round(2.5)` is 2).
+    
+        When the :attr:`decimals` argument is specified the
+        algorithm used is similar to NumPy's `around`. This
+        algorithm is fast but inexact and it can easily
+        overflow for low precision dtypes.
+        E.g. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`.
+    
+    .. seealso::
+        :func:`torch.ceil`, which rounds up.
+        :func:`torch.floor`, which rounds down.
+        :func:`torch.trunc`, which rounds towards zero.
+    
+    Args:
+        input (Tensor): the input tensor.
+        decimals (int): Number of decimal places to round to (default: 0).
+            If decimals is negative, it specifies the number of positions
+            to the left of the decimal point.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7)))
+        tensor([ 5.,  -2.,  9., -8.])
+    
+        >>> # Values equidistant from two integers are rounded towards
+        >>> #   the nearest even value (zero is treated as even)
+        >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5]))
+        tensor([-0., 0., 2., 2.])
+    
+        >>> # A positive decimals argument rounds to that decimal place
+        >>> torch.round(torch.tensor([0.1234567]), decimals=3)
+        tensor([0.1230])
+    
+        >>> # A negative decimals argument rounds to the left of the decimal
+        >>> torch.round(torch.tensor([1200.1234567]), decimals=-3)
+        tensor([1000.])
+    """
+    ...
+@overload
+def round(input: Tensor, *, decimals: _int, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    round(input, *, decimals=0, out=None) -> Tensor
+    
+    Rounds elements of :attr:`input` to the nearest integer.
+    
+    For integer inputs, follows the array-api convention of returning a
+    copy of the input tensor.
+    The return type of output is same as that of input's dtype.
+    
+    .. note::
+        This function implements the "round half to even" to
+        break ties when a number is equidistant from two
+        integers (e.g. `round(2.5)` is 2).
+    
+        When the :attr:`decimals` argument is specified the
+        algorithm used is similar to NumPy's `around`. This
+        algorithm is fast but inexact and it can easily
+        overflow for low precision dtypes.
+        E.g. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`.
+    
+    .. seealso::
+        :func:`torch.ceil`, which rounds up.
+        :func:`torch.floor`, which rounds down.
+        :func:`torch.trunc`, which rounds towards zero.
+    
+    Args:
+        input (Tensor): the input tensor.
+        decimals (int): Number of decimal places to round to (default: 0).
+            If decimals is negative, it specifies the number of positions
+            to the left of the decimal point.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7)))
+        tensor([ 5.,  -2.,  9., -8.])
+    
+        >>> # Values equidistant from two integers are rounded towards
+        >>> #   the nearest even value (zero is treated as even)
+        >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5]))
+        tensor([-0., 0., 2., 2.])
+    
+        >>> # A positive decimals argument rounds to that decimal place
+        >>> torch.round(torch.tensor([0.1234567]), decimals=3)
+        tensor([0.1230])
+    
+        >>> # A negative decimals argument rounds to the left of the decimal
+        >>> torch.round(torch.tensor([1200.1234567]), decimals=-3)
+        tensor([1000.])
+    """
+    ...
+@overload
+def round_(input: Tensor) -> Tensor: ...
+@overload
+def round_(input: Tensor, *, decimals: _int) -> Tensor: ...
+def row_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def row_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    row_stack(tensors, *, out=None) -> Tensor
+    
+    Alias of :func:`torch.vstack`.
+    """
+    ...
+def rrelu(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ...
+def rrelu_(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ...
+def rsqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    rsqrt(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the reciprocal of the square-root of each of
+    the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \frac{1}{\sqrt{\text{input}_{i}}}
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-0.0370,  0.2970,  1.5420, -0.9105])
+        >>> torch.rsqrt(a)
+        tensor([    nan,  1.8351,  0.8053,     nan])
+    """
+    ...
+def rsqrt_(input: Tensor) -> Tensor: ...
+@overload
+def rsub(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ...
+@overload
+def rsub(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ...
+def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number = 1, alpha: Number = 1, out: Optional[Tensor] = None) -> Tensor: ...
+def scalar_tensor(s: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ...
+@overload
+def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, reduce: str, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    scatter(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_`
+    """
+    ...
+@overload
+def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    scatter(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_`
+    """
+    ...
+@overload
+def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, reduce: str, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    scatter(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_`
+    """
+    ...
+@overload
+def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: 
+    r"""
+    scatter(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_`
+    """
+    ...
+@overload
+def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    scatter(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_`
+    """
+    ...
+@overload
+def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: 
+    r"""
+    scatter(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_`
+    """
+    ...
+@overload
+def scatter_add(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    scatter_add(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_add_`
+    """
+    ...
+@overload
+def scatter_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: 
+    r"""
+    scatter_add(input, dim, index, src) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_add_`
+    """
+    ...
+def scatter_reduce(input: Tensor, dim: _int, index: Tensor, src: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor
+    
+    Out-of-place version of :meth:`torch.Tensor.scatter_reduce_`
+    """
+    ...
+@overload
+def searchsorted(sorted_sequence: Tensor, input: Tensor, *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor
+    
+    Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the
+    corresponding values in :attr:`values` were inserted before the indices, when sorted, the order
+    of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved.
+    Return a new tensor with the same size as :attr:`values`. More formally,
+    the returned index satisfies the following rules:
+    
+    .. list-table::
+       :widths: 12 10 78
+       :header-rows: 1
+    
+       * - :attr:`sorted_sequence`
+         - :attr:`right`
+         - *returned index satisfies*
+       * - 1-D
+         - False
+         - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]``
+       * - 1-D
+         - True
+         - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]``
+       * - N-D
+         - False
+         - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]``
+       * - N-D
+         - True
+         - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]``
+    
+    Args:
+        sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost*
+                                  dimension unless :attr:`sorter` is provided, in which case the sequence does not
+                                  need to be sorted
+        values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s).
+    
+    Keyword args:
+        out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise.
+                                    Default value is False, i.e. default output data type is torch.int64.
+        right (bool, optional): if False, return the first suitable location that is found. If True, return the
+                                last such index. If no suitable index found, return 0 for non-numerical value
+                                (e.g. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence`
+                                (one past the last index of the *innermost* dimension). In other words, if False,
+                                gets the lower bound index for each value in :attr:`values` on the corresponding
+                                *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper
+                                bound index instead. Default value is False. :attr:`side` does the same and is
+                                preferred. It will error if :attr:`side` is set to "left" while this is True.
+        side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right`
+                                and "right" corresponds to True for :attr:`right`. It will error if this is set to
+                                "left" while :attr:`right` is True. Default value is None.
+        out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided.
+        sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted
+                                :attr:`sorted_sequence` containing a sequence of indices that sort it in the
+                                ascending order on the innermost dimension
+    
+    
+    Example::
+    
+        >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]])
+        >>> sorted_sequence
+        tensor([[ 1,  3,  5,  7,  9],
+                [ 2,  4,  6,  8, 10]])
+        >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]])
+        >>> values
+        tensor([[3, 6, 9],
+                [3, 6, 9]])
+        >>> torch.searchsorted(sorted_sequence, values)
+        tensor([[1, 3, 4],
+                [1, 2, 4]])
+        >>> torch.searchsorted(sorted_sequence, values, side='right')
+        tensor([[2, 3, 5],
+                [1, 3, 4]])
+    
+        >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9])
+        >>> sorted_sequence_1d
+        tensor([1, 3, 5, 7, 9])
+        >>> torch.searchsorted(sorted_sequence_1d, values)
+        tensor([[1, 3, 4],
+                [1, 3, 4]])
+    """
+    ...
+@overload
+def searchsorted(sorted_sequence: Tensor, self: Union[Number, _complex], *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor
+    
+    Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the
+    corresponding values in :attr:`values` were inserted before the indices, when sorted, the order
+    of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved.
+    Return a new tensor with the same size as :attr:`values`. More formally,
+    the returned index satisfies the following rules:
+    
+    .. list-table::
+       :widths: 12 10 78
+       :header-rows: 1
+    
+       * - :attr:`sorted_sequence`
+         - :attr:`right`
+         - *returned index satisfies*
+       * - 1-D
+         - False
+         - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]``
+       * - 1-D
+         - True
+         - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]``
+       * - N-D
+         - False
+         - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]``
+       * - N-D
+         - True
+         - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]``
+    
+    Args:
+        sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost*
+                                  dimension unless :attr:`sorter` is provided, in which case the sequence does not
+                                  need to be sorted
+        values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s).
+    
+    Keyword args:
+        out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise.
+                                    Default value is False, i.e. default output data type is torch.int64.
+        right (bool, optional): if False, return the first suitable location that is found. If True, return the
+                                last such index. If no suitable index found, return 0 for non-numerical value
+                                (e.g. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence`
+                                (one past the last index of the *innermost* dimension). In other words, if False,
+                                gets the lower bound index for each value in :attr:`values` on the corresponding
+                                *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper
+                                bound index instead. Default value is False. :attr:`side` does the same and is
+                                preferred. It will error if :attr:`side` is set to "left" while this is True.
+        side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right`
+                                and "right" corresponds to True for :attr:`right`. It will error if this is set to
+                                "left" while :attr:`right` is True. Default value is None.
+        out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided.
+        sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted
+                                :attr:`sorted_sequence` containing a sequence of indices that sort it in the
+                                ascending order on the innermost dimension
+    
+    
+    Example::
+    
+        >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]])
+        >>> sorted_sequence
+        tensor([[ 1,  3,  5,  7,  9],
+                [ 2,  4,  6,  8, 10]])
+        >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]])
+        >>> values
+        tensor([[3, 6, 9],
+                [3, 6, 9]])
+        >>> torch.searchsorted(sorted_sequence, values)
+        tensor([[1, 3, 4],
+                [1, 2, 4]])
+        >>> torch.searchsorted(sorted_sequence, values, side='right')
+        tensor([[2, 3, 5],
+                [1, 3, 4]])
+    
+        >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9])
+        >>> sorted_sequence_1d
+        tensor([1, 3, 5, 7, 9])
+        >>> torch.searchsorted(sorted_sequence_1d, values)
+        tensor([[1, 3, 4],
+                [1, 3, 4]])
+    """
+    ...
+def segment_reduce(data: Tensor, reduce: str, *, lengths: Optional[Tensor] = None, indices: Optional[Tensor] = None, offsets: Optional[Tensor] = None, axis: _int = 0, unsafe: _bool = False, initial: Optional[Union[Number, _complex]] = None) -> Tensor: ...
+@overload
+def select(input: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: 
+    r"""
+    select(input, dim, index) -> Tensor
+    
+    Slices the :attr:`input` tensor along the selected dimension at the given index.
+    This function returns a view of the original tensor with the given dimension removed.
+    
+    .. note:: If :attr:`input` is a sparse tensor and returning a view of
+              the tensor is not possible, a RuntimeError exception is
+              raised. If this is the case, consider using
+              :func:`torch.select_copy` function.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to slice
+        index (int): the index to select with
+    
+    .. note::
+    
+        :meth:`select` is equivalent to slicing. For example,
+        ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and
+        ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``.
+    """
+    ...
+@overload
+def select(input: Tensor, dim: Union[str, ellipsis, None], index: _int) -> Tensor: 
+    r"""
+    select(input, dim, index) -> Tensor
+    
+    Slices the :attr:`input` tensor along the selected dimension at the given index.
+    This function returns a view of the original tensor with the given dimension removed.
+    
+    .. note:: If :attr:`input` is a sparse tensor and returning a view of
+              the tensor is not possible, a RuntimeError exception is
+              raised. If this is the case, consider using
+              :func:`torch.select_copy` function.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the dimension to slice
+        index (int): the index to select with
+    
+    .. note::
+    
+        :meth:`select` is equivalent to slicing. For example,
+        ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and
+        ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``.
+    """
+    ...
+def select_copy(input: Tensor, dim: _int, index: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.select`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def select_scatter(input: Tensor, src: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: 
+    r"""
+    select_scatter(input, src, dim, index) -> Tensor
+    
+    Embeds the values of the :attr:`src` tensor into :attr:`input` at the given index.
+    This function returns a tensor with fresh storage; it does not create a view.
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        src (Tensor): The tensor to embed into :attr:`input`
+        dim (int): the dimension to insert the slice into.
+        index (int): the index to select with
+    
+    .. note::
+    
+        :attr:`src` must be of the proper size in order to be embedded
+        into :attr:`input`. Specifically, it should have the same shape as
+        ``torch.select(input, dim, index)``
+    
+    Example::
+    
+        >>> a = torch.zeros(2, 2)
+        >>> b = torch.ones(2)
+        >>> a.select_scatter(b, 0, 0)
+        tensor([[1., 1.],
+                [0., 0.]])
+    """
+    ...
+def selu(input: Tensor) -> Tensor: ...
+def selu_(input: Tensor) -> Tensor: ...
+def set_flush_denormal(mode: _bool) -> _bool: 
+    r"""
+    set_flush_denormal(mode) -> bool
+    
+    Disables denormal floating numbers on CPU.
+    
+    Returns ``True`` if your system supports flushing denormal numbers and it
+    successfully configures flush denormal mode.  :meth:`~torch.set_flush_denormal`
+    is supported on x86 architectures supporting SSE3 and AArch64 architecture.
+    
+    Args:
+        mode (bool): Controls whether to enable flush denormal mode or not
+    
+    Example::
+    
+        >>> torch.set_flush_denormal(True)
+        True
+        >>> torch.tensor([1e-323], dtype=torch.float64)
+        tensor([ 0.], dtype=torch.float64)
+        >>> torch.set_flush_denormal(False)
+        True
+        >>> torch.tensor([1e-323], dtype=torch.float64)
+        tensor(9.88131e-324 *
+               [ 1.0000], dtype=torch.float64)
+    """
+    ...
+def set_num_interop_threads(num: _int) -> None: 
+    r"""
+    set_num_interop_threads(int)
+    
+    Sets the number of threads used for interop parallelism
+    (e.g. in JIT interpreter) on CPU.
+    
+    .. warning::
+        Can only be called once and before any inter-op parallel work
+        is started (e.g. JIT execution).
+    """
+    ...
+def set_num_threads(num: _int) -> None: 
+    r"""
+    set_num_threads(int)
+    
+    Sets the number of threads used for intraop parallelism on CPU.
+    
+    .. warning::
+        To ensure that the correct number of threads is used, set_num_threads
+        must be called before running eager, JIT or autograd code.
+    """
+    ...
+def sgn(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sgn(input, *, out=None) -> Tensor
+    
+    This function is an extension of torch.sign() to complex tensors.
+    It computes a new tensor whose elements have
+    the same angles as the corresponding elements of :attr:`input` and
+    absolute values (i.e. magnitudes) of one for complex tensors and
+    is equivalent to torch.sign() for non-complex tensors.
+    
+    .. math::
+        \text{out}_{i} = \begin{cases}
+                        0 & |\text{input}_i| == 0 \\
+                        \frac{\text{input}_i}{|\text{input}_i|} & \text{otherwise}
+                        \end{cases}
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+      out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.tensor([3+4j, 7-24j, 0, 1+2j])
+        >>> t.sgn()
+        tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j])
+    """
+    ...
+def sigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sigmoid(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.expit`.
+    """
+    ...
+def sigmoid_(input: Tensor) -> Tensor: ...
+def sign(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sign(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the signs of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \operatorname{sgn}(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([0.7, -1.2, 0., 2.3])
+        >>> a
+        tensor([ 0.7000, -1.2000,  0.0000,  2.3000])
+        >>> torch.sign(a)
+        tensor([ 1., -1.,  0.,  1.])
+    """
+    ...
+def signbit(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    signbit(input, *, out=None) -> Tensor
+    
+    Tests if each element of :attr:`input` has its sign bit set or not.
+    
+    Args:
+      input (Tensor): the input tensor.
+    
+    Keyword args:
+      out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([0.7, -1.2, 0., 2.3])
+        >>> torch.signbit(a)
+        tensor([ False, True,  False,  False])
+        >>> a = torch.tensor([-0.0, 0.0])
+        >>> torch.signbit(a)
+        tensor([ True,  False])
+    
+    .. note::
+        signbit handles signed zeros, so negative zero (-0) returns True.
+    """
+    ...
+def sin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sin(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the sine of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \sin(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-0.5461,  0.1347, -2.7266, -0.2746])
+        >>> torch.sin(a)
+        tensor([-0.5194,  0.1343, -0.4032, -0.2711])
+    """
+    ...
+def sin_(input: Tensor) -> Tensor: ...
+def sinc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sinc(input, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.sinc`.
+    """
+    ...
+def sinc_(input: Tensor) -> Tensor: ...
+def sinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sinh(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the hyperbolic sine of the elements of
+    :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \sinh(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.5380, -0.8632, -0.1265,  0.9399])
+        >>> torch.sinh(a)
+        tensor([ 0.5644, -0.9744, -0.1268,  1.0845])
+    
+    .. note::
+       When :attr:`input` is on the CPU, the implementation of :func:`torch.sinh` may use
+       the Sleef library, which rounds very large results to infinity or negative
+       infinity. See the `Sleef documentation <https://sleef.org/purec.xhtml>`_ for details.
+    """
+    ...
+def sinh_(input: Tensor) -> Tensor: ...  # NOTE(review): no docstring here; the trailing underscore suggests the in-place form of sinh - confirm
+def slice_copy(input: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.slice`, but the output tensor
+    is freshly created instead of aliasing the input.
+    """
+    ...
+def slice_inverse(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1) -> Tensor: ...  # NOTE(review): no docstring is provided for this stub
+def slice_scatter(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    slice_scatter(input, src, dim=0, start=None, end=None, step=1, *, out=None) -> Tensor
+    
+    Embeds the values of the :attr:`src` tensor into :attr:`input` at the given
+    dimension.
+    This function returns a tensor with fresh storage; it does not create a view.
+    
+    Args:
+        input (Tensor): the input tensor.
+        src (Tensor): the tensor to embed into :attr:`input`.
+        dim (int): the dimension to insert the slice into.
+        start (Optional[int]): the start index of where to insert the slice.
+        end (Optional[int]): the end index of where to insert the slice.
+        step (int): the step between the indices at which the slice is inserted.
+        out (Tensor, optional): the output tensor (keyword-only).
+    
+    Example::
+    
+        >>> a = torch.zeros(8, 8)
+        >>> b = torch.ones(2, 8)
+        >>> a.slice_scatter(b, start=6)
+        tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [1., 1., 1., 1., 1., 1., 1., 1.],
+                [1., 1., 1., 1., 1., 1., 1., 1.]])
+    
+        >>> b = torch.ones(8, 2)
+        >>> a.slice_scatter(b, dim=1, start=2, end=6, step=2)
+        tensor([[0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.]])
+    """
+    ...
+def slogdet(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.slogdet: 
+    r"""
+    slogdet(input, *, out=None) -> (Tensor, Tensor)
+    
+    Alias for :func:`torch.linalg.slogdet`.
+    """
+    ...
+def smm(input: Tensor, mat2: Tensor) -> Tensor: 
+    r"""
+    smm(input, mat2) -> Tensor
+    
+    Performs a matrix multiplication of the sparse matrix :attr:`input`
+    with the dense matrix :attr:`mat2`.
+    
+    Args:
+        input (Tensor): a sparse matrix to be matrix multiplied
+        mat2 (Tensor): a dense matrix to be matrix multiplied
+    """
+    ...
+@overload
+def softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    softmax(input, dim, dtype=None, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.nn.functional.softmax` (overload whose ``dim`` is an ``int``).
+    """
+    ...
+@overload
+def softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: 
+    r"""
+    softmax(input, dim, *, dtype=None) -> Tensor
+    
+    Alias for :func:`torch.nn.functional.softmax` (overload whose ``dim`` is a ``str``, ``...`` or ``None``).
+    """
+    ...
+@overload
+def sort(input: Tensor, *, stable: Optional[_bool], dim: _int = -1, descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: 
+    r"""
+    sort(input, dim=-1, descending=False, *, stable=False, out=None) -> (Tensor, LongTensor)
+    
+    Sorts the elements of the :attr:`input` tensor along a given dimension
+    in ascending order by value.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`descending` is ``True`` then the elements are sorted in descending
+    order by value.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements.
+    
+    A namedtuple of (values, indices) is returned, where the `values` are the
+    sorted values and `indices` are the indices of the elements in the original
+    `input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+    
+    Keyword args:
+        stable (bool, optional): makes the sorting routine stable, which guarantees that the order
+            of equivalent elements is preserved.
+        out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can
+            be optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> sorted, indices = torch.sort(x)
+        >>> sorted
+        tensor([[-0.2162,  0.0608,  0.6719,  2.3332],
+                [-0.5793,  0.0061,  0.6058,  0.9497],
+                [-0.5071,  0.3343,  0.9553,  1.0960]])
+        >>> indices
+        tensor([[ 1,  0,  2,  3],
+                [ 3,  1,  0,  2],
+                [ 0,  3,  1,  2]])
+    
+        >>> sorted, indices = torch.sort(x, 0)
+        >>> sorted
+        tensor([[-0.5071, -0.2162,  0.6719, -0.5793],
+                [ 0.0608,  0.0061,  0.9497,  0.3343],
+                [ 0.6058,  0.9553,  1.0960,  2.3332]])
+        >>> indices
+        tensor([[ 2,  0,  0,  1],
+                [ 0,  1,  1,  2],
+                [ 1,  2,  2,  0]])
+        >>> x = torch.tensor([0, 1] * 9)
+        >>> x.sort()
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 2, 16,  4,  6, 14,  8,  0, 10, 12,  9, 17, 15, 13, 11,  7,  5,  3,  1]))
+        >>> x.sort(stable=True)
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 0,  2,  4,  6,  8, 10, 12, 14, 16,  1,  3,  5,  7,  9, 11, 13, 15, 17]))
+    """
+    ...
+@overload
+def sort(input: Tensor, dim: _int = -1, descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: 
+    r"""
+    sort(input, dim=-1, descending=False, *, stable=False, out=None) -> (Tensor, LongTensor)
+    
+    Sorts the elements of the :attr:`input` tensor along a given dimension
+    in ascending order by value.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`descending` is ``True`` then the elements are sorted in descending
+    order by value.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements.
+    
+    A namedtuple of (values, indices) is returned, where the `values` are the
+    sorted values and `indices` are the indices of the elements in the original
+    `input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+    
+    Keyword args:
+        stable (bool, optional): makes the sorting routine stable, which guarantees that the order
+            of equivalent elements is preserved.
+        out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can
+            be optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> sorted, indices = torch.sort(x)
+        >>> sorted
+        tensor([[-0.2162,  0.0608,  0.6719,  2.3332],
+                [-0.5793,  0.0061,  0.6058,  0.9497],
+                [-0.5071,  0.3343,  0.9553,  1.0960]])
+        >>> indices
+        tensor([[ 1,  0,  2,  3],
+                [ 3,  1,  0,  2],
+                [ 0,  3,  1,  2]])
+    
+        >>> sorted, indices = torch.sort(x, 0)
+        >>> sorted
+        tensor([[-0.5071, -0.2162,  0.6719, -0.5793],
+                [ 0.0608,  0.0061,  0.9497,  0.3343],
+                [ 0.6058,  0.9553,  1.0960,  2.3332]])
+        >>> indices
+        tensor([[ 2,  0,  0,  1],
+                [ 0,  1,  1,  2],
+                [ 1,  2,  2,  0]])
+        >>> x = torch.tensor([0, 1] * 9)
+        >>> x.sort()
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 2, 16,  4,  6, 14,  8,  0, 10, 12,  9, 17, 15, 13, 11,  7,  5,  3,  1]))
+        >>> x.sort(stable=True)
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 0,  2,  4,  6,  8, 10, 12, 14, 16,  1,  3,  5,  7,  9, 11, 13, 15, 17]))
+    """
+    ...
+@overload
+def sort(input: Tensor, *, stable: Optional[_bool], dim: Union[str, ellipsis, None], descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: 
+    r"""
+    sort(input, dim=-1, descending=False, *, stable=False, out=None) -> (Tensor, LongTensor)
+    
+    Sorts the elements of the :attr:`input` tensor along a given dimension
+    in ascending order by value.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`descending` is ``True`` then the elements are sorted in descending
+    order by value.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements.
+    
+    A namedtuple of (values, indices) is returned, where the `values` are the
+    sorted values and `indices` are the indices of the elements in the original
+    `input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+    
+    Keyword args:
+        stable (bool, optional): makes the sorting routine stable, which guarantees that the order
+            of equivalent elements is preserved.
+        out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can
+            be optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> sorted, indices = torch.sort(x)
+        >>> sorted
+        tensor([[-0.2162,  0.0608,  0.6719,  2.3332],
+                [-0.5793,  0.0061,  0.6058,  0.9497],
+                [-0.5071,  0.3343,  0.9553,  1.0960]])
+        >>> indices
+        tensor([[ 1,  0,  2,  3],
+                [ 3,  1,  0,  2],
+                [ 0,  3,  1,  2]])
+    
+        >>> sorted, indices = torch.sort(x, 0)
+        >>> sorted
+        tensor([[-0.5071, -0.2162,  0.6719, -0.5793],
+                [ 0.0608,  0.0061,  0.9497,  0.3343],
+                [ 0.6058,  0.9553,  1.0960,  2.3332]])
+        >>> indices
+        tensor([[ 2,  0,  0,  1],
+                [ 0,  1,  1,  2],
+                [ 1,  2,  2,  0]])
+        >>> x = torch.tensor([0, 1] * 9)
+        >>> x.sort()
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 2, 16,  4,  6, 14,  8,  0, 10, 12,  9, 17, 15, 13, 11,  7,  5,  3,  1]))
+        >>> x.sort(stable=True)
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 0,  2,  4,  6,  8, 10, 12, 14, 16,  1,  3,  5,  7,  9, 11, 13, 15, 17]))
+    """
+    ...
+@overload
+def sort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: 
+    r"""
+    sort(input, dim=-1, descending=False, *, stable=False, out=None) -> (Tensor, LongTensor)
+    
+    Sorts the elements of the :attr:`input` tensor along a given dimension
+    in ascending order by value.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`descending` is ``True`` then the elements are sorted in descending
+    order by value.
+    
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements.
+    
+    A namedtuple of (values, indices) is returned, where the `values` are the
+    sorted values and `indices` are the indices of the elements in the original
+    `input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int, optional): the dimension to sort along
+        descending (bool, optional): controls the sorting order (ascending or descending)
+    
+    Keyword args:
+        stable (bool, optional): makes the sorting routine stable, which guarantees that the order
+            of equivalent elements is preserved.
+        out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can
+            be optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.randn(3, 4)
+        >>> sorted, indices = torch.sort(x)
+        >>> sorted
+        tensor([[-0.2162,  0.0608,  0.6719,  2.3332],
+                [-0.5793,  0.0061,  0.6058,  0.9497],
+                [-0.5071,  0.3343,  0.9553,  1.0960]])
+        >>> indices
+        tensor([[ 1,  0,  2,  3],
+                [ 3,  1,  0,  2],
+                [ 0,  3,  1,  2]])
+    
+        >>> sorted, indices = torch.sort(x, 0)
+        >>> sorted
+        tensor([[-0.5071, -0.2162,  0.6719, -0.5793],
+                [ 0.0608,  0.0061,  0.9497,  0.3343],
+                [ 0.6058,  0.9553,  1.0960,  2.3332]])
+        >>> indices
+        tensor([[ 2,  0,  0,  1],
+                [ 0,  1,  1,  2],
+                [ 1,  2,  2,  0]])
+        >>> x = torch.tensor([0, 1] * 9)
+        >>> x.sort()
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 2, 16,  4,  6, 14,  8,  0, 10, 12,  9, 17, 15, 13, 11,  7,  5,  3,  1]))
+        >>> x.sort(stable=True)
+        torch.return_types.sort(
+            values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
+            indices=tensor([ 0,  2,  4,  6,  8, 10, 12, 14, 16,  1,  3,  5,  7,  9, 11, 13, 15, 17]))
+    """
+    ...
+def sparse_bsc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: 
+    r"""
+    sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor
+    
+    Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse
+    Column) <sparse-bsc-docs>` with specified 2-dimensional blocks at the
+    given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix
+    multiplication operations in BSC format are typically faster than that
+    for sparse tensors in COO format. Make sure you have a look at :ref:`the
+    note on the data type of the indices <sparse-bsc-docs>`.
+    
+    .. note::
+    
+       If the ``device`` argument is not specified the device of the given
+       :attr:`values` and indices tensor(s) must match. If, however, the
+       argument is specified the input Tensors will be converted to the
+       given device and in turn determine the device of the constructed
+       sparse tensor.
+    
+    Args:
+        ccol_indices (array_like): (B+1)-dimensional array of size
+            ``(*batchsize, ncolblocks + 1)``. The last element of each
+            batch is the number of non-zeros. This tensor encodes the
+            index in values and row_indices depending on where the given
+            column starts. Each successive number in the tensor subtracted
+            by the number before it denotes the number of elements in a
+            given column.
+        row_indices (array_like): Row block co-ordinates of each block in
+            values. (B+1)-dimensional tensor with the same length
+            as values.
+        values (array_like): Initial blocks for the tensor. Can be a list,
+            tuple, NumPy ``ndarray``, and other types that
+            represent a (1 + 2 + K)-dimensional tensor where ``K`` is the
+            number of dense dimensions.
+        size (list, tuple, :class:`torch.Size`, optional): Size of the
+            sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols *
+            blocksize[1], *densesize)`` If not provided, the size will be
+            inferred as the minimum size big enough to hold all non-zero
+            blocks.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor.  Default: if None, infers data type from
+            :attr:`values`.
+        device (:class:`torch.device`, optional): the desired device of
+            returned tensor.  Default: if None, uses the current device
+            for the default tensor type (see
+            :func:`torch.set_default_device`). :attr:`device` will be
+            the CPU for CPU tensor types and the current CUDA device for
+            CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+    
+    Example::
+        >>> ccol_indices = [0, 1, 2]
+        >>> row_indices = [0, 1]
+        >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+        >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64),
+        ...                         torch.tensor(row_indices, dtype=torch.int64),
+        ...                         torch.tensor(values), dtype=torch.double)
+        tensor(ccol_indices=tensor([0, 1, 2]),
+               row_indices=tensor([0, 1]),
+               values=tensor([[[1., 2.],
+                               [3., 4.]],
+                              [[5., 6.],
+                               [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64,
+               layout=torch.sparse_bsc)
+    """
+    ...
+def sparse_bsr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: 
+    r"""
+    sparse_bsr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor
+    
+    Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)
+    <sparse-bsr-docs>` with specified 2-dimensional blocks at the given
+    :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix
+    multiplication operations in BSR format are typically faster than that
+    for sparse tensors in COO format. Make sure you have a look at :ref:`the
+    note on the data type of the indices <sparse-bsr-docs>`.
+    
+    .. note::
+    
+       If the ``device`` argument is not specified the device of the given
+       :attr:`values` and indices tensor(s) must match. If, however, the
+       argument is specified the input Tensors will be converted to the
+       given device and in turn determine the device of the constructed
+       sparse tensor.
+    
+    Args:
+        crow_indices (array_like): (B+1)-dimensional array of size
+            ``(*batchsize, nrowblocks + 1)``.  The last element of each
+            batch is the number of non-zeros. This tensor encodes the
+            block index in values and col_indices depending on where the
+            given row block starts. Each successive number in the tensor
+            subtracted by the number before it denotes the number of
+            blocks in a given row.
+        col_indices (array_like): Column block co-ordinates of each block
+            in values. (B+1)-dimensional tensor with the same length as
+            values.
+        values (array_like): Initial values for the tensor. Can be a list,
+            tuple, NumPy ``ndarray``, scalar, and other types that
+            represent a (1 + 2 + K)-dimensional tensor where ``K`` is the
+            number of dense dimensions.
+        size (list, tuple, :class:`torch.Size`, optional): Size of the
+            sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols *
+            blocksize[1], *densesize)`` where ``blocksize ==
+            values.shape[1:3]``. If not provided, the size will be
+            inferred as the minimum size big enough to hold all non-zero
+            blocks.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor.  Default: if None, infers data type from
+            :attr:`values`.
+        device (:class:`torch.device`, optional): the desired device of
+            returned tensor.  Default: if None, uses the current device
+            for the default tensor type (see
+            :func:`torch.set_default_device`). :attr:`device` will be
+            the CPU for CPU tensor types and the current CUDA device for
+            CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+    
+    Example::
+        >>> crow_indices = [0, 1, 2]
+        >>> col_indices = [0, 1]
+        >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+        >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64),
+        ...                         torch.tensor(col_indices, dtype=torch.int64),
+        ...                         torch.tensor(values), dtype=torch.double)
+        tensor(crow_indices=tensor([0, 1, 2]),
+               col_indices=tensor([0, 1]),
+               values=tensor([[[1., 2.],
+                               [3., 4.]],
+                              [[5., 6.],
+                               [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64,
+               layout=torch.sparse_bsr)
+    """
+    ...
+def sparse_compressed_tensor(compressed_indices: Union[Tensor, List], plain_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: 
+    r"""
+    sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, *, dtype=None, layout=None, device=None, requires_grad=False, check_invariants=None) -> Tensor
+    
+    Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR,
+    CSC, BSR, or BSC - <sparse-compressed-docs>` with specified values at
+    the given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse
+    matrix multiplication operations in Compressed Sparse format are
+    typically faster than that for sparse tensors in COO format. Make sure
+    you have a look at :ref:`the note on the data type of the indices
+    <sparse-compressed-docs>`.
+    
+    .. note::
+    
+       If the ``device`` argument is not specified the device of the given
+       :attr:`values` and indices tensor(s) must match. If, however, the
+       argument is specified the input Tensors will be converted to the
+       given device and in turn determine the device of the constructed
+       sparse tensor.
+    
+    Args:
+        compressed_indices (array_like): (B+1)-dimensional array of size
+            ``(*batchsize, compressed_dim_size + 1)``.  The last element of
+            each batch is the number of non-zero elements or blocks. This
+            tensor encodes the index in ``values`` and ``plain_indices``
+            depending on where the given compressed dimension (row or
+            column) starts. Each successive number in the tensor
+            subtracted by the number before it denotes the number of
+            elements or blocks in a given compressed dimension.
+        plain_indices (array_like): Plain dimension (column or row)
+            co-ordinates of each element or block in values. (B+1)-dimensional
+            tensor with the same length as values.
+    
+        values (array_like): Initial values for the tensor. Can be a list,
+            tuple, NumPy ``ndarray``, scalar, and other types that
+            represent a (1+K)-dimensional (for CSR and CSC layouts) or
+            (1+2+K)-dimensional tensor (for BSR and BSC layouts) where
+            ``K`` is the number of dense dimensions.
+        size (list, tuple, :class:`torch.Size`, optional): Size of the
+            sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols *
+            blocksize[1], *densesize)`` where ``blocksize[0] ==
+            blocksize[1] == 1`` for CSR and CSC formats. If not provided,
+            the size will be inferred as the minimum size big enough to
+            hold all non-zero elements or blocks.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor.  Default: if None, infers data type from
+            :attr:`values`.
+        layout (:class:`torch.layout`, required): the desired layout of
+            returned tensor: :attr:`torch.sparse_csr`,
+            :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or
+            :attr:`torch.sparse_bsc`.
+        device (:class:`torch.device`, optional): the desired device of
+            returned tensor.  Default: if None, uses the current device
+            for the default tensor type (see
+            :func:`torch.set_default_device`). :attr:`device` will be
+            the CPU for CPU tensor types and the current CUDA device for
+            CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+    
+    Example::
+        >>> compressed_indices = [0, 2, 4]
+        >>> plain_indices = [0, 1, 0, 1]
+        >>> values = [1, 2, 3, 4]
+        >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64),
+        ...                                torch.tensor(plain_indices, dtype=torch.int64),
+        ...                                torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr)
+        tensor(crow_indices=tensor([0, 2, 4]),
+               col_indices=tensor([0, 1, 0, 1]),
+               values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4,
+               dtype=torch.float64, layout=torch.sparse_csr)
+    """
+    ...
+def sparse_coo_tensor(indices: Tensor, values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None, is_coalesced: Optional[_bool] = None) -> Tensor: 
+    r"""
+    sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None, is_coalesced=None) -> Tensor
+    
+    Constructs a :ref:`sparse tensor in COO(rdinate) format
+    <sparse-coo-docs>` with specified values at the given
+    :attr:`indices`.
+    
+    .. note::
+    
+       This function returns an :ref:`uncoalesced tensor
+       <sparse-uncoalesced-coo-docs>` when :attr:`is_coalesced` is
+       unspecified or ``None``.
+    
+    .. note::
+    
+       If the ``device`` argument is not specified the device of the given
+       :attr:`values` and indices tensor(s) must match. If, however, the
+       argument is specified the input Tensors will be converted to the
+       given device and in turn determine the device of the constructed
+       sparse tensor.
+    
+    Args:
+        indices (array_like): Initial data for the tensor. Can be a list, tuple,
+            NumPy ``ndarray``, scalar, and other types. Will be cast to a :class:`torch.LongTensor`
+            internally. The indices are the coordinates of the non-zero values in the matrix, and thus
+            should be two-dimensional where the first dimension is the number of tensor dimensions and
+            the second dimension is the number of non-zero values.
+        values (array_like): Initial values for the tensor. Can be a list, tuple,
+            NumPy ``ndarray``, scalar, and other types.
+        size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not
+            provided the size will be inferred as the minimum size big enough to hold all non-zero
+            elements.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if None, infers data type from :attr:`values`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if None, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+        is_coalesced (bool, optional): When ``True``, the caller is
+            responsible for providing tensor indices that correspond to a
+            coalesced tensor.  If the :attr:`check_invariants` flag is
+            False, no error will be raised if the prerequisites are not
+            met and this will lead to silently incorrect results. To force
+            coalescing please use :meth:`coalesce` on the resulting
+            Tensor.
+            Default: None: except for trivial cases (e.g. nnz < 2) the
+            resulting Tensor has is_coalesced set to ``False``.
+    
+    Example::
+    
+        >>> i = torch.tensor([[0, 1, 1],
+        ...                   [2, 0, 2]])
+        >>> v = torch.tensor([3, 4, 5], dtype=torch.float32)
+        >>> torch.sparse_coo_tensor(i, v, [2, 4])
+        tensor(indices=tensor([[0, 1, 1],
+                               [2, 0, 2]]),
+               values=tensor([3., 4., 5.]),
+               size=(2, 4), nnz=3, layout=torch.sparse_coo)
+    
+        >>> torch.sparse_coo_tensor(i, v)  # Shape inference
+        tensor(indices=tensor([[0, 1, 1],
+                               [2, 0, 2]]),
+               values=tensor([3., 4., 5.]),
+               size=(2, 3), nnz=3, layout=torch.sparse_coo)
+    
+        >>> torch.sparse_coo_tensor(i, v, [2, 4],
+        ...                         dtype=torch.float64,
+        ...                         device=torch.device('cuda:0'))
+        tensor(indices=tensor([[0, 1, 1],
+                               [2, 0, 2]]),
+               values=tensor([3., 4., 5.]),
+               device='cuda:0', size=(2, 4), nnz=3, dtype=torch.float64,
+               layout=torch.sparse_coo)
+    
+        # Create an empty sparse tensor with the following invariants:
+        #   1. sparse_dim + dense_dim = len(SparseTensor.shape)
+        #   2. SparseTensor._indices().shape = (sparse_dim, nnz)
+        #   3. SparseTensor._values().shape = (nnz, SparseTensor.shape[sparse_dim:])
+        #
+        # For instance, to create an empty sparse tensor with nnz = 0, dense_dim = 0 and
+        # sparse_dim = 1 (hence indices is a 2D tensor of shape = (1, 0))
+        >>> torch.sparse_coo_tensor(torch.empty([1, 0]), [], [1])
+        tensor(indices=tensor([], size=(1, 0)),
+               values=tensor([], size=(0,)),
+               size=(1,), nnz=0, layout=torch.sparse_coo)
+    
+        # and to create an empty sparse tensor with nnz = 0, dense_dim = 1 and
+        # sparse_dim = 1
+        >>> torch.sparse_coo_tensor(torch.empty([1, 0]), torch.empty([0, 2]), [1, 2])
+        tensor(indices=tensor([], size=(1, 0)),
+               values=tensor([], size=(0, 2)),
+               size=(1, 2), nnz=0, layout=torch.sparse_coo)
+    
+    .. _torch.sparse: https://pytorch.org/docs/stable/sparse.html
+    """
+    ...
+def sparse_csc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: 
+    r"""
+    sparse_csc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor
+    
+    Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column)
+    <sparse-csc-docs>` with specified values at the given
+    :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix
+    multiplication operations in CSC format are typically faster than those
+    for sparse tensors in COO format. Make sure you have a look at :ref:`the
+    note on the data type of the indices <sparse-csc-docs>`.
+    
+    .. note::
+    
+       If the ``device`` argument is not specified the device of the given
+       :attr:`values` and indices tensor(s) must match. If, however, the
+       argument is specified the input Tensors will be converted to the
+       given device and in turn determine the device of the constructed
+       sparse tensor.
+    
+    Args:
+        ccol_indices (array_like): (B+1)-dimensional array of size
+            ``(*batchsize, ncols + 1)``.  The last element of each batch
+            is the number of non-zeros. This tensor encodes the index in
+            values and row_indices depending on where the given column
+            starts. Each successive number in the tensor subtracted by the
+            number before it denotes the number of elements in a given
+            column.
+        row_indices (array_like): Row co-ordinates of each element in
+            values. (B+1)-dimensional tensor with the same length as
+            values.
+        values (array_like): Initial values for the tensor. Can be a list,
+            tuple, NumPy ``ndarray``, scalar, and other types that
+            represents a (1+K)-dimensional tensor where ``K`` is the number
+            of dense dimensions.
+        size (list, tuple, :class:`torch.Size`, optional): Size of the
+            sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If
+            not provided, the size will be inferred as the minimum size
+            big enough to hold all non-zero elements.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor.  Default: if None, infers data type from
+            :attr:`values`.
+        device (:class:`torch.device`, optional): the desired device of
+            returned tensor.  Default: if None, uses the current device
+            for the default tensor type (see
+            :func:`torch.set_default_device`). :attr:`device` will be
+            the CPU for CPU tensor types and the current CUDA device for
+            CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+    
+    Example::
+        >>> ccol_indices = [0, 2, 4]
+        >>> row_indices = [0, 1, 0, 1]
+        >>> values = [1, 2, 3, 4]
+        >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64),
+        ...                         torch.tensor(row_indices, dtype=torch.int64),
+        ...                         torch.tensor(values), dtype=torch.double)
+        tensor(ccol_indices=tensor([0, 2, 4]),
+               row_indices=tensor([0, 1, 0, 1]),
+               values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4,
+               dtype=torch.float64, layout=torch.sparse_csc)
+    """
+    ...
+def sparse_csr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: 
+    r"""
+    sparse_csr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor
+    
+    Constructs a :ref:`sparse tensor in CSR (Compressed Sparse Row) <sparse-csr-docs>` with specified
+    values at the given :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix multiplication operations
+    in CSR format are typically faster than those for sparse tensors in COO format. Make sure you have a look
+    at :ref:`the note on the data type of the indices <sparse-csr-docs>`.
+    
+    .. note::
+    
+       If the ``device`` argument is not specified the device of the given
+       :attr:`values` and indices tensor(s) must match. If, however, the
+       argument is specified the input Tensors will be converted to the
+       given device and in turn determine the device of the constructed
+       sparse tensor.
+    
+    Args:
+        crow_indices (array_like): (B+1)-dimensional array of size
+            ``(*batchsize, nrows + 1)``.  The last element of each batch
+            is the number of non-zeros. This tensor encodes the index in
+            values and col_indices depending on where the given row
+            starts. Each successive number in the tensor subtracted by the
+            number before it denotes the number of elements in a given
+            row.
+        col_indices (array_like): Column co-ordinates of each element in
+            values. (B+1)-dimensional tensor with the same length
+            as values.
+        values (array_like): Initial values for the tensor. Can be a list,
+            tuple, NumPy ``ndarray``, scalar, and other types that
+            represents a (1+K)-dimensional tensor where ``K`` is the number
+            of dense dimensions.
+        size (list, tuple, :class:`torch.Size`, optional): Size of the
+            sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If
+            not provided, the size will be inferred as the minimum size
+            big enough to hold all non-zero elements.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor.  Default: if None, infers data type from
+            :attr:`values`.
+        device (:class:`torch.device`, optional): the desired device of
+            returned tensor.  Default: if None, uses the current device
+            for the default tensor type (see
+            :func:`torch.set_default_device`). :attr:`device` will be
+            the CPU for CPU tensor types and the current CUDA device for
+            CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+    
+    Example::
+        >>> crow_indices = [0, 2, 4]
+        >>> col_indices = [0, 1, 0, 1]
+        >>> values = [1, 2, 3, 4]
+        >>> torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64),
+        ...                         torch.tensor(col_indices, dtype=torch.int64),
+        ...                         torch.tensor(values), dtype=torch.double)
+        tensor(crow_indices=tensor([0, 2, 4]),
+               col_indices=tensor([0, 1, 0, 1]),
+               values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4,
+               dtype=torch.float64, layout=torch.sparse_csr)
+    """
+    ...
+def split_copy(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: 
+    r"""
+    Performs the same operation as :func:`torch.split`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ...
+def split_with_sizes_copy(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: 
+    r"""
+    Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def spmm(input: Tensor, mat2: Tensor) -> Tensor: ...
+def sqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sqrt(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the square-root of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \sqrt{\text{input}_{i}}
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-2.0755,  1.0226,  0.0831,  0.4806])
+        >>> torch.sqrt(a)
+        tensor([    nan,  1.0112,  0.2883,  0.6933])
+    """
+    ...
+def sqrt_(input: Tensor) -> Tensor: ...
+def square(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    square(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the square of the elements of :attr:`input`.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-2.0755,  1.0226,  0.0831,  0.4806])
+        >>> torch.square(a)
+        tensor([ 4.3077,  1.0457,  0.0069,  0.2310])
+    """
+    ...
+def square_(input: Tensor) -> Tensor: ...
+@overload
+def squeeze(input: Tensor) -> Tensor: 
+    r"""
+    squeeze(input, dim=None) -> Tensor
+    
+    Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed.
+    
+    For example, if `input` is of shape:
+    :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()`
+    will be of shape: :math:`(A \times B \times C \times D)`.
+    
+    When :attr:`dim` is given, a squeeze operation is done only in the given
+    dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`,
+    ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)``
+    will squeeze the tensor to the shape :math:`(A \times B)`.
+    
+    .. note:: The returned tensor shares the storage with the input tensor,
+              so changing the contents of one will change the contents of the other.
+    
+    .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)`
+              will also remove the batch dimension, which can lead to unexpected
+              errors. Consider specifying only the dims you wish to be squeezed.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): if given, the input will be squeezed
+               only in the specified dimensions.
+    
+            .. versionchanged:: 2.0
+               :attr:`dim` now accepts tuples of dimensions.
+    
+    Example::
+    
+        >>> x = torch.zeros(2, 1, 2, 1, 2)
+        >>> x.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x)
+        >>> y.size()
+        torch.Size([2, 2, 2])
+        >>> y = torch.squeeze(x, 0)
+        >>> y.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x, 1)
+        >>> y.size()
+        torch.Size([2, 2, 1, 2])
+        >>> torch.squeeze(x, (1, 2, 3)).size()
+        torch.Size([2, 2, 2])
+    """
+    ...
+@overload
+def squeeze(input: Tensor, dim: _int) -> Tensor: 
+    r"""
+    squeeze(input, dim=None) -> Tensor
+    
+    Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed.
+    
+    For example, if `input` is of shape:
+    :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()`
+    will be of shape: :math:`(A \times B \times C \times D)`.
+    
+    When :attr:`dim` is given, a squeeze operation is done only in the given
+    dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`,
+    ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)``
+    will squeeze the tensor to the shape :math:`(A \times B)`.
+    
+    .. note:: The returned tensor shares the storage with the input tensor,
+              so changing the contents of one will change the contents of the other.
+    
+    .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)`
+              will also remove the batch dimension, which can lead to unexpected
+              errors. Consider specifying only the dims you wish to be squeezed.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): if given, the input will be squeezed
+               only in the specified dimensions.
+    
+            .. versionchanged:: 2.0
+               :attr:`dim` now accepts tuples of dimensions.
+    
+    Example::
+    
+        >>> x = torch.zeros(2, 1, 2, 1, 2)
+        >>> x.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x)
+        >>> y.size()
+        torch.Size([2, 2, 2])
+        >>> y = torch.squeeze(x, 0)
+        >>> y.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x, 1)
+        >>> y.size()
+        torch.Size([2, 2, 1, 2])
+        >>> torch.squeeze(x, (1, 2, 3)).size()
+        torch.Size([2, 2, 2])
+    """
+    ...
+@overload
+def squeeze(input: Tensor, dim: _size) -> Tensor: 
+    r"""
+    squeeze(input, dim=None) -> Tensor
+    
+    Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed.
+    
+    For example, if `input` is of shape:
+    :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()`
+    will be of shape: :math:`(A \times B \times C \times D)`.
+    
+    When :attr:`dim` is given, a squeeze operation is done only in the given
+    dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`,
+    ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)``
+    will squeeze the tensor to the shape :math:`(A \times B)`.
+    
+    .. note:: The returned tensor shares the storage with the input tensor,
+              so changing the contents of one will change the contents of the other.
+    
+    .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)`
+              will also remove the batch dimension, which can lead to unexpected
+              errors. Consider specifying only the dims you wish to be squeezed.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): if given, the input will be squeezed
+               only in the specified dimensions.
+    
+            .. versionchanged:: 2.0
+               :attr:`dim` now accepts tuples of dimensions.
+    
+    Example::
+    
+        >>> x = torch.zeros(2, 1, 2, 1, 2)
+        >>> x.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x)
+        >>> y.size()
+        torch.Size([2, 2, 2])
+        >>> y = torch.squeeze(x, 0)
+        >>> y.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x, 1)
+        >>> y.size()
+        torch.Size([2, 2, 1, 2])
+        >>> torch.squeeze(x, (1, 2, 3)).size()
+        torch.Size([2, 2, 2])
+    """
+    ...
+@overload
+def squeeze(input: Tensor, dim: Union[str, ellipsis, None]) -> Tensor: 
+    r"""
+    squeeze(input, dim=None) -> Tensor
+    
+    Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed.
+    
+    For example, if `input` is of shape:
+    :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()`
+    will be of shape: :math:`(A \times B \times C \times D)`.
+    
+    When :attr:`dim` is given, a squeeze operation is done only in the given
+    dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`,
+    ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)``
+    will squeeze the tensor to the shape :math:`(A \times B)`.
+    
+    .. note:: The returned tensor shares the storage with the input tensor,
+              so changing the contents of one will change the contents of the other.
+    
+    .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)`
+              will also remove the batch dimension, which can lead to unexpected
+              errors. Consider specifying only the dims you wish to be squeezed.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): if given, the input will be squeezed
+               only in the specified dimensions.
+    
+            .. versionchanged:: 2.0
+               :attr:`dim` now accepts tuples of dimensions.
+    
+    Example::
+    
+        >>> x = torch.zeros(2, 1, 2, 1, 2)
+        >>> x.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x)
+        >>> y.size()
+        torch.Size([2, 2, 2])
+        >>> y = torch.squeeze(x, 0)
+        >>> y.size()
+        torch.Size([2, 1, 2, 1, 2])
+        >>> y = torch.squeeze(x, 1)
+        >>> y.size()
+        torch.Size([2, 2, 1, 2])
+        >>> torch.squeeze(x, (1, 2, 3)).size()
+        torch.Size([2, 2, 2])
+    """
+    ...
+@overload
+def squeeze_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.squeeze`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def squeeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.squeeze`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def squeeze_copy(input: Tensor, dim: _size, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.squeeze`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def sspaddmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: 
+    r"""
+    sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor
+    :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result.
+    
+    Note: This function is equivalent to :func:`torch.addmm`, except
+    :attr:`input` and :attr:`mat1` are sparse.
+    
+    Args:
+        input (Tensor): a sparse matrix to be added
+        mat1 (Tensor): a sparse matrix to be matrix multiplied
+        mat2 (Tensor): a dense matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    """
+    ...
+@overload
+def sspaddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor
+    :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result.
+    
+    Note: This function is equivalent to :func:`torch.addmm`, except
+    :attr:`input` and :attr:`mat1` are sparse.
+    
+    Args:
+        input (Tensor): a sparse matrix to be added
+        mat1 (Tensor): a sparse matrix to be matrix multiplied
+        mat2 (Tensor): a dense matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    """
+    ...
+@overload
+def sspaddmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: 
+    r"""
+    sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor
+    
+    Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor
+    :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result.
+    
+    Note: This function is equivalent to :func:`torch.addmm`, except
+    :attr:`input` and :attr:`mat1` are sparse.
+    
+    Args:
+        input (Tensor): a sparse matrix to be added
+        mat1 (Tensor): a sparse matrix to be matrix multiplied
+        mat2 (Tensor): a dense matrix to be matrix multiplied
+    
+    Keyword args:
+        beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`)
+        alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`)
+        out (Tensor, optional): the output tensor.
+    """
+    ...
+def stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    stack(tensors, dim=0, *, out=None) -> Tensor
+    
+    Concatenates a sequence of tensors along a new dimension.
+    
+    All tensors need to be of the same size.
+    
+    .. seealso::
+    
+        :func:`torch.cat` concatenates the given sequence along an existing dimension.
+    
+    Arguments:
+        tensors (sequence of Tensors): sequence of tensors to concatenate
+        dim (int, optional): dimension to insert. Has to be between 0 and the number
+            of dimensions of concatenated tensors (inclusive). Default: 0
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(2, 3)
+        >>> x
+        tensor([[ 0.3367,  0.1288,  0.2345],
+                [ 0.2303, -1.1229, -0.1863]])
+        >>> y = torch.stack((x, x)) # same as torch.stack((x, x), dim=0)
+        >>> y
+        tensor([[[ 0.3367,  0.1288,  0.2345],
+                 [ 0.2303, -1.1229, -0.1863]],
+    
+                [[ 0.3367,  0.1288,  0.2345],
+                 [ 0.2303, -1.1229, -0.1863]]])
+        >>> y.size()
+        torch.Size([2, 2, 3])
+        >>> torch.stack((x, x), dim=1)
+        tensor([[[ 0.3367,  0.1288,  0.2345],
+                 [ 0.3367,  0.1288,  0.2345]],
+    
+                [[ 0.2303, -1.1229, -0.1863],
+                 [ 0.2303, -1.1229, -0.1863]]])
+        >>> torch.stack((x, x), dim=2)
+        tensor([[[ 0.3367,  0.3367],
+                 [ 0.1288,  0.1288],
+                 [ 0.2345,  0.2345]],
+    
+                [[ 0.2303,  0.2303],
+                 [-1.1229, -1.1229],
+                 [-0.1863, -0.1863]]])
+        >>> torch.stack((x, x), dim=-1)
+        tensor([[[ 0.3367,  0.3367],
+                 [ 0.1288,  0.1288],
+                 [ 0.2345,  0.2345]],
+    
+                [[ 0.2303,  0.2303],
+                 [-1.1229, -1.1229],
+                 [-0.1863, -0.1863]]])
+    """
+    ...
+@overload
+def std(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the standard deviation over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std(a, dim=1, keepdim=True)
+        tensor([[1.0311],
+                [0.7477],
+                [1.2204],
+                [0.9087]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the standard deviation over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std(a, dim=1, keepdim=True)
+        tensor([[1.0311],
+                [0.7477],
+                [1.2204],
+                [0.9087]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std(input: Tensor, unbiased: _bool = True) -> Tensor: 
+    r"""
+    std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the standard deviation over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std(a, dim=1, keepdim=True)
+        tensor([[1.0311],
+                [0.7477],
+                [1.2204],
+                [0.9087]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the standard deviation over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std(a, dim=1, keepdim=True)
+        tensor([[1.0311],
+                [0.7477],
+                [1.2204],
+                [0.9087]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the standard deviation over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int or tuple of ints): the dimension or dimensions to reduce.
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std(a, dim=1, keepdim=True)
+        tensor([[1.0311],
+                [0.7477],
+                [1.2204],
+                [0.9087]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the standard deviation and mean over the dimensions specified by
+    :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or
+    ``None`` to reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (std, mean) containing the standard deviation and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std_mean(a, dim=0, keepdim=True)
+        (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the standard deviation and mean over the dimensions specified by
+    :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or
+    ``None`` to reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (std, mean) containing the standard deviation and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std_mean(a, dim=0, keepdim=True)
+        (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: 
+    r"""
+    std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the standard deviation and mean over the dimensions specified by
+    :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or
+    ``None`` to reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (std, mean) containing the standard deviation and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std_mean(a, dim=0, keepdim=True)
+        (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the standard deviation and mean over the dimensions specified by
+    :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or
+    ``None`` to reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (std, mean) containing the standard deviation and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std_mean(a, dim=0, keepdim=True)
+        (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the standard deviation and mean over the dimensions specified by
+    :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or
+    ``None`` to reduce over all dimensions.
+    
+    The standard deviation (:math:`\sigma`) is calculated as
+    
+    .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (std, mean) containing the standard deviation and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.std_mean(a, dim=0, keepdim=True)
+        (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def sub(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sub(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i - \text{alpha} \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to subtract from :attr:`input`.
+    
+    Keyword args:
+        alpha (Number): the multiplier for :attr:`other`.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor((1, 2))
+        >>> b = torch.tensor((0, 1))
+        >>> torch.sub(a, b, alpha=2)
+        tensor([1, 0])
+    """
+    ...
+@overload
+def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    sub(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i - \text{alpha} \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to subtract from :attr:`input`.
+    
+    Keyword args:
+        alpha (Number): the multiplier for :attr:`other`.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor((1, 2))
+        >>> b = torch.tensor((0, 1))
+        >>> torch.sub(a, b, alpha=2)
+        tensor([1, 0])
+    """
+    ...
+@overload
+def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: 
+    r"""
+    sub(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`.
+    
+    .. math::
+        \text{out}_i = \text{input}_i - \text{alpha} \times \text{other}_i
+    
+    
+    Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+    :ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
+    
+    Args:
+        input (Tensor): the input tensor.
+        other (Tensor or Number): the tensor or number to subtract from :attr:`input`.
+    
+    Keyword args:
+        alpha (Number): the multiplier for :attr:`other`.
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor((1, 2))
+        >>> b = torch.tensor((0, 1))
+        >>> torch.sub(a, b, alpha=2)
+        tensor([1, 0])
+    """
+    ...
+@overload
+def subtract(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    subtract(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Alias for :func:`torch.sub`.
+    """
+    ...
+@overload
+def subtract(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: 
+    r"""
+    subtract(input, other, *, alpha=1, out=None) -> Tensor
+    
+    Alias for :func:`torch.sub`.
+    """
+    ...
+@overload
+def sum(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: 
+    r"""
+    sum(input, *, dtype=None) -> Tensor
+    
+    Returns the sum of all elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.1133, -0.9567,  0.2958]])
+        >>> torch.sum(a)
+        tensor(-0.5475)
+    
+    .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the sum of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.0569, -0.2475,  0.0737, -0.3429],
+                [-0.2993,  0.9138,  0.9337, -1.6864],
+                [ 0.1132,  0.7892, -0.1003,  0.5688],
+                [ 0.3637, -0.9906, -0.4752, -1.5197]])
+        >>> torch.sum(a, 1)
+        tensor([-0.4598, -0.1381,  1.3708, -2.6217])
+        >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6)
+        >>> torch.sum(b, (2, 1))
+        tensor([ 435, 1335, 2235, 3135])
+    """
+    ...
+@overload
+def sum(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sum(input, *, dtype=None) -> Tensor
+    
+    Returns the sum of all elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.1133, -0.9567,  0.2958]])
+        >>> torch.sum(a)
+        tensor(-0.5475)
+    
+    .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the sum of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.0569, -0.2475,  0.0737, -0.3429],
+                [-0.2993,  0.9138,  0.9337, -1.6864],
+                [ 0.1132,  0.7892, -0.1003,  0.5688],
+                [ 0.3637, -0.9906, -0.4752, -1.5197]])
+        >>> torch.sum(a, 1)
+        tensor([-0.4598, -0.1381,  1.3708, -2.6217])
+        >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6)
+        >>> torch.sum(b, (2, 1))
+        tensor([ 435, 1335, 2235, 3135])
+    """
+    ...
+@overload
+def sum(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    sum(input, *, dtype=None) -> Tensor
+    
+    Returns the sum of all elements in the :attr:`input` tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(1, 3)
+        >>> a
+        tensor([[ 0.1133, -0.9567,  0.2958]])
+        >>> torch.sum(a)
+        tensor(-0.5475)
+    
+    .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor
+       :noindex:
+    
+    Returns the sum of each row of the :attr:`input` tensor in the given
+    dimension :attr:`dim`. If :attr:`dim` is a list of dimensions,
+    reduce over all of them.
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            If specified, the input tensor is cast to :attr:`dtype` before the operation
+            is performed. This is useful for preventing data type overflows. Default: None.
+    
+    Example::
+    
+        >>> a = torch.randn(4, 4)
+        >>> a
+        tensor([[ 0.0569, -0.2475,  0.0737, -0.3429],
+                [-0.2993,  0.9138,  0.9337, -1.6864],
+                [ 0.1132,  0.7892, -0.1003,  0.5688],
+                [ 0.3637, -0.9906, -0.4752, -1.5197]])
+        >>> torch.sum(a, 1)
+        tensor([-0.4598, -0.1381,  1.3708, -2.6217])
+        >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6)
+        >>> torch.sum(b, (2, 1))
+        tensor([ 435, 1335, 2235, 3135])
+    """
+    ...
+def svd(input: Tensor, some: _bool = True, compute_uv: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.svd: 
+    r"""
+    svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor)
+    
+    Computes the singular value decomposition of either a matrix or batch of
+    matrices :attr:`input`. The singular value decomposition is represented as a
+    namedtuple `(U, S, V)`, such that :attr:`input` :math:`= U \text{diag}(S) V^{\text{H}}`,
+    where :math:`V^{\text{H}}` is the transpose of `V` for real inputs,
+    and the conjugate transpose of `V` for complex inputs.
+    If :attr:`input` is a batch of matrices, then `U`, `S`, and `V` are also
+    batched with the same batch dimensions as :attr:`input`.
+    
+    If :attr:`some` is `True` (default), the method returns the reduced singular
+    value decomposition. In this case, if the last two dimensions of :attr:`input` are
+    `m` and `n`, then the returned `U` and `V` matrices will contain only
+    `min(n, m)` orthonormal columns.
+    
+    If :attr:`compute_uv` is `False`, the returned `U` and `V` will be
+    zero-filled matrices of shape `(m, m)` and `(n, n)`
+    respectively, and the same device as :attr:`input`. The argument :attr:`some`
+    has no effect when :attr:`compute_uv` is `False`.
+    
+    Supports :attr:`input` of float, double, cfloat and cdouble data types.
+    The dtypes of `U` and `V` are the same as :attr:`input`'s. `S` will
+    always be real-valued, even if :attr:`input` is complex.
+    
+    .. warning::
+    
+        :func:`torch.svd` is deprecated in favor of :func:`torch.linalg.svd`
+        and will be removed in a future PyTorch release.
+    
+        ``U, S, V = torch.svd(A, some=some, compute_uv=True)`` (default) should be replaced with
+    
+        .. code:: python
+    
+            U, S, Vh = torch.linalg.svd(A, full_matrices=not some)
+            V = Vh.mH
+    
+        ``_, S, _ = torch.svd(A, some=some, compute_uv=False)`` should be replaced with
+    
+        .. code:: python
+    
+            S = torch.linalg.svdvals(A)
+    
+    .. note:: Differences with :func:`torch.linalg.svd`:
+    
+                 * :attr:`some` is the opposite of
+                   :func:`torch.linalg.svd`'s :attr:`full_matrices`. Note that
+                   default value for both is `True`, so the default behavior is
+                   effectively the opposite.
+                 * :func:`torch.svd` returns `V`, whereas :func:`torch.linalg.svd` returns
+                   `Vh`, that is, :math:`V^{\text{H}}`.
+                 * If :attr:`compute_uv` is `False`, :func:`torch.svd` returns zero-filled
+                   tensors for `U` and `V`, whereas :func:`torch.linalg.svd` returns
+                   empty tensors.
+    
+    .. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices,
+              then the singular values of each matrix in the batch are returned in descending order.
+    
+    .. note:: The `S` tensor can only be used to compute gradients if :attr:`compute_uv` is `True`.
+    
+    .. note:: When :attr:`some` is `False`, the gradients on `U[..., :, min(m, n):]`
+              and `V[..., :, min(m, n):]` will be ignored in the backward pass, as those vectors
+              can be arbitrary bases of the corresponding subspaces.
+    
+    .. note:: The implementation of :func:`torch.linalg.svd` on CPU uses LAPACK's routine `?gesdd`
+              (a divide-and-conquer algorithm) instead of `?gesvd` for speed. Analogously,
+              on GPU, it uses cuSOLVER's routines `gesvdj` and `gesvdjBatched` on CUDA 10.1.243
+              and later, and MAGMA's routine `gesdd` on earlier versions of CUDA.
+    
+    .. note:: The returned `U` will not be contiguous. The matrix (or batch of matrices) will
+              be represented as a column-major matrix (i.e. Fortran-contiguous).
+    
+    .. warning:: The gradients with respect to `U` and `V` will only be finite when the input has
+                 neither zero nor repeated singular values.
+    
+    .. warning:: If the distance between any two singular values is close to zero, the gradients with respect to
+                 `U` and `V` will be numerically unstable, as they depend on
+                 :math:`\frac{1}{\min_{i \neq j} \sigma_i^2 - \sigma_j^2}`. The same happens when the matrix
+                 has small singular values, as these gradients also depend on :math:`S^{-1}`.
+    
+    .. warning:: For complex-valued :attr:`input` the singular value decomposition is not unique,
+                 as `U` and `V` may be multiplied by an arbitrary phase factor :math:`e^{i \phi}` on every column.
+                 The same happens when :attr:`input` has repeated singular values, where one may multiply
+                 the columns of the spanning subspace in `U` and `V` by a rotation matrix
+                 and `the resulting vectors will span the same subspace`_.
+                 Different platforms, like NumPy, or inputs on different device types,
+                 may produce different `U` and `V` tensors.
+    
+    Args:
+        input (Tensor): the input tensor of size `(*, m, n)` where `*` is zero or more
+                        batch dimensions consisting of `(m, n)` matrices.
+        some (bool, optional): controls whether to compute the reduced or full decomposition, and
+                               consequently, the shape of returned `U` and `V`. Default: `True`.
+        compute_uv (bool, optional): controls whether to compute `U` and `V`. Default: `True`.
+    
+    Keyword args:
+        out (tuple, optional): the output tuple of tensors
+    
+    Example::
+    
+        >>> a = torch.randn(5, 3)
+        >>> a
+        tensor([[ 0.2364, -0.7752,  0.6372],
+                [ 1.7201,  0.7394, -0.0504],
+                [-0.3371, -1.0584,  0.5296],
+                [ 0.3550, -0.4022,  1.5569],
+                [ 0.2445, -0.0158,  1.1414]])
+        >>> u, s, v = torch.svd(a)
+        >>> u
+        tensor([[ 0.4027,  0.0287,  0.5434],
+                [-0.1946,  0.8833,  0.3679],
+                [ 0.4296, -0.2890,  0.5261],
+                [ 0.6604,  0.2717, -0.2618],
+                [ 0.4234,  0.2481, -0.4733]])
+        >>> s
+        tensor([2.3289, 2.0315, 0.7806])
+        >>> v
+        tensor([[-0.0199,  0.8766,  0.4809],
+                [-0.5080,  0.4054, -0.7600],
+                [ 0.8611,  0.2594, -0.4373]])
+        >>> torch.dist(a, torch.mm(torch.mm(u, torch.diag(s)), v.t()))
+        tensor(8.6531e-07)
+        >>> a_big = torch.randn(7, 5, 3)
+        >>> u, s, v = torch.svd(a_big)
+        >>> torch.dist(a_big, torch.matmul(torch.matmul(u, torch.diag_embed(s)), v.mT))
+        tensor(2.6503e-06)
+    
+    .. _the resulting vectors will span the same subspace:
+           https://en.wikipedia.org/wiki/Singular_value_decomposition#Singular_values,_singular_vectors,_and_their_relation_to_the_SVD
+    """
+    ...
+def swapaxes(input: Tensor, axis0: _int, axis1: _int) -> Tensor: 
+    r"""
+    swapaxes(input, axis0, axis1) -> Tensor
+    
+    Alias for :func:`torch.transpose`.
+    
+    This function is equivalent to NumPy's swapaxes function.
+    
+    Examples::
+    
+        >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]])
+        >>> x
+        tensor([[[0, 1],
+                [2, 3]],
+    
+                [[4, 5],
+                [6, 7]]])
+        >>> torch.swapaxes(x, 0, 1)
+        tensor([[[0, 1],
+                [4, 5]],
+    
+                [[2, 3],
+                [6, 7]]])
+        >>> torch.swapaxes(x, 0, 2)
+        tensor([[[0, 4],
+                [2, 6]],
+    
+                [[1, 5],
+                [3, 7]]])
+    """
+    ...
+def swapdims(input: Tensor, dim0: _int, dim1: _int) -> Tensor: 
+    r"""
+    swapdims(input, dim0, dim1) -> Tensor
+    
+    Alias for :func:`torch.transpose`.
+    
+    This function is equivalent to NumPy's swapaxes function.
+    
+    Examples::
+    
+        >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]])
+        >>> x
+        tensor([[[0, 1],
+                [2, 3]],
+    
+                [[4, 5],
+                [6, 7]]])
+        >>> torch.swapdims(x, 0, 1)
+        tensor([[[0, 1],
+                [4, 5]],
+    
+                [[2, 3],
+                [6, 7]]])
+        >>> torch.swapdims(x, 0, 2)
+        tensor([[[0, 4],
+                [2, 6]],
+    
+                [[1, 5],
+                [3, 7]]])
+    """
+    ...
+def sym_constrain_range(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ...
+def sym_constrain_range_for_size(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ...
+def t(input: Tensor) -> Tensor: 
+    r"""
+    t(input) -> Tensor
+    
+    Expects :attr:`input` to be a <= 2-D tensor and transposes dimensions 0
+    and 1.
+    
+    0-D and 1-D tensors are returned as is. When input is a 2-D tensor this
+    is equivalent to ``transpose(input, 0, 1)``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x = torch.randn(())
+        >>> x
+        tensor(0.1995)
+        >>> torch.t(x)
+        tensor(0.1995)
+        >>> x = torch.randn(3)
+        >>> x
+        tensor([ 2.4320, -0.4608,  0.7702])
+        >>> torch.t(x)
+        tensor([ 2.4320, -0.4608,  0.7702])
+        >>> x = torch.randn(2, 3)
+        >>> x
+        tensor([[ 0.4875,  0.9158, -0.5872],
+                [ 0.3938, -0.6929,  0.6932]])
+        >>> torch.t(x)
+        tensor([[ 0.4875,  0.3938],
+                [ 0.9158, -0.6929],
+                [-0.5872,  0.6932]])
+    
+    See also :func:`torch.transpose`.
+    """
+    ...
+def t_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.t`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def take(input: Tensor, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    take(input, index) -> Tensor
+    
+    Returns a new tensor with the elements of :attr:`input` at the given indices.
+    The input tensor is treated as if it were viewed as a 1-D tensor. The result
+    takes the same shape as the indices.
+    
+    Args:
+        input (Tensor): the input tensor.
+        index (LongTensor): the indices into tensor
+    
+    Example::
+    
+        >>> src = torch.tensor([[4, 3, 5],
+        ...                     [6, 7, 8]])
+        >>> torch.take(src, torch.tensor([0, 2, 5]))
+        tensor([ 4,  5,  8])
+    """
+    ...
+def take_along_dim(input: Tensor, indices: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    take_along_dim(input, indices, dim=None, *, out=None) -> Tensor
+    
+    Selects values from :attr:`input` at the 1-dimensional indices from :attr:`indices` along the given :attr:`dim`.
+    
+    If :attr:`dim` is None, the input array is treated as if it has been flattened to 1d.
+    
+    Functions that return indices along a dimension, like :func:`torch.argmax` and :func:`torch.argsort`,
+    are designed to work with this function. See the examples below.
+    
+    .. note::
+        This function is similar to NumPy's `take_along_axis`.
+        See also :func:`torch.gather`.
+    
+    Args:
+        input (Tensor): the input tensor.
+        indices (Tensor): the indices into :attr:`input`. Must have long dtype.
+        dim (int, optional): dimension to select along.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> t = torch.tensor([[10, 30, 20], [60, 40, 50]])
+        >>> max_idx = torch.argmax(t)
+        >>> torch.take_along_dim(t, max_idx)
+        tensor([60])
+        >>> sorted_idx = torch.argsort(t, dim=1)
+        >>> torch.take_along_dim(t, sorted_idx, dim=1)
+        tensor([[10, 20, 30],
+                [40, 50, 60]])
+    """
+    ...
+def tan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    tan(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the tangent of the elements of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \tan(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([-1.2027, -1.7687,  0.4412, -1.3856])
+        >>> torch.tan(a)
+        tensor([-2.5930,  4.9859,  0.4722, -5.3366])
+    """
+    ...
+def tan_(input: Tensor) -> Tensor: ...
+def tanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    tanh(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the hyperbolic tangent of the elements
+    of :attr:`input`.
+    
+    .. math::
+        \text{out}_{i} = \tanh(\text{input}_{i})
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 0.8986, -0.7279,  1.1745,  0.2611])
+        >>> torch.tanh(a)
+        tensor([ 0.7156, -0.6218,  0.8257,  0.2553])
+    """
+    ...
+def tanh_(input: Tensor) -> Tensor: ...
+def tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: 
+    r"""
+    tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor
+    
+    Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`.
+    
+    .. warning::
+    
+        When working with tensors prefer using :func:`torch.Tensor.clone`,
+        :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for
+        readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to
+        ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)``
+        is equivalent to ``t.clone().detach().requires_grad_(True)``.
+    
+    .. seealso::
+    
+        :func:`torch.as_tensor` preserves autograd history and avoids copies where possible.
+        :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array.
+    
+    Args:
+        data (array_like): Initial data for the tensor. Can be a list, tuple,
+            NumPy ``ndarray``, scalar, and other types.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, infers data type from :attr:`data`.
+        device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor
+            then the device of data is used. If None and data is not a tensor then
+            the result tensor is constructed on the current device.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+    
+    
+    Example::
+    
+        >>> torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]])
+        tensor([[ 0.1000,  1.2000],
+                [ 2.2000,  3.1000],
+                [ 4.9000,  5.2000]])
+    
+        >>> torch.tensor([0, 1])  # Type inference on data
+        tensor([ 0,  1])
+    
+        >>> torch.tensor([[0.11111, 0.222222, 0.3333333]],
+        ...              dtype=torch.float64,
+        ...              device=torch.device('cuda:0'))  # creates a double tensor on a CUDA device
+        tensor([[ 0.1111,  0.2222,  0.3333]], dtype=torch.float64, device='cuda:0')
+    
+        >>> torch.tensor(3.14159)  # Create a zero-dimensional (scalar) tensor
+        tensor(3.1416)
+    
+        >>> torch.tensor([])  # Create an empty tensor (of size (0,))
+        tensor([])
+    """
+    ...
+@overload
+def tensor_split(input: Tensor, tensor_indices_or_sections: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    tensor_split(input, indices_or_sections, dim=0) -> Tuple of Tensors
+    
+    Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`,
+    along dimension :attr:`dim` according to the indices or number of sections specified
+    by :attr:`indices_or_sections`. This function is based on NumPy's
+    :func:`numpy.array_split`.
+    
+    Args:
+        input (Tensor): the tensor to split
+        indices_or_sections (Tensor, int or list or tuple of ints):
+            If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor
+            with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`.
+            If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each
+            section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input`
+            is not divisible by ``n``, the first :code:`int(input.size(dim) % n)`
+            sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will
+            have size :code:`int(input.size(dim) / n)`.
+    
+            If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long
+            tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices
+            in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0`
+            would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`.
+    
+            If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional
+            long tensor on the CPU.
+    
+        dim (int, optional): dimension along which to split the tensor. Default: ``0``
+    
+    Example::
+    
+        >>> x = torch.arange(8)
+        >>> torch.tensor_split(x, 3)
+        (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7]))
+    
+        >>> x = torch.arange(7)
+        >>> torch.tensor_split(x, 3)
+        (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6]))
+        >>> torch.tensor_split(x, (1, 6))
+        (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6]))
+    
+        >>> x = torch.arange(14).reshape(2, 7)
+        >>> x
+        tensor([[ 0,  1,  2,  3,  4,  5,  6],
+                [ 7,  8,  9, 10, 11, 12, 13]])
+        >>> torch.tensor_split(x, 3, dim=1)
+        (tensor([[0, 1, 2],
+                [7, 8, 9]]),
+         tensor([[ 3,  4],
+                [10, 11]]),
+         tensor([[ 5,  6],
+                [12, 13]]))
+        >>> torch.tensor_split(x, (1, 6), dim=1)
+        (tensor([[0],
+                [7]]),
+         tensor([[ 1,  2,  3,  4,  5],
+                [ 8,  9, 10, 11, 12]]),
+         tensor([[ 6],
+                [13]]))
+    """
+    ...
+@overload
+def tensor_split(input: Tensor, sections: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    tensor_split(input, indices_or_sections, dim=0) -> Tuple of Tensors
+    
+    Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`,
+    along dimension :attr:`dim` according to the indices or number of sections specified
+    by :attr:`indices_or_sections`. This function is based on NumPy's
+    :func:`numpy.array_split`.
+    
+    Args:
+        input (Tensor): the tensor to split
+        indices_or_sections (Tensor, int or list or tuple of ints):
+            If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor
+            with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`.
+            If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each
+            section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input`
+            is not divisible by ``n``, the first :code:`int(input.size(dim) % n)`
+            sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will
+            have size :code:`int(input.size(dim) / n)`.
+    
+            If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long
+            tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices
+            in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0`
+            would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`.
+    
+            If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional
+            long tensor on the CPU.
+    
+        dim (int, optional): dimension along which to split the tensor. Default: ``0``
+    
+    Example::
+    
+        >>> x = torch.arange(8)
+        >>> torch.tensor_split(x, 3)
+        (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7]))
+    
+        >>> x = torch.arange(7)
+        >>> torch.tensor_split(x, 3)
+        (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6]))
+        >>> torch.tensor_split(x, (1, 6))
+        (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6]))
+    
+        >>> x = torch.arange(14).reshape(2, 7)
+        >>> x
+        tensor([[ 0,  1,  2,  3,  4,  5,  6],
+                [ 7,  8,  9, 10, 11, 12, 13]])
+        >>> torch.tensor_split(x, 3, dim=1)
+        (tensor([[0, 1, 2],
+                [7, 8, 9]]),
+         tensor([[ 3,  4],
+                [10, 11]]),
+         tensor([[ 5,  6],
+                [12, 13]]))
+        >>> torch.tensor_split(x, (1, 6), dim=1)
+        (tensor([[0],
+                [7]]),
+         tensor([[ 1,  2,  3,  4,  5],
+                [ 8,  9, 10, 11, 12]]),
+         tensor([[ 6],
+                [13]]))
+    """
+    ...
+@overload
+def tensor_split(input: Tensor, indices: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    tensor_split(input, indices_or_sections, dim=0) -> Tuple of Tensors
+    
+    Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`,
+    along dimension :attr:`dim` according to the indices or number of sections specified
+    by :attr:`indices_or_sections`. This function is based on NumPy's
+    :func:`numpy.array_split`.
+    
+    Args:
+        input (Tensor): the tensor to split
+        indices_or_sections (Tensor, int or list or tuple of ints):
+            If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor
+            with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`.
+            If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each
+            section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input`
+            is not divisible by ``n``, the first :code:`int(input.size(dim) % n)`
+            sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will
+            have size :code:`int(input.size(dim) / n)`.
+    
+            If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long
+            tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices
+            in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0`
+            would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`.
+    
+            If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional
+            long tensor on the CPU.
+    
+        dim (int, optional): dimension along which to split the tensor. Default: ``0``
+    
+    Example::
+    
+        >>> x = torch.arange(8)
+        >>> torch.tensor_split(x, 3)
+        (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7]))
+    
+        >>> x = torch.arange(7)
+        >>> torch.tensor_split(x, 3)
+        (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6]))
+        >>> torch.tensor_split(x, (1, 6))
+        (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6]))
+    
+        >>> x = torch.arange(14).reshape(2, 7)
+        >>> x
+        tensor([[ 0,  1,  2,  3,  4,  5,  6],
+                [ 7,  8,  9, 10, 11, 12, 13]])
+        >>> torch.tensor_split(x, 3, dim=1)
+        (tensor([[0, 1, 2],
+                [7, 8, 9]]),
+         tensor([[ 3,  4],
+                [10, 11]]),
+         tensor([[ 5,  6],
+                [12, 13]]))
+        >>> torch.tensor_split(x, (1, 6), dim=1)
+        (tensor([[0],
+                [7]]),
+         tensor([[ 1,  2,  3,  4,  5],
+                [ 8,  9, 10, 11, 12]]),
+         tensor([[ 6],
+                [13]]))
+    """
+    ...
+def threshold(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ...
+def threshold_(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex]) -> Tensor: ...
+def tile(input: Tensor, dims: Sequence[Union[_int, SymInt]]) -> Tensor: 
+    r"""
+    tile(input, dims) -> Tensor
+    
+    Constructs a tensor by repeating the elements of :attr:`input`.
+    The :attr:`dims` argument specifies the number of repetitions
+    in each dimension.
+    
+    If :attr:`dims` specifies fewer dimensions than :attr:`input` has, then
+    ones are prepended to :attr:`dims` until all dimensions are specified.
+    For example, if :attr:`input` has shape (8, 6, 4, 2) and :attr:`dims`
+    is (2, 2), then :attr:`dims` is treated as (1, 1, 2, 2).
+    
+    Analogously, if :attr:`input` has fewer dimensions than :attr:`dims`
+    specifies, then :attr:`input` is treated as if it were unsqueezed at
+    dimension zero until it has as many dimensions as :attr:`dims` specifies.
+    For example, if :attr:`input` has shape (4, 2) and :attr:`dims`
+    is (3, 3, 2, 2), then :attr:`input` is treated as if it had the
+    shape (1, 1, 4, 2).
+    
+    .. note::
+    
+        This function is similar to NumPy's tile function.
+    
+    Args:
+        input (Tensor): the tensor whose elements to repeat.
+        dims (tuple): the number of repetitions per dimension.
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3])
+        >>> x.tile((2,))
+        tensor([1, 2, 3, 1, 2, 3])
+        >>> y = torch.tensor([[1, 2], [3, 4]])
+        >>> torch.tile(y, (2, 2))
+        tensor([[1, 2, 1, 2],
+                [3, 4, 3, 4],
+                [1, 2, 1, 2],
+                [3, 4, 3, 4]])
+    """
+    ...
+def topk(input: Tensor, k: Union[_int, SymInt], dim: _int = -1, largest: _bool = True, sorted: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.topk: 
+    r"""
+    topk(input, k, dim=-1, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor)
+    
+    Returns the :attr:`k` largest elements of the given :attr:`input` tensor along
+    a given dimension.
+    
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+    
+    If :attr:`largest` is ``False`` then the `k` smallest elements are returned.
+    
+    A namedtuple of `(values, indices)` is returned with the `values` and
+    `indices` of the largest `k` elements of each row of the `input` tensor in the
+    given dimension `dim`.
+    
+    If the boolean option :attr:`sorted` is ``True``, the returned
+    `k` elements are themselves sorted.
+    
+    Args:
+        input (Tensor): the input tensor.
+        k (int): the k in "top-k"
+        dim (int, optional): the dimension to sort along
+        largest (bool, optional): controls whether to return largest or
+               smallest elements
+        sorted (bool, optional): controls whether to return the elements
+               in sorted order
+    
+    Keyword args:
+        out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be
+            optionally given to be used as output buffers
+    
+    Example::
+    
+        >>> x = torch.arange(1., 6.)
+        >>> x
+        tensor([ 1.,  2.,  3.,  4.,  5.])
+        >>> torch.topk(x, 3)
+        torch.return_types.topk(values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2]))
+    """
+    ...
+def trace(input: Tensor) -> Tensor: 
+    r"""
+    trace(input) -> Tensor
+    
+    Returns the sum of the elements of the diagonal of the input 2-D matrix.
+    
+    Example::
+    
+        >>> x = torch.arange(1., 10.).view(3, 3)
+        >>> x
+        tensor([[ 1.,  2.,  3.],
+                [ 4.,  5.,  6.],
+                [ 7.,  8.,  9.]])
+        >>> torch.trace(x)
+        tensor(15.)
+    """
+    ...
+@overload
+def transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: 
+    r"""
+    transpose(input, dim0, dim1) -> Tensor
+    
+    Returns a tensor that is a transposed version of :attr:`input`.
+    The given dimensions :attr:`dim0` and :attr:`dim1` are swapped.
+    
+    If :attr:`input` is a strided tensor then the resulting :attr:`out`
+    tensor shares its underlying storage with the :attr:`input` tensor, so
+    changing the content of one would change the content of the other.
+    
+    If :attr:`input` is a :ref:`sparse tensor <sparse-docs>` then the
+    resulting :attr:`out` tensor *does not* share the underlying storage
+    with the :attr:`input` tensor.
+    
+    If :attr:`input` is a :ref:`sparse tensor <sparse-docs>` with compressed
+    layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments
+    :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must
+    both be sparse dimensions. The batch dimensions of a sparse tensor are the
+    dimensions preceding the sparse dimensions.
+    
+    .. note::
+        Transpositions which interchange the sparse dimensions of a `SparseCSR`
+        or `SparseCSC` layout tensor will result in the layout changing between
+        the two options. Transposition of the sparse dimensions of a `SparseBSR`
+        or `SparseBSC` layout tensor will likewise generate a result with the
+        opposite layout.
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim0 (int): the first dimension to be transposed
+        dim1 (int): the second dimension to be transposed
+    
+    Example::
+    
+        >>> x = torch.randn(2, 3)
+        >>> x
+        tensor([[ 1.0028, -0.9893,  0.5809],
+                [-0.1669,  0.7299,  0.4942]])
+        >>> torch.transpose(x, 0, 1)
+        tensor([[ 1.0028, -0.1669],
+                [-0.9893,  0.7299],
+                [ 0.5809,  0.4942]])
+    
+    See also :func:`torch.t`.
+    """
+    ...
+@overload
+def transpose(input: Tensor, dim0: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None]) -> Tensor: 
+    r"""
+    transpose(input, dim0, dim1) -> Tensor
+    
+    Returns a tensor that is a transposed version of :attr:`input`.
+    The given dimensions :attr:`dim0` and :attr:`dim1` are swapped.
+    
+    If :attr:`input` is a strided tensor then the resulting :attr:`out`
+    tensor shares its underlying storage with the :attr:`input` tensor, so
+    changing the content of one would change the content of the other.
+    
+    If :attr:`input` is a :ref:`sparse tensor <sparse-docs>` then the
+    resulting :attr:`out` tensor *does not* share the underlying storage
+    with the :attr:`input` tensor.
+    
+    If :attr:`input` is a :ref:`sparse tensor <sparse-docs>` with compressed
+    layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments
+    :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must
+    both be sparse dimensions. The batch dimensions of a sparse tensor are the
+    dimensions preceding the sparse dimensions.
+    
+    .. note::
+        Transpositions which interchange the sparse dimensions of a `SparseCSR`
+        or `SparseCSC` layout tensor will result in the layout changing between
+        the two options. Transposition of the sparse dimensions of a `SparseBSR`
+        or `SparseBSC` layout tensor will likewise generate a result with the
+        opposite layout.
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim0 (int): the first dimension to be transposed
+        dim1 (int): the second dimension to be transposed
+    
+    Example::
+    
+        >>> x = torch.randn(2, 3)
+        >>> x
+        tensor([[ 1.0028, -0.9893,  0.5809],
+                [-0.1669,  0.7299,  0.4942]])
+        >>> torch.transpose(x, 0, 1)
+        tensor([[ 1.0028, -0.1669],
+                [-0.9893,  0.7299],
+                [ 0.5809,  0.4942]])
+    
+    See also :func:`torch.t`.
+    """
+    ...
+def transpose_copy(input: Tensor, dim0: _int, dim1: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.transpose`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: 
+    r"""
+    trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor
+    
+    Computes the `trapezoidal rule <https://en.wikipedia.org/wiki/Trapezoidal_rule>`_ along
+    :attr:`dim`. By default the spacing between elements is assumed to be 1, but
+    :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be
+    used to specify arbitrary spacing along :attr:`dim`.
+    
+    
+    Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`,
+    the default computation is
+    
+    .. math::
+        \begin{aligned}
+            \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1})
+        \end{aligned}
+    
+    When :attr:`dx` is specified the computation becomes
+    
+    .. math::
+        \begin{aligned}
+            \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1})
+        \end{aligned}
+    
+    effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified,
+    assuming :attr:`x` is also a one-dimensional tensor with
+    elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes
+    
+    .. math::
+        \begin{aligned}
+            \sum_{i = 1}^{n} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1})
+        \end{aligned}
+    
+    When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed.
+    The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x`
+    and :attr:`y`, the function computes the difference between consecutive elements along
+    dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have
+    the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1.
+    After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule.
+    See the examples below for details.
+    
+    .. note::
+        The trapezoidal rule is a technique for approximating the definite integral of a function
+        by averaging its left and right Riemann sums. The approximation becomes more accurate as
+        the resolution of the partition increases.
+    
+    Arguments:
+        y (Tensor): Values to use when computing the trapezoidal rule.
+        x (Tensor): If specified, defines spacing between values as specified above.
+    
+    Keyword arguments:
+        dx (float): constant spacing between values. If neither :attr:`x` nor :attr:`dx`
+            is specified then this defaults to 1. Effectively multiplies the result by its value.
+        dim (int): The dimension along which to compute the trapezoidal rule.
+            The last (inner-most) dimension by default.
+    
+    Examples::
+    
+        >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1
+        >>> y = torch.tensor([1, 5, 10])
+        >>> torch.trapezoid(y)
+        tensor(10.5)
+    
+        >>> # Computes the same trapezoidal rule directly to verify
+        >>> (1 + 10 + 10) / 2
+        10.5
+    
+        >>> # Computes the trapezoidal rule in 1D with constant spacing of 2
+        >>> # NOTE: the result is the same as before, but multiplied by 2
+        >>> torch.trapezoid(y, dx=2)
+        tensor(21.)
+    
+        >>> # Computes the trapezoidal rule in 1D with arbitrary spacing
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.trapezoid(y, x)
+        tensor(28.5)
+    
+        >>> # Computes the same trapezoidal rule directly to verify
+        >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2
+        28.5
+    
+        >>> # Computes the trapezoidal rule for each row of a 3x3 matrix
+        >>> y = torch.arange(9).reshape(3, 3)
+        tensor([[0, 1, 2],
+                [3, 4, 5],
+                [6, 7, 8]])
+        >>> torch.trapezoid(y)
+        tensor([ 2., 8., 14.])
+    
+        >>> # Computes the trapezoidal rule for each column of the matrix
+        >>> torch.trapezoid(y, dim=0)
+        tensor([ 6., 8., 10.])
+    
+        >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with the same arbitrary spacing
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.trapezoid(y, x)
+        tensor([5., 5., 5.])
+    
+        >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with different arbitrary spacing per row
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]])
+        >>> torch.trapezoid(y, x)
+        tensor([2., 4., 6.])
+    """
+    ...
+@overload
+def trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: 
+    r"""
+    trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor
+    
+    Computes the `trapezoidal rule <https://en.wikipedia.org/wiki/Trapezoidal_rule>`_ along
+    :attr:`dim`. By default the spacing between elements is assumed to be 1, but
+    :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be
+    used to specify arbitrary spacing along :attr:`dim`.
+    
+    
+    Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`,
+    the default computation is
+    
+    .. math::
+        \begin{aligned}
+            \sum_{i = 1}^{n} \frac{1}{2} (y_i + y_{i-1})
+        \end{aligned}
+    
+    When :attr:`dx` is specified the computation becomes
+    
+    .. math::
+        \begin{aligned}
+            \sum_{i = 1}^{n} \frac{\Delta x}{2} (y_i + y_{i-1})
+        \end{aligned}
+    
+    effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified,
+    assuming :attr:`x` is also a one-dimensional tensor with
+    elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes
+    
+    .. math::
+        \begin{aligned}
+            \sum_{i = 1}^{n} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1})
+        \end{aligned}
+    
+    When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed.
+    The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x`
+    and :attr:`y`, the function computes the difference between consecutive elements along
+    dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have
+    the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1.
+    After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule.
+    See the examples below for details.
+    
+    .. note::
+        The trapezoidal rule is a technique for approximating the definite integral of a function
+        by averaging its left and right Riemann sums. The approximation becomes more accurate as
+        the resolution of the partition increases.
+    
+    Arguments:
+        y (Tensor): Values to use when computing the trapezoidal rule.
+        x (Tensor): If specified, defines spacing between values as specified above.
+    
+    Keyword arguments:
+        dx (float): constant spacing between values. If neither :attr:`x` nor :attr:`dx`
+            is specified then this defaults to 1. Effectively multiplies the result by its value.
+        dim (int): The dimension along which to compute the trapezoidal rule.
+            The last (inner-most) dimension by default.
+    
+    Examples::
+    
+        >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1
+        >>> y = torch.tensor([1, 5, 10])
+        >>> torch.trapezoid(y)
+        tensor(10.5)
+    
+        >>> # Computes the same trapezoidal rule directly to verify
+        >>> (1 + 10 + 10) / 2
+        10.5
+    
+        >>> # Computes the trapezoidal rule in 1D with constant spacing of 2
+        >>> # NOTE: the result is the same as before, but multiplied by 2
+        >>> torch.trapezoid(y, dx=2)
+        tensor(21.)
+    
+        >>> # Computes the trapezoidal rule in 1D with arbitrary spacing
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.trapezoid(y, x)
+        tensor(28.5)
+    
+        >>> # Computes the same trapezoidal rule directly to verify
+        >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2
+        28.5
+    
+        >>> # Computes the trapezoidal rule for each row of a 3x3 matrix
+        >>> y = torch.arange(9).reshape(3, 3)
+        tensor([[0, 1, 2],
+                [3, 4, 5],
+                [6, 7, 8]])
+        >>> torch.trapezoid(y)
+        tensor([ 2., 8., 14.])
+    
+        >>> # Computes the trapezoidal rule for each column of the matrix
+        >>> torch.trapezoid(y, dim=0)
+        tensor([ 6., 8., 10.])
+    
+        >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with the same arbitrary spacing
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([1, 3, 6])
+        >>> torch.trapezoid(y, x)
+        tensor([5., 5., 5.])
+    
+        >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix
+        >>> #   with different arbitrary spacing per row
+        >>> y = torch.ones(3, 3)
+        >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]])
+        >>> torch.trapezoid(y, x)
+        tensor([2., 4., 6.])
+    """
+    ...
+@overload
+def trapz(y: Tensor, *, dx: _float = 1, dim: _int = -1) -> Tensor: 
+    r"""
+    trapz(y, x, *, dim=-1) -> Tensor
+    
+    Alias for :func:`torch.trapezoid`.
+    """
+    ...
+@overload
+def trapz(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: 
+    r"""
+    trapz(y, x, *, dim=-1) -> Tensor
+    
+    Alias for :func:`torch.trapezoid`.
+    """
+    ...
+def triangular_solve(input: Tensor, A: Tensor, upper: _bool = True, transpose: _bool = False, unitriangular: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.triangular_solve: 
+    r"""
+    triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor)
+    
+    Solves a system of equations with a square upper or lower triangular invertible matrix :math:`A`
+    and multiple right-hand sides :math:`b`.
+    
+    In symbols, it solves :math:`AX = b` and assumes :math:`A` is square upper-triangular
+    (or lower-triangular if :attr:`upper`\ `= False`) and does not have zeros on the diagonal.
+    
+    `torch.triangular_solve(b, A)` can take in 2D inputs `b, A` or inputs that are
+    batches of 2D matrices. If the inputs are batches, then returns
+    batched outputs `X`.
+    
+    If the diagonal of :attr:`A` contains zeros or elements that are very close to zero and
+    :attr:`unitriangular`\ `= False` (default) or if the input matrix is badly conditioned,
+    the result may contain `NaN` s.
+    
+    Supports input of float, double, cfloat and cdouble data types.
+    
+    .. warning::
+    
+        :func:`torch.triangular_solve` is deprecated in favor of :func:`torch.linalg.solve_triangular`
+        and will be removed in a future PyTorch release.
+        :func:`torch.linalg.solve_triangular` has its arguments reversed and does not return a
+        copy of one of the inputs.
+    
+        ``X = torch.triangular_solve(B, A).solution`` should be replaced with
+    
+        .. code:: python
+    
+            X = torch.linalg.solve_triangular(A, B)
+    
+    Args:
+        b (Tensor): multiple right-hand sides of size :math:`(*, m, k)` where
+                    :math:`*` is zero or more batch dimensions
+        A (Tensor): the input triangular coefficient matrix of size :math:`(*, m, m)`
+                    where :math:`*` is zero or more batch dimensions
+        upper (bool, optional): whether :math:`A` is upper or lower triangular. Default: ``True``.
+        transpose (bool, optional): solves `op(A)X = b` where `op(A) = A^T` if this flag is ``True``,
+                                    and `op(A) = A` if it is ``False``. Default: ``False``.
+        unitriangular (bool, optional): whether :math:`A` is unit triangular.
+            If True, the diagonal elements of :math:`A` are assumed to be
+            1 and not referenced from :math:`A`. Default: ``False``.
+    
+    Keyword args:
+        out ((Tensor, Tensor), optional): tuple of two tensors to write
+            the output to. Ignored if `None`. Default: `None`.
+    
+    Returns:
+        A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient`
+        is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b`
+        (or whatever variant of the system of equations, depending on the keyword arguments.)
+    
+    Examples::
+    
+        >>> A = torch.randn(2, 2).triu()
+        >>> A
+        tensor([[ 1.1527, -1.0753],
+                [ 0.0000,  0.7986]])
+        >>> b = torch.randn(2, 3)
+        >>> b
+        tensor([[-0.0210,  2.3513, -1.5492],
+                [ 1.5429,  0.7403, -1.0243]])
+        >>> torch.triangular_solve(b, A)
+        torch.return_types.triangular_solve(
+        solution=tensor([[ 1.7841,  2.9046, -2.5405],
+                [ 1.9320,  0.9270, -1.2826]]),
+        cloned_coefficient=tensor([[ 1.1527, -1.0753],
+                [ 0.0000,  0.7986]]))
+    """
+    ...
+def tril(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    tril(input, diagonal=0, *, out=None) -> Tensor
+    
+    Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices
+    :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0.
+    
+    The lower triangular part of the matrix is defined as the elements on and
+    below the diagonal.
+    
+    The argument :attr:`diagonal` controls which diagonal to consider. If
+    :attr:`diagonal` = 0, all elements on and below the main diagonal are
+    retained. A positive value includes just as many diagonals above the main
+    diagonal, and similarly a negative value excludes just as many diagonals below
+    the main diagonal. The main diagonal is the set of indices
+    :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where
+    :math:`d_{1}, d_{2}` are the dimensions of the matrix.
+    
+    Args:
+        input (Tensor): the input tensor.
+        diagonal (int, optional): the diagonal to consider
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(3, 3)
+        >>> a
+        tensor([[-1.0813, -0.8619,  0.7105],
+                [ 0.0935,  0.1380,  2.2112],
+                [-0.3409, -0.9828,  0.0289]])
+        >>> torch.tril(a)
+        tensor([[-1.0813,  0.0000,  0.0000],
+                [ 0.0935,  0.1380,  0.0000],
+                [-0.3409, -0.9828,  0.0289]])
+    
+        >>> b = torch.randn(4, 6)
+        >>> b
+        tensor([[ 1.2219,  0.5653, -0.2521, -0.2345,  1.2544,  0.3461],
+                [ 0.4785, -0.4477,  0.6049,  0.6368,  0.8775,  0.7145],
+                [ 1.1502,  3.2716, -1.1243, -0.5413,  0.3615,  0.6864],
+                [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024,  0.0978]])
+        >>> torch.tril(b, diagonal=1)
+        tensor([[ 1.2219,  0.5653,  0.0000,  0.0000,  0.0000,  0.0000],
+                [ 0.4785, -0.4477,  0.6049,  0.0000,  0.0000,  0.0000],
+                [ 1.1502,  3.2716, -1.1243, -0.5413,  0.0000,  0.0000],
+                [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024,  0.0000]])
+        >>> torch.tril(b, diagonal=-1)
+        tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                [ 0.4785,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
+                [ 1.1502,  3.2716,  0.0000,  0.0000,  0.0000,  0.0000],
+                [-0.0614, -0.7344, -1.3164,  0.0000,  0.0000,  0.0000]])
+    """
+    ...
+def tril_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor
+    
+    Returns the indices of the lower triangular part of a :attr:`row`-by-
+    :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row
+    coordinates of all indices and the second row contains column coordinates.
+    Indices are ordered based on rows and then columns.
+    
+    The lower triangular part of the matrix is defined as the elements on and
+    below the diagonal.
+    
+    The argument :attr:`offset` controls which diagonal to consider. If
+    :attr:`offset` = 0, all elements on and below the main diagonal are
+    retained. A positive value includes just as many diagonals above the main
+    diagonal, and similarly a negative value excludes just as many diagonals below
+    the main diagonal. The main diagonal is the set of indices
+    :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]`
+    where :math:`d_{1}, d_{2}` are the dimensions of the matrix.
+    
+    .. note::
+        When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to
+        prevent overflow during calculation.
+    
+    Args:
+        row (``int``): number of rows in the 2-D matrix.
+        col (``int``): number of columns in the 2-D matrix.
+        offset (``int``): diagonal offset from the main diagonal.
+            Default: if not provided, 0.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, ``torch.long``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        layout (:class:`torch.layout`, optional): currently only supports ``torch.strided``.
+    
+    Example::
+    
+        >>> a = torch.tril_indices(3, 3)
+        >>> a
+        tensor([[0, 1, 1, 2, 2, 2],
+                [0, 0, 1, 0, 1, 2]])
+    
+        >>> a = torch.tril_indices(4, 3, -1)
+        >>> a
+        tensor([[1, 2, 2, 3, 3, 3],
+                [0, 0, 1, 0, 1, 2]])
+    
+        >>> a = torch.tril_indices(4, 3, 1)
+        >>> a
+        tensor([[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
+                [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
+    """
+    ...
+def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, margin: _float = 1.0, p: _float = 2, eps: _float = 1e-06, swap: _bool = False, reduction: _int = 1) -> Tensor: ...
+def triu(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    triu(input, diagonal=0, *, out=None) -> Tensor
+    
+    Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices
+    :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0.
+    
+    The upper triangular part of the matrix is defined as the elements on and
+    above the diagonal.
+    
+    The argument :attr:`diagonal` controls which diagonal to consider. If
+    :attr:`diagonal` = 0, all elements on and above the main diagonal are
+    retained. A positive value excludes just as many diagonals above the main
+    diagonal, and similarly a negative value includes just as many diagonals below
+    the main diagonal. The main diagonal is the set of indices
+    :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where
+    :math:`d_{1}, d_{2}` are the dimensions of the matrix.
+    
+    Args:
+        input (Tensor): the input tensor.
+        diagonal (int, optional): the diagonal to consider
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(3, 3)
+        >>> a
+        tensor([[ 0.2309,  0.5207,  2.0049],
+                [ 0.2072, -1.0680,  0.6602],
+                [ 0.3480, -0.5211, -0.4573]])
+        >>> torch.triu(a)
+        tensor([[ 0.2309,  0.5207,  2.0049],
+                [ 0.0000, -1.0680,  0.6602],
+                [ 0.0000,  0.0000, -0.4573]])
+        >>> torch.triu(a, diagonal=1)
+        tensor([[ 0.0000,  0.5207,  2.0049],
+                [ 0.0000,  0.0000,  0.6602],
+                [ 0.0000,  0.0000,  0.0000]])
+        >>> torch.triu(a, diagonal=-1)
+        tensor([[ 0.2309,  0.5207,  2.0049],
+                [ 0.2072, -1.0680,  0.6602],
+                [ 0.0000, -0.5211, -0.4573]])
+    
+        >>> b = torch.randn(4, 6)
+        >>> b
+        tensor([[ 0.5876, -0.0794, -1.8373,  0.6654,  0.2604,  1.5235],
+                [-0.2447,  0.9556, -1.2919,  1.3378, -0.1768, -1.0857],
+                [ 0.4333,  0.3146,  0.6576, -1.0432,  0.9348, -0.4410],
+                [-0.9888,  1.0679, -1.3337, -1.6556,  0.4798,  0.2830]])
+        >>> torch.triu(b, diagonal=1)
+        tensor([[ 0.0000, -0.0794, -1.8373,  0.6654,  0.2604,  1.5235],
+                [ 0.0000,  0.0000, -1.2919,  1.3378, -0.1768, -1.0857],
+                [ 0.0000,  0.0000,  0.0000, -1.0432,  0.9348, -0.4410],
+                [ 0.0000,  0.0000,  0.0000,  0.0000,  0.4798,  0.2830]])
+        >>> torch.triu(b, diagonal=-1)
+        tensor([[ 0.5876, -0.0794, -1.8373,  0.6654,  0.2604,  1.5235],
+                [-0.2447,  0.9556, -1.2919,  1.3378, -0.1768, -1.0857],
+                [ 0.0000,  0.3146,  0.6576, -1.0432,  0.9348, -0.4410],
+                [ 0.0000,  0.0000, -1.3337, -1.6556,  0.4798,  0.2830]])
+    """
+    ...
+def triu_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor
+    
+    Returns the indices of the upper triangular part of a :attr:`row` by
+    :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row
+    coordinates of all indices and the second row contains column coordinates.
+    Indices are ordered based on rows and then columns.
+    
+    The upper triangular part of the matrix is defined as the elements on and
+    above the diagonal.
+    
+    The argument :attr:`offset` controls which diagonal to consider. If
+    :attr:`offset` = 0, all elements on and above the main diagonal are
+    retained. A positive value excludes just as many diagonals above the main
+    diagonal, and similarly a negative value includes just as many diagonals below
+    the main diagonal. The main diagonal is the set of indices
+    :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]`
+    where :math:`d_{1}, d_{2}` are the dimensions of the matrix.
+    
+    .. note::
+        When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to
+        prevent overflow during calculation.
+    
+    Args:
+        row (``int``): number of rows in the 2-D matrix.
+        col (``int``): number of columns in the 2-D matrix.
+        offset (``int``): diagonal offset from the main diagonal.
+            Default: if not provided, 0.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, ``torch.long``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        layout (:class:`torch.layout`, optional): currently only supports ``torch.strided``.
+    
+    Example::
+    
+        >>> a = torch.triu_indices(3, 3)
+        >>> a
+        tensor([[0, 0, 0, 1, 1, 2],
+                [0, 1, 2, 1, 2, 2]])
+    
+        >>> a = torch.triu_indices(4, 3, -1)
+        >>> a
+        tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3],
+                [0, 1, 2, 0, 1, 2, 1, 2, 2]])
+    
+        >>> a = torch.triu_indices(4, 3, 1)
+        >>> a
+        tensor([[0, 0, 1],
+                [1, 2, 2]])
+    """
+    ...
+def true_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    true_divide(dividend, divisor, *, out) -> Tensor
+    
+    Alias for :func:`torch.div` with ``rounding_mode=None``.
+    """
+    ...
+def trunc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    trunc(input, *, out=None) -> Tensor
+    
+    Returns a new tensor with the truncated integer values of
+    the elements of :attr:`input`.
+    
+    For integer inputs, follows the array-api convention of returning a
+    copy of the input tensor.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.randn(4)
+        >>> a
+        tensor([ 3.4742,  0.5466, -0.8008, -0.9079])
+        >>> torch.trunc(a)
+        tensor([ 3.,  0., -0., -0.])
+    """
+    ...
+def trunc_(input: Tensor) -> Tensor: ...
+@overload
+def unbind(input: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    unbind(input, dim=0) -> seq
+    
+    Removes a tensor dimension.
+    
+    Returns a tuple of all slices along a given dimension, already without it.
+    
+    Arguments:
+        input (Tensor): the tensor to unbind
+        dim (int): dimension to remove
+    
+    Example::
+    
+        >>> torch.unbind(torch.tensor([[1, 2, 3],
+        ...                            [4, 5, 6],
+        ...                            [7, 8, 9]]))
+        (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9]))
+    """
+    ...
+@overload
+def unbind(input: Tensor, dim: Union[str, ellipsis, None]) -> Tuple[Tensor, ...]: 
+    r"""
+    unbind(input, dim=0) -> seq
+    
+    Removes a tensor dimension.
+    
+    Returns a tuple of all slices along a given dimension, already without it.
+    
+    Arguments:
+        input (Tensor): the tensor to unbind
+        dim (int): dimension to remove
+    
+    Example::
+    
+        >>> torch.unbind(torch.tensor([[1, 2, 3],
+        ...                            [4, 5, 6],
+        ...                            [7, 8, 9]]))
+        (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9]))
+    """
+    ...
+def unbind_copy(input: Tensor, dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: 
+    r"""
+    Performs the same operation as :func:`torch.unbind`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def unflatten(input: Tensor, dim: Union[str, ellipsis, None], sizes: Sequence[Union[_int, SymInt]], names: Sequence[Union[str, ellipsis, None]]) -> Tensor: 
+    r"""
+    unflatten(input, dim, sizes) -> Tensor
+    
+    Expands a dimension of the input tensor over multiple dimensions.
+    
+    .. seealso::
+    
+        :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): Dimension to be unflattened, specified as an index into
+             ``input.shape``.
+        sizes (Tuple[int]): New shape of the unflattened dimension.
+             One of its elements can be `-1` in which case the corresponding output
+             dimension is inferred. Otherwise, the product of ``sizes`` *must*
+             equal ``input.shape[dim]``.
+    
+    Returns:
+        A View of input with the specified dimension unflattened.
+    
+    Examples::
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape
+        torch.Size([5, 2, 2, 3, 1, 1, 3])
+    """
+    ...
+@overload
+def unflatten(input: Tensor, dim: _int, sizes: Sequence[Union[_int, SymInt]]) -> Tensor: 
+    r"""
+    unflatten(input, dim, sizes) -> Tensor
+    
+    Expands a dimension of the input tensor over multiple dimensions.
+    
+    .. seealso::
+    
+        :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): Dimension to be unflattened, specified as an index into
+             ``input.shape``.
+        sizes (Tuple[int]): New shape of the unflattened dimension.
+             One of its elements can be `-1` in which case the corresponding output
+             dimension is inferred. Otherwise, the product of ``sizes`` *must*
+             equal ``input.shape[dim]``.
+    
+    Returns:
+        A View of input with the specified dimension unflattened.
+    
+    Examples::
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape
+        torch.Size([5, 2, 2, 3, 1, 1, 3])
+    """
+    ...
+def unfold_copy(input: Tensor, dimension: _int, size: _int, step: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :meth:`torch.Tensor.unfold`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def unique_dim(input: Tensor, dim: _int, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ...
+def unsafe_chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    unsafe_chunk(input, chunks, dim=0) -> List of Tensors
+    
+    Works like :func:`torch.chunk` but without enforcing the autograd restrictions
+    on inplace modification of the outputs.
+    
+    .. warning::
+        This function is safe to use as long as only the input, or only the outputs
+        are modified inplace after calling this function. It is the user's
+        responsibility to ensure that is the case. If both the input and one or more
+        of the outputs are modified inplace, gradients computed by autograd will be
+        silently incorrect.
+    """
+    ...
+def unsafe_split(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: 
+    r"""
+    unsafe_split(tensor, split_size_or_sections, dim=0) -> List of Tensors
+    
+    Works like :func:`torch.split` but without enforcing the autograd restrictions
+    on inplace modification of the outputs.
+    
+    .. warning::
+        This function is safe to use as long as only the input, or only the outputs
+        are modified inplace after calling this function. It is the user's
+        responsibility to ensure that is the case. If both the input and one or more
+        of the outputs are modified inplace, gradients computed by autograd will be
+        silently incorrect.
+    """
+    ...
+def unsafe_split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ...
+def unsqueeze(input: Tensor, dim: _int) -> Tensor: 
+    r"""
+    unsqueeze(input, dim) -> Tensor
+    
+    Returns a new tensor with a dimension of size one inserted at the
+    specified position.
+    
+    The returned tensor shares the same underlying data with this tensor.
+    
+    A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)``
+    can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze`
+    applied at :attr:`dim` = ``dim + input.dim() + 1``.
+    
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): the index at which to insert the singleton dimension
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3, 4])
+        >>> torch.unsqueeze(x, 0)
+        tensor([[ 1,  2,  3,  4]])
+        >>> torch.unsqueeze(x, 1)
+        tensor([[ 1],
+                [ 2],
+                [ 3],
+                [ 4]])
+    """
+    ...
+def unsqueeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.unsqueeze`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :meth:`torch.Tensor.values`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def vander(x: Tensor, N: Optional[_int] = None, increasing: _bool = False) -> Tensor: 
+    r"""
+    vander(x, N=None, increasing=False) -> Tensor
+    
+    Generates a Vandermonde matrix.
+    
+    The columns of the output matrix are elementwise powers of the input vector :math:`x^{(N-1)}, x^{(N-2)}, ..., x^0`.
+    If increasing is True, the order of the columns is reversed :math:`x^0, x^1, ..., x^{(N-1)}`. Such a
+    matrix with a geometric progression in each row is named for Alexandre-Theophile Vandermonde.
+    
+    Arguments:
+        x (Tensor): 1-D input tensor.
+        N (int, optional): Number of columns in the output. If N is not specified,
+            a square array is returned :math:`(N = len(x))`.
+        increasing (bool, optional): Order of the powers of the columns. If True,
+            the powers increase from left to right, if False (the default) they are reversed.
+    
+    Returns:
+        Tensor: Vandermonde matrix. If increasing is False, the first column is :math:`x^{(N-1)}`,
+        the second :math:`x^{(N-2)}` and so forth. If increasing is True, the columns
+        are :math:`x^0, x^1, ..., x^{(N-1)}`.
+    
+    Example::
+    
+        >>> x = torch.tensor([1, 2, 3, 5])
+        >>> torch.vander(x)
+        tensor([[  1,   1,   1,   1],
+                [  8,   4,   2,   1],
+                [ 27,   9,   3,   1],
+                [125,  25,   5,   1]])
+        >>> torch.vander(x, N=3)
+        tensor([[ 1,  1,  1],
+                [ 4,  2,  1],
+                [ 9,  3,  1],
+                [25,  5,  1]])
+        >>> torch.vander(x, N=3, increasing=True)
+        tensor([[ 1,  1,  1],
+                [ 1,  2,  4],
+                [ 1,  3,  9],
+                [ 1,  5, 25]])
+    """
+    ...
+@overload
+def var(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim`
+    can be a single dimension, list of dimensions, or ``None`` to reduce over all
+    dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var(a, dim=1, keepdim=True)
+        tensor([[1.0631],
+                [0.5590],
+                [1.4893],
+                [0.8258]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim`
+    can be a single dimension, list of dimensions, or ``None`` to reduce over all
+    dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var(a, dim=1, keepdim=True)
+        tensor([[1.0631],
+                [0.5590],
+                [1.4893],
+                [0.8258]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var(input: Tensor, unbiased: _bool = True) -> Tensor: 
+    r"""
+    var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim`
+    can be a single dimension, list of dimensions, or ``None`` to reduce over all
+    dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var(a, dim=1, keepdim=True)
+        tensor([[1.0631],
+                [0.5590],
+                [1.4893],
+                [0.8258]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim`
+    can be a single dimension, list of dimensions, or ``None`` to reduce over all
+    dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var(a, dim=1, keepdim=True)
+        tensor([[1.0631],
+                [0.5590],
+                [1.4893],
+                [0.8258]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+    
+    Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim`
+    can be a single dimension, list of dimensions, or ``None`` to reduce over all
+    dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var(a, dim=1, keepdim=True)
+        tensor([[1.0631],
+                [0.5590],
+                [1.4893],
+                [0.8258]])
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the variance and mean over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (var, mean) containing the variance and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var_mean(a, dim=0, keepdim=True)
+        (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the variance and mean over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (var, mean) containing the variance and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var_mean(a, dim=0, keepdim=True)
+        (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: 
+    r"""
+    var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the variance and mean over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (var, mean) containing the variance and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var_mean(a, dim=0, keepdim=True)
+        (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the variance and mean over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (var, mean) containing the variance and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var_mean(a, dim=0, keepdim=True)
+        (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+@overload
+def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: 
+    r"""
+    var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+    
+    Calculates the variance and mean over the dimensions specified by :attr:`dim`.
+    :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+    reduce over all dimensions.
+    
+    The variance (:math:`\sigma^2`) is calculated as
+    
+    .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+    
+    where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+    sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+    the :attr:`correction`.
+    
+    
+    
+    If :attr:`keepdim` is ``True``, the output tensor is of the same size
+    as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1.
+    Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the
+    output tensor having 1 (or ``len(dim)``) fewer dimension(s).
+    
+    
+    Args:
+        input (Tensor): the input tensor.
+        
+        dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
+            If ``None``, all dimensions are reduced.
+    
+    
+    Keyword args:
+        correction (int): difference between the sample size and sample degrees of freedom.
+            Defaults to `Bessel's correction`_, ``correction=1``.
+    
+            .. versionchanged:: 2.0
+                Previously this argument was called ``unbiased`` and was a boolean
+                with ``True`` corresponding to ``correction=1`` and ``False`` being
+                ``correction=0``.
+        keepdim (bool): whether the output tensor has :attr:`dim` retained or not.
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        A tuple (var, mean) containing the variance and mean.
+    
+    Example:
+    
+        >>> a = torch.tensor(
+        ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+        ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+        ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+        ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+        >>> torch.var_mean(a, dim=0, keepdim=True)
+        (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]),
+         tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
+    
+    .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
+    """
+    ...
+def vdot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    vdot(input, other, *, out=None) -> Tensor
+    
+    Computes the dot product of two 1D vectors.
+    
+    In symbols, this function computes
+    
+    .. math::
+    
+        \sum_{i=1}^n \overline{x_i}y_i.
+    
+    where :math:`\overline{x_i}` denotes the conjugate for complex
+    vectors, and it is the identity for real vectors.
+    
+    .. note::
+    
+        Unlike NumPy's vdot, torch.vdot intentionally only supports computing the dot product
+        of two 1D tensors with the same number of elements.
+    
+    .. seealso::
+    
+            :func:`torch.linalg.vecdot` computes the dot product of two batches of vectors along a dimension.
+    
+    Args:
+        input (Tensor): first tensor in the dot product, must be 1D. Its conjugate is used if it's complex.
+        other (Tensor): second tensor in the dot product, must be 1D.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    
+    
+    Example::
+    
+        >>> torch.vdot(torch.tensor([2, 3]), torch.tensor([2, 1]))
+        tensor(7)
+        >>> a = torch.tensor((1 +2j, 3 - 1j))
+        >>> b = torch.tensor((2 +1j, 4 - 0j))
+        >>> torch.vdot(a, b)
+        tensor(16.+1.j)
+        >>> torch.vdot(b, a)
+        tensor(16.-1.j)
+    """
+    ...
+def view_as_complex(input: Tensor) -> Tensor: 
+    r"""
+    view_as_complex(input) -> Tensor
+    
+    Returns a view of :attr:`input` as a complex tensor. For an input real
+    tensor of :attr:`size` :math:`m1, m2, \dots, mi, 2`, this function returns a
+    new complex tensor of :attr:`size` :math:`m1, m2, \dots, mi` where the last
+    dimension of the input tensor is expected to represent the real and imaginary
+    components of complex numbers.
+    
+    .. warning::
+        :func:`view_as_complex` is only supported for tensors with
+        :class:`torch.dtype` ``torch.float64`` and ``torch.float32``.  The input is
+        expected to have the last dimension of :attr:`size` 2. In addition, the
+        tensor must have a `stride` of 1 for its last dimension. The strides of all
+        other dimensions must be even numbers.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x=torch.randn(4, 2)
+        >>> x
+        tensor([[ 1.6116, -0.5772],
+                [-1.4606, -0.9120],
+                [ 0.0786, -1.7497],
+                [-0.6561, -1.6623]])
+        >>> torch.view_as_complex(x)
+        tensor([(1.6116-0.5772j), (-1.4606-0.9120j), (0.0786-1.7497j), (-0.6561-1.6623j)])
+    """
+    ...
+def view_as_complex_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.view_as_complex`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def view_as_real(input: Tensor) -> Tensor: 
+    r"""
+    view_as_real(input) -> Tensor
+    
+    Returns a view of :attr:`input` as a real tensor. For an input complex tensor of
+    :attr:`size` :math:`m1, m2, \dots, mi`, this function returns a new
+    real tensor of size :math:`m1, m2, \dots, mi, 2`, where the last dimension of size 2
+    represents the real and imaginary components of complex numbers.
+    
+    .. warning::
+        :func:`view_as_real` is only supported for tensors with ``complex dtypes``.
+    
+    Args:
+        input (Tensor): the input tensor.
+    
+    Example::
+    
+        >>> x=torch.randn(4, dtype=torch.cfloat)
+        >>> x
+        tensor([(0.4737-0.3839j), (-0.2098-0.6699j), (0.3470-0.9451j), (-0.5174-1.3136j)])
+        >>> torch.view_as_real(x)
+        tensor([[ 0.4737, -0.3839],
+                [-0.2098, -0.6699],
+                [ 0.3470, -0.9451],
+                [-0.5174, -1.3136]])
+    """
+    ...
+def view_as_real_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :func:`torch.view_as_real`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def view_copy(input: Tensor, dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :meth:`torch.Tensor.view`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def view_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    Performs the same operation as :meth:`torch.Tensor.view`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def vsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: 
+    r"""
+    vsplit(input, indices_or_sections) -> List of Tensors
+    
+    Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors
+    vertically according to :attr:`indices_or_sections`. Each split is a view of :attr:`input`.
+    
+    This is equivalent to calling ``torch.tensor_split(input, indices_or_sections, dim=0)``
+    (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer
+    it must evenly divide the split dimension or a runtime error will be thrown.
+    
+    This function is based on NumPy's :func:`numpy.vsplit`.
+    
+    Args:
+        input (Tensor): tensor to split.
+        indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`.
+    
+    Example::
+    
+        >>> t = torch.arange(16.0).reshape(4,4)
+        >>> t
+        tensor([[ 0.,  1.,  2.,  3.],
+                [ 4.,  5.,  6.,  7.],
+                [ 8.,  9., 10., 11.],
+                [12., 13., 14., 15.]])
+        >>> torch.vsplit(t, 2)
+        (tensor([[0., 1., 2., 3.],
+                 [4., 5., 6., 7.]]),
+         tensor([[ 8.,  9., 10., 11.],
+                 [12., 13., 14., 15.]]))
+        >>> torch.vsplit(t, [3, 6])
+        (tensor([[ 0.,  1.,  2.,  3.],
+                 [ 4.,  5.,  6.,  7.],
+                 [ 8.,  9., 10., 11.]]),
+         tensor([[12., 13., 14., 15.]]),
+         tensor([], size=(0, 4)))
+    """
+    ...
+@overload
+def vsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: 
+    r"""
+    vsplit(input, indices_or_sections) -> Tuple of Tensors
+    
+    Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors
+    vertically according to :attr:`indices_or_sections`. Each split is a view of :attr:`input`.
+    
+    This is equivalent to calling ``torch.tensor_split(input, indices_or_sections, dim=0)``
+    (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer
+    it must evenly divide the split dimension or a runtime error will be thrown.
+    
+    This function is based on NumPy's :func:`numpy.vsplit`.
+    
+    Args:
+        input (Tensor): tensor to split.
+        indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`.
+    
+    Example::
+    
+        >>> t = torch.arange(16.0).reshape(4,4)
+        >>> t
+        tensor([[ 0.,  1.,  2.,  3.],
+                [ 4.,  5.,  6.,  7.],
+                [ 8.,  9., 10., 11.],
+                [12., 13., 14., 15.]])
+        >>> torch.vsplit(t, 2)
+        (tensor([[0., 1., 2., 3.],
+                 [4., 5., 6., 7.]]),
+         tensor([[ 8.,  9., 10., 11.],
+                 [12., 13., 14., 15.]]))
+        >>> torch.vsplit(t, [3, 6])
+        (tensor([[ 0.,  1.,  2.,  3.],
+                 [ 4.,  5.,  6.,  7.],
+                 [ 8.,  9., 10., 11.]]),
+         tensor([[12., 13., 14., 15.]]),
+         tensor([], size=(0, 4)))
+    """
+    ...
+def vstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    vstack(tensors, *, out=None) -> Tensor
+    
+    Stack tensors in sequence vertically (row wise).
+    
+    This is equivalent to concatenation along the first axis after all 1-D tensors have been reshaped by :func:`torch.atleast_2d`.
+    
+    Args:
+        tensors (sequence of Tensors): sequence of tensors to concatenate
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Example::
+    
+        >>> a = torch.tensor([1, 2, 3])
+        >>> b = torch.tensor([4, 5, 6])
+        >>> torch.vstack((a,b))
+        tensor([[1, 2, 3],
+                [4, 5, 6]])
+        >>> a = torch.tensor([[1],[2],[3]])
+        >>> b = torch.tensor([[4],[5],[6]])
+        >>> torch.vstack((a,b))
+        tensor([[1],
+                [2],
+                [3],
+                [4],
+                [5],
+                [6]])
+    """
+    ...
+@overload
+def where(condition: Tensor) -> Tuple[Tensor, ...]: 
+    r"""
+    where(condition, input, other, *, out=None) -> Tensor
+    
+    Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`.
+    
+    The operation is defined as:
+    
+    .. math::
+        \text{out}_i = \begin{cases}
+            \text{input}_i & \text{if } \text{condition}_i \\
+            \text{other}_i & \text{otherwise} \\
+        \end{cases}
+    
+    .. note::
+        The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Arguments:
+        condition (BoolTensor): When True (nonzero), yield input, otherwise yield other
+        input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``True``
+        other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``False``
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other`
+    
+    Example::
+    
+        >>> x = torch.randn(3, 2)
+        >>> y = torch.ones(3, 2)
+        >>> x
+        tensor([[-0.4620,  0.3139],
+                [ 0.3898, -0.7197],
+                [ 0.0478, -0.1657]])
+        >>> torch.where(x > 0, 1.0, 0.0)
+        tensor([[0., 1.],
+                [1., 0.],
+                [1., 0.]])
+        >>> torch.where(x > 0, x, y)
+        tensor([[ 1.0000,  0.3139],
+                [ 0.3898,  1.0000],
+                [ 0.0478,  1.0000]])
+        >>> x = torch.randn(2, 2, dtype=torch.double)
+        >>> x
+        tensor([[ 1.0779,  0.0383],
+                [-0.8785, -1.1089]], dtype=torch.float64)
+        >>> torch.where(x > 0, x, 0.)
+        tensor([[1.0779, 0.0383],
+                [0.0000, 0.0000]], dtype=torch.float64)
+    
+    .. function:: where(condition) -> tuple of LongTensor
+       :noindex:
+    
+    ``torch.where(condition)`` is identical to
+    ``torch.nonzero(condition, as_tuple=True)``.
+    
+    .. note::
+        See also :func:`torch.nonzero`.
+    """
+    ...
+@overload
+def where(condition: Tensor, input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    where(condition, input, other, *, out=None) -> Tensor
+    
+    Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`.
+    
+    The operation is defined as:
+    
+    .. math::
+        \text{out}_i = \begin{cases}
+            \text{input}_i & \text{if } \text{condition}_i \\
+            \text{other}_i & \text{otherwise} \\
+        \end{cases}
+    
+    .. note::
+        The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Arguments:
+        condition (BoolTensor): When True (nonzero), yield input, otherwise yield other
+        input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``True``
+        other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``False``
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other`
+    
+    Example::
+    
+        >>> x = torch.randn(3, 2)
+        >>> y = torch.ones(3, 2)
+        >>> x
+        tensor([[-0.4620,  0.3139],
+                [ 0.3898, -0.7197],
+                [ 0.0478, -0.1657]])
+        >>> torch.where(x > 0, 1.0, 0.0)
+        tensor([[0., 1.],
+                [1., 0.],
+                [1., 0.]])
+        >>> torch.where(x > 0, x, y)
+        tensor([[ 1.0000,  0.3139],
+                [ 0.3898,  1.0000],
+                [ 0.0478,  1.0000]])
+        >>> x = torch.randn(2, 2, dtype=torch.double)
+        >>> x
+        tensor([[ 1.0779,  0.0383],
+                [-0.8785, -1.1089]], dtype=torch.float64)
+        >>> torch.where(x > 0, x, 0.)
+        tensor([[1.0779, 0.0383],
+                [0.0000, 0.0000]], dtype=torch.float64)
+    
+    .. function:: where(condition) -> tuple of LongTensor
+       :noindex:
+    
+    ``torch.where(condition)`` is identical to
+    ``torch.nonzero(condition, as_tuple=True)``.
+    
+    .. note::
+        See also :func:`torch.nonzero`.
+    """
+    ...
+@overload
+def where(condition: Tensor, self: Union[Number, _complex], other: Tensor) -> Tensor: 
+    r"""
+    where(condition, input, other, *, out=None) -> Tensor
+    
+    Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`.
+    
+    The operation is defined as:
+    
+    .. math::
+        \text{out}_i = \begin{cases}
+            \text{input}_i & \text{if } \text{condition}_i \\
+            \text{other}_i & \text{otherwise} \\
+        \end{cases}
+    
+    .. note::
+        The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Arguments:
+        condition (BoolTensor): When True (nonzero), yield input, otherwise yield other
+        input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``True``
+        other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``False``
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other`
+    
+    Example::
+    
+        >>> x = torch.randn(3, 2)
+        >>> y = torch.ones(3, 2)
+        >>> x
+        tensor([[-0.4620,  0.3139],
+                [ 0.3898, -0.7197],
+                [ 0.0478, -0.1657]])
+        >>> torch.where(x > 0, 1.0, 0.0)
+        tensor([[0., 1.],
+                [1., 0.],
+                [1., 0.]])
+        >>> torch.where(x > 0, x, y)
+        tensor([[ 1.0000,  0.3139],
+                [ 0.3898,  1.0000],
+                [ 0.0478,  1.0000]])
+        >>> x = torch.randn(2, 2, dtype=torch.double)
+        >>> x
+        tensor([[ 1.0779,  0.0383],
+                [-0.8785, -1.1089]], dtype=torch.float64)
+        >>> torch.where(x > 0, x, 0.)
+        tensor([[1.0779, 0.0383],
+                [0.0000, 0.0000]], dtype=torch.float64)
+    
+    .. function:: where(condition) -> tuple of LongTensor
+       :noindex:
+    
+    ``torch.where(condition)`` is identical to
+    ``torch.nonzero(condition, as_tuple=True)``.
+    
+    .. note::
+        See also :func:`torch.nonzero`.
+    """
+    ...
+@overload
+def where(condition: Tensor, input: Tensor, other: Union[Number, _complex]) -> Tensor: 
+    r"""
+    where(condition, input, other, *, out=None) -> Tensor
+    
+    Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`.
+    
+    The operation is defined as:
+    
+    .. math::
+        \text{out}_i = \begin{cases}
+            \text{input}_i & \text{if } \text{condition}_i \\
+            \text{other}_i & \text{otherwise} \\
+        \end{cases}
+    
+    .. note::
+        The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Arguments:
+        condition (BoolTensor): When True (nonzero), yield input, otherwise yield other
+        input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``True``
+        other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``False``
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other`
+    
+    Example::
+    
+        >>> x = torch.randn(3, 2)
+        >>> y = torch.ones(3, 2)
+        >>> x
+        tensor([[-0.4620,  0.3139],
+                [ 0.3898, -0.7197],
+                [ 0.0478, -0.1657]])
+        >>> torch.where(x > 0, 1.0, 0.0)
+        tensor([[0., 1.],
+                [1., 0.],
+                [1., 0.]])
+        >>> torch.where(x > 0, x, y)
+        tensor([[ 1.0000,  0.3139],
+                [ 0.3898,  1.0000],
+                [ 0.0478,  1.0000]])
+        >>> x = torch.randn(2, 2, dtype=torch.double)
+        >>> x
+        tensor([[ 1.0779,  0.0383],
+                [-0.8785, -1.1089]], dtype=torch.float64)
+        >>> torch.where(x > 0, x, 0.)
+        tensor([[1.0779, 0.0383],
+                [0.0000, 0.0000]], dtype=torch.float64)
+    
+    .. function:: where(condition) -> tuple of LongTensor
+       :noindex:
+    
+    ``torch.where(condition)`` is identical to
+    ``torch.nonzero(condition, as_tuple=True)``.
+    
+    .. note::
+        See also :func:`torch.nonzero`.
+    """
+    ...
+@overload
+def where(condition: Tensor, self: Union[Number, _complex], other: Union[Number, _complex]) -> Tensor: 
+    r"""
+    where(condition, input, other, *, out=None) -> Tensor
+    
+    Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`.
+    
+    The operation is defined as:
+    
+    .. math::
+        \text{out}_i = \begin{cases}
+            \text{input}_i & \text{if } \text{condition}_i \\
+            \text{other}_i & \text{otherwise} \\
+        \end{cases}
+    
+    .. note::
+        The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`.
+    
+    Arguments:
+        condition (BoolTensor): When True (nonzero), yield input, otherwise yield other
+        input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``True``
+        other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices
+                              where :attr:`condition` is ``False``
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+    
+    Returns:
+        Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other`
+    
+    Example::
+    
+        >>> x = torch.randn(3, 2)
+        >>> y = torch.ones(3, 2)
+        >>> x
+        tensor([[-0.4620,  0.3139],
+                [ 0.3898, -0.7197],
+                [ 0.0478, -0.1657]])
+        >>> torch.where(x > 0, 1.0, 0.0)
+        tensor([[0., 1.],
+                [1., 0.],
+                [1., 0.]])
+        >>> torch.where(x > 0, x, y)
+        tensor([[ 1.0000,  0.3139],
+                [ 0.3898,  1.0000],
+                [ 0.0478,  1.0000]])
+        >>> x = torch.randn(2, 2, dtype=torch.double)
+        >>> x
+        tensor([[ 1.0779,  0.0383],
+                [-0.8785, -1.1089]], dtype=torch.float64)
+        >>> torch.where(x > 0, x, 0.)
+        tensor([[1.0779, 0.0383],
+                [0.0000, 0.0000]], dtype=torch.float64)
+    
+    .. function:: where(condition) -> tuple of LongTensor
+       :noindex:
+    
+    ``torch.where(condition)`` is identical to
+    ``torch.nonzero(condition, as_tuple=True)``.
+    
+    .. note::
+        See also :func:`torch.nonzero`.
+    """
+    ...
+@overload
+def xlogy(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    xlogy(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.xlogy`.
+    """
+    ...
+@overload
+def xlogy(self: Union[Number, _complex], other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    xlogy(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.xlogy`.
+    """
+    ...
+@overload
+def xlogy(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: 
+    r"""
+    xlogy(input, other, *, out=None) -> Tensor
+    
+    Alias for :func:`torch.special.xlogy`.
+    """
+    ...
+@overload
+def xlogy_(input: Tensor, other: Tensor) -> Tensor: ...
+@overload
+def xlogy_(input: Tensor, other: Union[Number, _complex]) -> Tensor: ...
+def zero_(input: Tensor) -> Tensor: ...
+@overload
+def zeros(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `0`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.zeros(2, 3)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    
+        >>> torch.zeros(5)
+        tensor([ 0.,  0.,  0.,  0.,  0.])
+    """
+    ...
+@overload
+def zeros(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `0`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.zeros(2, 3)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    
+        >>> torch.zeros(5)
+        tensor([ 0.,  0.,  0.,  0.,  0.])
+    """
+    ...
+@overload
+def zeros(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `0`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.zeros(2, 3)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    
+        >>> torch.zeros(5)
+        tensor([ 0.,  0.,  0.,  0.,  0.])
+    """
+    ...
+@overload
+def zeros(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+    
+    Returns a tensor filled with the scalar value `0`, with the shape defined
+    by the variable argument :attr:`size`.
+    
+    Args:
+        size (int...): a sequence of integers defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple.
+    
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+    
+    Example::
+    
+        >>> torch.zeros(2, 3)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    
+        >>> torch.zeros(5)
+        tensor([ 0.,  0.,  0.,  0.,  0.])
+    """
+    ...
+def zeros_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: 
+    r"""
+    zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+    
+    Returns a tensor filled with the scalar value `0`, with the same size as
+    :attr:`input`. ``torch.zeros_like(input)`` is equivalent to
+    ``torch.zeros(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``.
+    
+    .. warning::
+        As of 0.4, this function does not support an :attr:`out` keyword. As an alternative,
+        the old ``torch.zeros_like(input, out=output)`` is equivalent to
+        ``torch.zeros(input.size(), out=output)``.
+    
+    Args:
+        input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+    
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+            Default: if ``None``, defaults to the dtype of :attr:`input`.
+        layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+            Default: if ``None``, defaults to the layout of :attr:`input`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if ``None``, defaults to the device of :attr:`input`.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.preserve_format``.
+    
+    Example::
+    
+        >>> input = torch.empty(2, 3)
+        >>> torch.zeros_like(input)
+        tensor([[ 0.,  0.,  0.],
+                [ 0.,  0.,  0.]])
+    """
+    ...
+
+__all__ = ['__and__', '__lshift__', '__or__', '__rshift__', '__xor__', '_adaptive_avg_pool2d',
+ '_adaptive_avg_pool3d', '_add_batch_dim', '_add_relu', '_add_relu_', '_addmm_activation',
+ '_aminmax', '_amp_foreach_non_finite_check_and_unscale_', '_amp_update_scale_', '_assert_async',
+ '_assert_scalar', '_assert_tensor_metadata', '_batch_norm_impl_index', '_cast_Byte', '_cast_Char',
+ '_cast_Double', '_cast_Float', '_cast_Half', '_cast_Int', '_cast_Long', '_cast_Short',
+ '_choose_qparams_per_tensor', '_chunk_cat', '_coalesce', '_compute_linear_combination', '_conj',
+ '_conj_copy', '_conj_physical', '_convert_indices_from_coo_to_csr',
+ '_convert_indices_from_csr_to_coo', '_convert_weight_to_int4pack', '_convolution',
+ '_convolution_mode', '_copy_from', '_copy_from_and_resize', '_cslt_compress', '_cslt_sparse_mm',
+ '_cslt_sparse_mm_search', '_ctc_loss', '_cudnn_ctc_loss', '_cudnn_init_dropout_state',
+ '_cudnn_rnn', '_cudnn_rnn_flatten_weight', '_cufft_clear_plan_cache',
+ '_cufft_get_plan_cache_max_size', '_cufft_get_plan_cache_size', '_cufft_set_plan_cache_max_size',
+ '_cummax_helper', '_cummin_helper', '_debug_has_internal_overlap', '_dim_arange',
+ '_dirichlet_grad', '_disable_functionalization', '_efficientzerotensor', '_embedding_bag',
+ '_embedding_bag_forward_only', '_empty_affine_quantized', '_empty_per_channel_affine_quantized',
+ '_enable_functionalization', '_euclidean_dist', '_fake_quantize_learnable_per_channel_affine',
+ '_fake_quantize_learnable_per_tensor_affine',
+ '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams',
+ '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', '_fft_c2c', '_fft_c2r', '_fft_r2c',
+ '_fill_mem_eff_dropout_mask_', '_foobar', '_foreach_abs', '_foreach_abs_', '_foreach_acos',
+ '_foreach_acos_', '_foreach_add', '_foreach_add_', '_foreach_addcdiv', '_foreach_addcdiv_',
+ '_foreach_addcmul', '_foreach_addcmul_', '_foreach_asin', '_foreach_asin_', '_foreach_atan',
+ '_foreach_atan_', '_foreach_ceil', '_foreach_ceil_', '_foreach_clamp_max', '_foreach_clamp_max_',
+ '_foreach_clamp_min', '_foreach_clamp_min_', '_foreach_copy_', '_foreach_cos', '_foreach_cos_',
+ '_foreach_cosh', '_foreach_cosh_', '_foreach_div', '_foreach_div_', '_foreach_erf',
+ '_foreach_erf_', '_foreach_erfc', '_foreach_erfc_', '_foreach_exp', '_foreach_exp_',
+ '_foreach_expm1', '_foreach_expm1_', '_foreach_floor', '_foreach_floor_', '_foreach_frac',
+ '_foreach_frac_', '_foreach_lerp', '_foreach_lerp_', '_foreach_lgamma', '_foreach_lgamma_',
+ '_foreach_log', '_foreach_log10', '_foreach_log10_', '_foreach_log1p', '_foreach_log1p_',
+ '_foreach_log2', '_foreach_log2_', '_foreach_log_', '_foreach_maximum', '_foreach_maximum_',
+ '_foreach_minimum', '_foreach_minimum_', '_foreach_mul', '_foreach_mul_', '_foreach_neg',
+ '_foreach_neg_', '_foreach_norm', '_foreach_pow', '_foreach_pow_', '_foreach_reciprocal',
+ '_foreach_reciprocal_', '_foreach_round', '_foreach_round_', '_foreach_sigmoid',
+ '_foreach_sigmoid_', '_foreach_sign', '_foreach_sign_', '_foreach_sin', '_foreach_sin_',
+ '_foreach_sinh', '_foreach_sinh_', '_foreach_sqrt', '_foreach_sqrt_', '_foreach_sub',
+ '_foreach_sub_', '_foreach_tan', '_foreach_tan_', '_foreach_tanh', '_foreach_tanh_',
+ '_foreach_trunc', '_foreach_trunc_', '_foreach_zero_', '_from_functional_tensor',
+ '_functional_assert_async', '_functional_assert_scalar', '_functional_sym_constrain_range',
+ '_functional_sym_constrain_range_for_size',
+ '_functionalize_are_all_mutations_hidden_from_autograd',
+ '_functionalize_are_all_mutations_under_no_grad_or_inference_mode', '_functionalize_commit_update',
+ '_functionalize_mark_mutation_hidden_from_autograd', '_functionalize_replace',
+ '_functionalize_sync', '_fused_adam_', '_fused_adamw_', '_fused_dropout',
+ '_fused_moving_avg_obs_fq_helper', '_fused_moving_avg_obs_fq_helper', '_fused_sdp_choice',
+ '_fused_sgd_', '_fw_primal_copy', '_grid_sampler_2d_cpu_fallback',
+ '_has_compatible_shallow_copy_type', '_histogramdd_bin_edges', '_histogramdd_from_bin_cts',
+ '_histogramdd_from_bin_tensors', '_index_put_impl_', '_indices_copy', '_int_mm', '_is_all_true',
+ '_is_any_true', '_is_functional_tensor', '_is_zerotensor', '_lazy_clone', '_linalg_check_errors',
+ '_linalg_det', '_linalg_det', '_linalg_eigh', '_linalg_eigh', '_linalg_slogdet', '_linalg_slogdet',
+ '_linalg_solve_ex', '_linalg_solve_ex', '_linalg_svd', '_linalg_svd', '_log_softmax',
+ '_log_softmax_backward_data', '_logcumsumexp', '_lstm_mps', '_lu_with_info', '_lu_with_info',
+ '_make_dep_token', '_make_dual', '_make_dual_copy', '_make_per_channel_quantized_tensor',
+ '_make_per_tensor_quantized_tensor', '_masked_scale', '_masked_softmax', '_mixed_dtypes_linear',
+ '_mkldnn_reshape', '_mkldnn_transpose', '_mkldnn_transpose_', '_mps_convolution',
+ '_mps_convolution_transpose', '_native_batch_norm_legit', '_native_batch_norm_legit_no_training',
+ '_native_multi_head_attention', '_neg_view', '_neg_view_copy', '_nested_from_padded',
+ '_nested_from_padded_and_nested_example', '_nested_get_jagged_dummy', '_nested_get_lengths',
+ '_nested_get_offsets', '_nested_get_ragged_idx', '_nested_get_values', '_nested_get_values_copy',
+ '_nested_tensor_from_mask', '_nested_tensor_from_mask_left_aligned',
+ '_nested_tensor_from_tensor_list', '_nested_tensor_softmax_with_shape', '_nested_view_from_buffer',
+ '_nested_view_from_buffer_copy', '_nested_view_from_jagged', '_nested_view_from_jagged_copy',
+ '_nnpack_available', '_nnpack_spatial_convolution', '_pack_padded_sequence',
+ '_pad_packed_sequence', '_pin_memory', '_prelu_kernel', '_print', '_propagate_xla_data',
+ '_remove_batch_dim', '_reshape_alias_copy', '_reshape_from_tensor', '_resize_output_',
+ '_rowwise_prune', '_sample_dirichlet', '_saturate_weight_to_fp16',
+ '_scaled_dot_product_attention_math', '_scaled_dot_product_cudnn_attention',
+ '_scaled_dot_product_cudnn_attention', '_scaled_dot_product_efficient_attention',
+ '_scaled_dot_product_efficient_attention', '_scaled_dot_product_flash_attention',
+ '_scaled_dot_product_flash_attention', '_scaled_dot_product_flash_attention_for_cpu',
+ '_scaled_dot_product_flash_attention_for_cpu', '_scaled_mm', '_shape_as_tensor',
+ '_sobol_engine_draw', '_sobol_engine_ff_', '_sobol_engine_initialize_state_',
+ '_sobol_engine_scramble_', '_softmax', '_softmax_backward_data', '_sparse_broadcast_to',
+ '_sparse_broadcast_to_copy', '_sparse_csr_prod', '_sparse_csr_sum',
+ '_sparse_log_softmax_backward_data', '_sparse_semi_structured_linear',
+ '_sparse_softmax_backward_data', '_sparse_sparse_matmul', '_sparse_sum', '_stack',
+ '_standard_gamma', '_standard_gamma_grad', '_sync', '_test_autograd_multiple_dispatch',
+ '_test_autograd_multiple_dispatch_view', '_test_autograd_multiple_dispatch_view_copy',
+ '_test_check_tensor', '_test_functorch_fallback', '_test_parallel_materialize',
+ '_test_serialization_subcmul', '_to_cpu', '_to_functional_tensor', '_to_sparse_semi_structured',
+ '_transform_bias_rescale_qkv', '_transformer_encoder_layer_fwd', '_trilinear',
+ '_triton_multi_head_attention', '_triton_scaled_dot_attention', '_unique', '_unique2',
+ '_unpack_dual', '_unpack_dual', '_unsafe_index', '_unsafe_index_put', '_use_cudnn_ctc_loss',
+ '_use_cudnn_rnn_flatten_weight', '_validate_compressed_sparse_indices',
+ '_validate_sparse_bsc_tensor_args', '_validate_sparse_bsr_tensor_args',
+ '_validate_sparse_compressed_tensor_args', '_validate_sparse_coo_tensor_args',
+ '_validate_sparse_csc_tensor_args', '_validate_sparse_csr_tensor_args', '_values_copy',
+ '_weight_int4pack_mm', '_weight_int8pack_mm', '_weight_norm', '_weight_norm_interface', 'abs',
+ 'abs_', 'absolute', 'acos', 'acos_', 'acosh', 'acosh_', 'adaptive_avg_pool1d',
+ 'adaptive_max_pool1d', 'add', 'addbmm', 'addcdiv', 'addcmul', 'addmm', 'addmv', 'addmv_', 'addr',
+ 'adjoint', 'affine_grid_generator', 'alias_copy', 'all', 'allclose', 'alpha_dropout',
+ 'alpha_dropout_', 'amax', 'amin', 'aminmax', 'aminmax', 'angle', 'any', 'arange', 'arccos',
+ 'arccos_', 'arccosh', 'arccosh_', 'arcsin', 'arcsin_', 'arcsinh', 'arcsinh_', 'arctan', 'arctan2',
+ 'arctan_', 'arctanh', 'arctanh_', 'argmax', 'argmin', 'argsort', 'argwhere', 'as_strided',
+ 'as_strided_', 'as_strided_copy', 'as_strided_scatter', 'as_tensor', 'asarray', 'asin', 'asin_',
+ 'asinh', 'asinh_', 'atan', 'atan2', 'atan_', 'atanh', 'atanh_', 'avg_pool1d', 'baddbmm',
+ 'bartlett_window', 'batch_norm', 'batch_norm_backward_elemt', 'batch_norm_backward_reduce',
+ 'batch_norm_elemt', 'batch_norm_gather_stats', 'batch_norm_gather_stats_with_counts',
+ 'batch_norm_stats', 'batch_norm_update_stats', 'bernoulli', 'bilinear',
+ 'binary_cross_entropy_with_logits', 'bincount', 'binomial', 'bitwise_and', 'bitwise_left_shift',
+ 'bitwise_not', 'bitwise_or', 'bitwise_right_shift', 'bitwise_xor', 'blackman_window', 'bmm',
+ 'broadcast_to', 'bucketize', 'can_cast', 'cat', 'ccol_indices_copy', 'ceil', 'ceil_', 'celu',
+ 'celu_', 'channel_shuffle', 'cholesky', 'cholesky_inverse', 'cholesky_solve',
+ 'choose_qparams_optimized', 'chunk', 'clamp', 'clamp_', 'clamp_max', 'clamp_max_', 'clamp_min',
+ 'clamp_min_', 'clip', 'clip_', 'clone', 'col_indices_copy', 'column_stack', 'combinations',
+ 'complex', 'concat', 'concatenate', 'conj', 'conj_physical', 'conj_physical_', 'constant_pad_nd',
+ 'conv1d', 'conv2d', 'conv3d', 'conv_tbc', 'conv_transpose1d', 'conv_transpose2d',
+ 'conv_transpose3d', 'convolution', 'copysign', 'corrcoef', 'cos', 'cos_', 'cosh', 'cosh_',
+ 'cosine_embedding_loss', 'cosine_similarity', 'count_nonzero', 'cov', 'cross', 'crow_indices_copy',
+ 'ctc_loss', 'cudnn_affine_grid_generator', 'cudnn_batch_norm', 'cudnn_convolution',
+ 'cudnn_convolution_add_relu', 'cudnn_convolution_relu', 'cudnn_convolution_transpose',
+ 'cudnn_grid_sampler', 'cudnn_is_acceptable', 'cummax', 'cummax', 'cummin', 'cummin', 'cumprod',
+ 'cumsum', 'cumulative_trapezoid', 'deg2rad', 'deg2rad_', 'dequantize', 'det', 'detach', 'detach_',
+ 'detach_copy', 'diag', 'diag_embed', 'diagflat', 'diagonal', 'diagonal_copy', 'diagonal_scatter',
+ 'diff', 'digamma', 'dist', 'div', 'divide', 'dot', 'dropout', 'dropout_', 'dsmm', 'dsplit',
+ 'dstack', 'embedding', 'embedding_bag', 'embedding_renorm_', 'empty', 'empty_like',
+ 'empty_permuted', 'empty_quantized', 'empty_strided', 'eq', 'equal', 'erf', 'erf_', 'erfc',
+ 'erfc_', 'erfinv', 'exp', 'exp2', 'exp2_', 'exp_', 'expand_copy', 'expm1', 'expm1_', 'eye',
+ 'fake_quantize_per_channel_affine', 'fake_quantize_per_tensor_affine', 'fbgemm_linear_fp16_weight',
+ 'fbgemm_linear_fp16_weight_fp32_activation', 'fbgemm_linear_int8_weight',
+ 'fbgemm_linear_int8_weight_fp32_activation', 'fbgemm_linear_quantize_weight',
+ 'fbgemm_pack_gemm_matrix_fp16', 'fbgemm_pack_quantized_matrix', 'feature_alpha_dropout',
+ 'feature_alpha_dropout_', 'feature_dropout', 'feature_dropout_', 'fill', 'fill_', 'fix', 'fix_',
+ 'flatten', 'flip', 'fliplr', 'flipud', 'float_power', 'floor', 'floor_', 'floor_divide', 'fmax',
+ 'fmin', 'fmod', 'frac', 'frac_', 'frexp', 'frexp', 'frobenius_norm', 'from_file', 'from_numpy',
+ 'frombuffer', 'full', 'full_like', 'fused_moving_avg_obs_fake_quant', 'gather', 'gcd', 'gcd_',
+ 'ge', 'geqrf', 'geqrf', 'ger', 'get_default_dtype', 'get_num_interop_threads', 'get_num_threads',
+ 'gradient', 'greater', 'greater_equal', 'grid_sampler', 'grid_sampler_2d', 'grid_sampler_3d',
+ 'group_norm', 'gru', 'gru_cell', 'gt', 'hamming_window', 'hann_window', 'hardshrink', 'heaviside',
+ 'hinge_embedding_loss', 'histc', 'histogram', 'histogram', 'histogramdd', 'histogramdd', 'hsmm',
+ 'hsplit', 'hspmm', 'hstack', 'hypot', 'i0', 'i0_', 'igamma', 'igammac', 'imag', 'index_add',
+ 'index_copy', 'index_fill', 'index_put', 'index_put_', 'index_reduce', 'index_select',
+ 'indices_copy', 'init_num_threads', 'inner', 'instance_norm', 'int_repr', 'inverse', 'is_complex',
+ 'is_conj', 'is_distributed', 'is_floating_point', 'is_grad_enabled', 'is_inference',
+ 'is_inference_mode_enabled', 'is_neg', 'is_nonzero', 'is_same_size', 'is_signed',
+ 'is_vulkan_available', 'isclose', 'isfinite', 'isin', 'isinf', 'isnan', 'isneginf', 'isposinf',
+ 'isreal', 'istft', 'kaiser_window', 'kl_div', 'kron', 'kthvalue', 'kthvalue', 'layer_norm', 'lcm',
+ 'lcm_', 'ldexp', 'ldexp_', 'le', 'lerp', 'less', 'less_equal', 'lgamma', 'linspace', 'log',
+ 'log10', 'log10_', 'log1p', 'log1p_', 'log2', 'log2_', 'log_', 'log_softmax', 'logaddexp',
+ 'logaddexp2', 'logcumsumexp', 'logdet', 'logical_and', 'logical_not', 'logical_or', 'logical_xor',
+ 'logit', 'logit_', 'logspace', 'logsumexp', 'lstm', 'lstm_cell', 'lt', 'lu_solve', 'lu_unpack',
+ 'lu_unpack', 'margin_ranking_loss', 'masked_fill', 'masked_scatter', 'masked_select', 'matmul',
+ 'matrix_exp', 'matrix_power', 'max', 'max', 'max_pool1d', 'max_pool1d_with_indices', 'max_pool2d',
+ 'max_pool3d', 'maximum', 'mean', 'median', 'median', 'min', 'min', 'minimum', 'miopen_batch_norm',
+ 'miopen_convolution', 'miopen_convolution_add_relu', 'miopen_convolution_relu',
+ 'miopen_convolution_transpose', 'miopen_depthwise_convolution', 'miopen_rnn',
+ 'mkldnn_adaptive_avg_pool2d', 'mkldnn_convolution', 'mkldnn_linear_backward_weights',
+ 'mkldnn_max_pool2d', 'mkldnn_max_pool3d', 'mkldnn_rnn_layer', 'mm', 'mode', 'mode', 'moveaxis',
+ 'movedim', 'msort', 'mul', 'multinomial', 'multiply', 'mv', 'mvlgamma', 'nan_to_num',
+ 'nan_to_num_', 'nanmean', 'nanmedian', 'nanmedian', 'nanquantile', 'nansum', 'narrow',
+ 'narrow_copy', 'native_batch_norm', 'native_channel_shuffle', 'native_dropout',
+ 'native_group_norm', 'native_layer_norm', 'native_norm', 'ne', 'neg', 'neg_', 'negative',
+ 'negative_', 'nextafter', 'nonzero', 'nonzero_static', 'norm_except_dim', 'normal', 'not_equal',
+ 'nuclear_norm', 'numel', 'ones', 'ones_like', 'orgqr', 'ormqr', 'outer', 'pairwise_distance',
+ 'pdist', 'permute', 'permute_copy', 'pinverse', 'pixel_shuffle', 'pixel_unshuffle', 'poisson',
+ 'poisson_nll_loss', 'polar', 'polygamma', 'positive', 'pow', 'prelu', 'prod', 'promote_types',
+ 'put', 'q_per_channel_axis', 'q_per_channel_scales', 'q_per_channel_zero_points', 'q_scale',
+ 'q_zero_point', 'qr', 'qr', 'quantile', 'quantize_per_channel', 'quantize_per_tensor',
+ 'quantize_per_tensor_dynamic', 'quantized_batch_norm', 'quantized_gru_cell', 'quantized_lstm_cell',
+ 'quantized_max_pool1d', 'quantized_max_pool2d', 'quantized_max_pool3d', 'quantized_rnn_relu_cell',
+ 'quantized_rnn_tanh_cell', 'rad2deg', 'rad2deg_', 'rand', 'rand_like', 'randint', 'randint_like',
+ 'randn', 'randn_like', 'randperm', 'range', 'ravel', 'real', 'reciprocal', 'reciprocal_', 'relu',
+ 'relu_', 'remainder', 'renorm', 'repeat_interleave', 'reshape', 'resize_as_', 'resize_as_sparse_',
+ 'resolve_conj', 'resolve_neg', 'result_type', 'rnn_relu', 'rnn_relu_cell', 'rnn_tanh',
+ 'rnn_tanh_cell', 'roll', 'rot90', 'round', 'round_', 'row_indices_copy', 'row_stack', 'rrelu',
+ 'rrelu_', 'rsqrt', 'rsqrt_', 'rsub', 'saddmm', 'scalar_tensor', 'scatter', 'scatter_add',
+ 'scatter_reduce', 'searchsorted', 'segment_reduce', 'select', 'select_copy', 'select_scatter',
+ 'selu', 'selu_', 'set_flush_denormal', 'set_num_interop_threads', 'set_num_threads', 'sgn',
+ 'sigmoid', 'sigmoid_', 'sign', 'signbit', 'sin', 'sin_', 'sinc', 'sinc_', 'sinh', 'sinh_',
+ 'slice_copy', 'slice_inverse', 'slice_scatter', 'slogdet', 'slogdet', 'smm', 'softmax', 'sort',
+ 'sort', 'sparse_bsc_tensor', 'sparse_bsr_tensor', 'sparse_compressed_tensor', 'sparse_coo_tensor',
+ 'sparse_csc_tensor', 'sparse_csr_tensor', 'split_copy', 'split_with_sizes',
+ 'split_with_sizes_copy', 'spmm', 'sqrt', 'sqrt_', 'square', 'square_', 'squeeze', 'squeeze_copy',
+ 'sspaddmm', 'stack', 'std', 'std_mean', 'sub', 'subtract', 'sum', 'svd', 'svd', 'swapaxes',
+ 'swapdims', 'sym_constrain_range', 'sym_constrain_range_for_size', 't', 't_copy', 'take',
+ 'take_along_dim', 'tan', 'tan_', 'tanh', 'tanh_', 'tensor', 'tensor_split', 'threshold',
+ 'threshold_', 'tile', 'topk', 'topk', 'trace', 'transpose', 'transpose_copy', 'trapezoid', 'trapz',
+ 'triangular_solve', 'triangular_solve', 'tril', 'tril_indices', 'triplet_margin_loss', 'triu',
+ 'triu_indices', 'true_divide', 'trunc', 'trunc_', 'unbind', 'unbind_copy', 'unflatten',
+ 'unfold_copy', 'unique_dim', 'unsafe_chunk', 'unsafe_split', 'unsafe_split_with_sizes',
+ 'unsqueeze', 'unsqueeze_copy', 'values_copy', 'vander', 'var', 'var_mean', 'vdot',
+ 'view_as_complex', 'view_as_complex_copy', 'view_as_real', 'view_as_real_copy', 'view_copy',
+ 'vsplit', 'vstack', 'where', 'xlogy', 'xlogy_', 'zero_', 'zeros', 'zeros_like']
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__config__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__config__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7e3e209654a8846ddc42d31220101340043c276
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__config__.py
@@ -0,0 +1,22 @@
+import torch
+
+
+def show():
+    """
+    Return a human-readable string with descriptions of the
+    configuration of PyTorch.
+    """
+    return torch._C._show_config()  # the string is produced by the torch._C binding
+
+
+# TODO: In principle, we could provide more structured version/config
+# information here. For now only CXX_FLAGS is exposed, as Timer
+# uses them.
+def _cxx_flags():
+    """Returns the CXX_FLAGS used when building PyTorch."""
+    return torch._C._cxx_flags()  # thin wrapper over the torch._C binding
+
+
+def parallel_info():
+    r"""Returns a detailed string with parallelization settings."""
+    return torch._C._parallel_info()  # thin wrapper over the torch._C binding
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_compile.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..354d64e9ff9fddc9a1dc321241ce8bea7955b58a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_compile.py
@@ -0,0 +1,30 @@
+"""
+APIs related to torch.compile which lazily import torch._dynamo to avoid
+circular dependencies.
+"""
+import functools
+
+
+def _disable_dynamo(fn=None, recursive=True):
+    """
+    This API should be only used inside torch, external users should still use
+    torch._dynamo.disable. The main goal of this API is to avoid circular
+    import issues that are common while using _dynamo.disable inside torch
+    itself.
+
+    This API avoids them by deferring the import of torch._dynamo from import
+    time to the invocation of the decorated function.
+    """
+    if fn is not None:
+
+        @functools.wraps(fn)
+        def inner(*args, **kwargs):
+            import torch._dynamo  # imported on each call, not at module import time
+
+            return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
+
+        return inner
+    else:
+        # decorator usage like @_disable_dynamo(recursive=False). The resulting
+        # object expects the original decorated function as the arg.
+        return functools.partial(_disable_dynamo, recursive=recursive)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_jit_internal.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_jit_internal.py
new file mode 100644
index 0000000000000000000000000000000000000000..64509816e09cf029bce0663287d94661c6c4585c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_jit_internal.py
@@ -0,0 +1,1510 @@
+"""
+The weak_script annotation needs to be here instead of inside torch/jit/ so it
+can be used in other places in torch/ (namely torch.nn) without running into
+circular dependency problems
+"""
+
+import ast
+import builtins
+import collections
+import contextlib
+import enum
+import inspect
+import io
+import pickle
+import sys
+import threading
+import types
+import typing
+import warnings
+import weakref
+from textwrap import dedent
+from typing import (  # noqa: F401
+    Any,
+    Callable,
+    Dict,
+    Final,
+    ForwardRef,
+    Generic,
+    get_args,  # new in 3.8
+    get_origin,  # new in 3.8
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
+
+import torch
+
+# This is needed. `torch._jit_internal` is imported before `torch.distributed.__init__`.
+# Explicitly ask to import `torch.distributed.__init__` first.
+# Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised.
+import torch.distributed.rpc
+import torch.package._mangling as package_mangling
+from torch._awaits import _Await
+from torch._C import _Await as CAwait, Future as CFuture
+from torch._sources import fake_range, get_source_lines_and_file, parse_def
+from torch.futures import Future
+
+IS_PY39_PLUS: Final[bool] = sys.version_info >= (3, 9)
+IS_PY310_PLUS: Final[bool] = sys.version_info >= (3, 10)
+
+BuiltinUnionType: Union[Type, Tuple[Type, ...]]
+if sys.version_info >= (3, 10):
+    # NOTE: IS_PY310_PLUS doesn't work with mypy.
+    # cf. https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks
+    BuiltinUnionType = types.UnionType
+else:
+    BuiltinUnionType = ()  # trick: isinstance(x, ()) is always False, so checks short circuit.
+
+LockType: Type
+try:
+    import _thread
+
+    LockType = _thread.LockType
+except ImportError:
+    import _dummy_thread  # type: ignore[import-not-found]
+
+    LockType = _dummy_thread.LockType
+
+# Registry filled in by boolean_dispatch(): maps each wrapper function (held weakly)
+# to the two candidate functions and the boolean argument that selects between them.
+boolean_dispatched: "weakref.WeakKeyDictionary[Callable, Dict[str, Callable]]" = (
+    weakref.WeakKeyDictionary()
+)  # noqa: T484
+
+
+FAKE_FILENAME_PREFIX = "__torch_jit_dataclass"
+
+
+class SourceLoader:  # in-memory cache mapping a key `fn` to its source
+    def __init__(self):
+        self.content = {}  # fn -> source, filled by cache()
+
+    def cache(self, fn, source):  # store (or overwrite) the source recorded for fn
+        self.content[fn] = source
+
+    def get_source(self, fn):  # cached source for fn, or None when nothing was cached
+        return self.content.get(fn)
+
+
+loader = SourceLoader()
+
+
+def createResolutionCallbackFromEnv(lookup_base):
+    """
+    Creates a resolution callback that will look up qualified names in an
+    environment, starting with `lookup_base` for the base of any qualified
+    names, then proceeding down the lookup chain with the resolved object.
+
+    You should not use this directly, it should only be used from the other
+    createResolutionCallbackFrom* functions.
+    """
+
+    def lookupInModule(qualified_name, module):  # resolve a dotted name via successive getattr calls
+        if "." in qualified_name:
+            base, remaining_pieces = qualified_name.split(".", maxsplit=1)
+            module_value = getattr(module, base)
+            return lookupInModule(remaining_pieces, module_value)
+        else:
+            return getattr(module, qualified_name)
+
+    def parseNestedExpr(expr, module) -> Tuple[Any, int]:  # returns (value, chars consumed)
+        i = 0
+        while i < len(expr) and expr[i] not in (",", "[", "]"):  # scan to the next delimiter
+            i += 1
+
+        # Special case logic for the empty Tuple as a subscript (used
+        # in the type annotation `Tuple[()]`)
+        if expr[:i] == "()":
+            return (), i
+
+        base = lookupInModule(expr[:i].strip(), module)
+        assert base is not None, f"Unresolvable type {expr[:i]}"
+        if i == len(expr) or expr[i] != "[":
+            return base, i  # plain name without a subscript
+
+        assert expr[i] == "["
+        parts = []
+        while expr[i] != "]":
+            part_len = 0  # immediately overwritten by the recursive call below
+            i += 1  # step over the "[" or "," delimiter
+            part, part_len = parseNestedExpr(expr[i:], module)
+            parts.append(part)
+            i += part_len
+        if len(parts) > 1:
+            return base[tuple(parts)], i + 1  # i + 1 also consumes the closing "]"
+        else:
+            return base[parts[0]], i + 1
+
+    def parseExpr(expr, module):
+        try:
+            value, len_parsed = parseNestedExpr(expr, module)
+            assert len_parsed == len(
+                expr
+            ), "whole expression was not parsed, falling back to c++ parser"
+            return value
+        except Exception:  # any failure, including the asserts above, yields None
+            """
+            The python resolver fails in several cases in known unit tests, and is intended
+            to fall back gracefully to the c++ resolver in general.  For example, python 2 style
+            annotations which are frequent in our unit tests often fail with types e.g. int not
+            resolvable from the calling frame.
+            """
+            return None
+
+    return lambda expr: parseExpr(expr, lookup_base)
+
+
+def createResolutionCallbackFromFrame(frames_up: int = 0):
+    """
+    Creates a function which, given a string variable name,
+    returns the value of the variable in the scope of the function which
+    called createResolutionCallbackFromFrame (by default).
+
+    This is used to enable access to in-scope Python variables inside
+    TorchScript fragments.
+
+    frames_up is the number of additional frames to go up on the stack.
+    The default value is 0, which corresponds to the frame of the caller
+    of createResolutionCallbackFromFrame. Also for example, if frames_up is set
+    to 1, then the frame of the caller's caller of createResolutionCallbackFromFrame
+    will be taken.
+
+    For example, the following program prints 2::
+
+        def bar():
+            cb = createResolutionCallbackFromFrame(1)
+            print(cb("foo"))
+
+        def baz():
+            foo = 2
+            bar()
+
+        baz()
+    """
+    frame = inspect.currentframe()
+    i = 0
+    while i < frames_up + 1:  # the extra +1 steps out of this function's own frame
+        assert frame is not None
+        frame = frame.f_back
+        i += 1
+
+    assert frame is not None
+    f_locals = frame.f_locals
+    f_globals = frame.f_globals
+
+    class env:
+        def __getattr__(self, key):  # lookup order: locals, then globals, then builtins
+            if key in f_locals:
+                return f_locals[key]
+            elif key in f_globals:
+                return f_globals[key]
+            elif key in dir(builtins):  # unknown names fall through and return None implicitly
+                return getattr(builtins, key)
+
+    return createResolutionCallbackFromEnv(env())
+
+
+def get_closure(fn):
+    """
+    Get a dictionary of the globals and closed over variables visible to a function
+    """
+    captures = {}
+    captures.update(fn.__globals__)  # start from a copy of the function's globals
+
+    for index, captured_name in enumerate(fn.__code__.co_freevars):
+        captures[captured_name] = fn.__closure__[index].cell_contents  # free variables shadow globals
+
+    return captures
+
+
+# [local resolution in python]
+# Depending on where a variable is defined, and where it is used, we may
+# or may not be able to recover its value when recursively compiling a
+# script function. Remember in the general case, a module or function is
+# first defined and then later scripted. This means we do not have a
+# chance to capture the active frames when the function is defined. Hence any
+# name resolution has to happen later on the created closure. The way
+# python captures type annotations restricts what we can recover. The
+# following example illustrates the different cases:
+#
+#         class MyGlobalClass:
+#         ...
+#         def my_local_scope():
+#             @torch.jit.script
+#             class MyClass:
+#                 ...
+#             @torch.jit.script
+#             class MyClassUsedAsVar:
+#                 ...
+#             def eg(x: MyClass, y: MyGlobalClass):
+#                 a_local_capture : Foo
+#                 return MyClassUsedAsVar(x)
+#
+# MyGlobalClass is defined in the __globals__ dictionary of function
+# 'eg', so it is always recoverable. my_local_scope introduces a new local
+# variable scope in the function. Classes defined here are only visible as
+# local variables. For the case of MyClassUsedAsVar, it is captured
+# because it is used as a variable inside the body of the function, and we
+# can resolve it using the captures returned from `get_closure`. However,
+# the type annotations are not captured by the closure. In Python
+# 3.0--3.9, the _value_ of MyClass and MyGlobalClass will be available as
+# annotations on `eg`, but starting in Python 4.0, they will be represented as
+# strings and no longer present. Furthermore, since the body of `eg` does
+# not reference those names, they do not appear in the list of closed over
+# variables. In Python 2.x, type annotations are in comments, leading to a
+# similar situation where their definitions are not available. We anticipate
+# that most users will not run into this issue because their modules and
+# functions will be defined at a global scope like MyGlobalClass. In cases
+# where they are not, it is possible to work around issues by declaring the
+# values global in the function.
+# In Python 3.9 declaring class as global will make it invisible to
+# `inspect.getsource`, see https://bugs.python.org/issue42666 .
+# This could be worked around by manually adding it to the `globals()` dictionary.
+
+
+def createResolutionCallbackFromClosure(fn):
+    """
+    Create a resolutionCallback by introspecting the function instead of
+    looking up the stack for the enclosing scope
+    """
+    closure = get_closure(fn)
+
+    class closure_lookup:
+        # This is a class since `closure` is a dict and it's easier in
+        # `env_helper` if everything just works with `getattr` calls
+        def __getattr__(self, key):  # lookup order: closure/globals, typing, builtins
+            if key in closure:
+                return closure[key]
+            elif hasattr(typing, key):
+                return getattr(typing, key)
+            elif hasattr(builtins, key):
+                return getattr(builtins, key)
+            return None  # unknown names resolve to None instead of raising
+
+    return createResolutionCallbackFromEnv(closure_lookup())
+
+
+def can_compile_class(cls) -> bool:
+    # If any of the functions on a type don't have a code object, this type can't
+    # be compiled and is probably a builtin / bound from C
+    if is_ignored_fn(cls):
+        return False
+
+    # Ignore the following list of built-in classes.
+    ignored_builtin_classes = (torch.nn.Module, tuple, list, Exception)
+    if issubclass(cls, ignored_builtin_classes):
+        return False
+
+    names = cls.__dict__  # only attributes defined directly on cls, not inherited ones
+    fns = [
+        getattr(cls, name)
+        for name in names
+        if inspect.isroutine(getattr(cls, name, None))
+    ]
+    has_code = [hasattr(fn, "__code__") for fn in fns]
+    return all(has_code)  # vacuously True when cls defines no routines
+
+
+def get_callable_argument_names(fn) -> List[str]:
+    """
+    Gets names of all POSITIONAL_OR_KEYWORD arguments for callable `fn`.
+    Parameters of other kinds are skipped; returns [] if the signature is unavailable.
+
+    This is used by `torch.jit.trace` to assign meaningful argument names to
+    traced functions and modules.
+
+    Args:
+        fn: A callable.
+    Returns:
+        Argument names: List[str]
+    """
+    # inspect.signature may fail, give up in that case.
+    try:
+        callable_signature = inspect.signature(fn)
+    except Exception:
+        return []
+
+    argument_names = []
+    for name, param in callable_signature.parameters.items():
+        # All four other types of arguments do not map to individual values
+        # with a keyword as name.
+        if not param.kind == param.POSITIONAL_OR_KEYWORD:
+            continue  # skip this parameter only; the remaining ones are still collected
+
+        argument_names.append(name)
+
+    return argument_names
+
+
+def get_annotation_str(annotation):
+    """
+    Convert an AST node containing a type annotation to the string present in the source
+    that represents the same annotation.
+    """
+    if isinstance(annotation, ast.Name):
+        return annotation.id
+    elif isinstance(annotation, ast.Attribute):
+        return ".".join([get_annotation_str(annotation.value), annotation.attr])
+    elif isinstance(annotation, ast.Subscript):
+        # In Python3.9+ subscript indices are not wrapped in ast.Index
+        subscript_slice = annotation.slice if IS_PY39_PLUS else annotation.slice.value  # type: ignore[attr-defined]
+        return f"{get_annotation_str(annotation.value)}[{get_annotation_str(subscript_slice)}]"
+    elif isinstance(annotation, ast.Tuple):
+        return ",".join([get_annotation_str(elt) for elt in annotation.elts])
+    elif isinstance(annotation, (ast.Constant, ast.NameConstant)):
+        return f"{annotation.value}"
+
+    # If an AST node is not handled here, it's probably handled in ScriptTypeParser.
+    return None
+
+
+def get_type_hint_captures(fn):
+    """
+    Get a dictionary containing type resolution mappings necessary to resolve types
+    for the literal annotations on 'fn'. These are not considered to be closed-over by fn
+    and must be obtained separately (e.g. using this function).
+
+    Args:
+        fn: A callable.
+    Returns:
+        A Dict[str, Any] containing a mapping from the literal annotations used on
+        fn to the Python objects they refer to.
+    """
+    # First, try to get the source of the function. We'll need to parse it to find the actual string names
+    # that were used to annotate the types, since inspect.signature() will only return the class object that
+    # the annotation refers to, not the string name. NOTE(review): despite the intent of returning an empty
+    # dict when no source exists (e.g. functions synthesized at runtime), getsource() errors propagate here.
+    src = loader.get_source(fn)
+    if src is None:
+        src = inspect.getsource(fn)
+
+    # Gather a dictionary of parameter name -> type, skipping any parameters whose annotated
+    # types are strings. These are only understood by TorchScript in the context of a type annotation
+    # that refers to a class in its own definition, but trying to include a mapping for this in the result
+    # function would cause infinite recursion because the class is currently being compiled.
+    # In addition, there is logic in ScriptTypeParser to handle this.
+    signature = inspect.signature(fn)
+    name_to_type = {
+        name: parameter.annotation
+        for name, parameter in signature.parameters.items()
+        if parameter.annotation is not inspect.Parameter.empty
+        and not isinstance(parameter.annotation, str)
+    }
+
+    # Then, get the literal type annotations from the function declaration
+    # by source inspection. This accounts for the case in which aliases are used
+    # to annotate the arguments (e.g device_t = torch.device, and then d: device_t).
+    # frontend.py cannot be used here because it includes _jit_internal, so use ast instead.
+    a = ast.parse(dedent(src))
+    if len(a.body) != 1 or not isinstance(a.body[0], ast.FunctionDef):
+        raise RuntimeError(f"Expected {fn} to be a function")
+    f = a.body[0]
+
+    # Prepare a dictionary of source annotation -> type, which will be the final result of this function,
+    # by using the parsed AST (f) to reconstruct source annotations as strings for each parameter and mapping
+    # them to the type object corresponding to the annotation via name_to_type using the parameter name.
+    annotation_to_type = {}
+
+    for arg in f.args.args:
+        # Get the source type annotation string for this argument if possible.
+        arg_annotation_str = (
+            get_annotation_str(arg.annotation) if arg.annotation else None
+        )
+
+        # If the argument has no annotation or get_annotation_str cannot convert it to a string,
+        # arg_annotation_str will be None. Skip this arg; ScriptTypeParser will probably handle
+        # this in the latter case.
+        if arg_annotation_str is None:
+            continue
+
+        # Insert {arg_annotation_str: type} into annotation_to_type if possible. One reason arg_name may not
+        # be present in name_to_type is that the annotation itself is a string and not a type object
+        # (common for self-referential annotations in classes). Once again, let ScriptTypeParser handle this.
+        arg_name = arg.arg
+        if arg_name in name_to_type:
+            annotation_to_type[arg_annotation_str] = name_to_type[arg_name]
+
+    # If there is a valid return annotation, include it in annotation_to_type. As with argument annotations,
+    # the literal annotation has to be convertible to a string by get_annotation_str, and the actual type
+    # of the annotation cannot be a string.
+    literal_return_annotation = get_annotation_str(f.returns)  # None when there is no return annotation
+    valid_literal_annotation = literal_return_annotation is not None
+    return_annotation = signature.return_annotation
+    valid_return_annotation_type = (
+        return_annotation is not inspect.Parameter.empty
+        and not isinstance(return_annotation, str)
+    )
+    if valid_literal_annotation and valid_return_annotation_type:
+        annotation_to_type[literal_return_annotation] = return_annotation
+
+    return annotation_to_type
+
+
+def createResolutionCallbackForClassMethods(cls):
+    """
+    This looks at all the methods defined in a class and pulls their closed-over
+    variables into a dictionary and uses that to resolve variables.
+    """
+    # cls is a type here, so `ismethod` is false since the methods on the type
+    # aren't bound to anything, so Python treats them as regular functions
+    fns = [
+        getattr(cls, name)
+        for name in cls.__dict__
+        if inspect.isroutine(getattr(cls, name))
+    ]
+    # Skip built-ins, as they do not have global scope nor type hints
+    # Needed to support `enum.Enum` derived classes in Python-3.11
+    # That adds `_new_member_` property which is an alias to `__new__`
+    fns = [fn for fn in fns if not inspect.isbuiltin(fn) and hasattr(fn, "__globals__")]
+    captures = {}
+
+    for fn in fns:  # on name clashes, entries from later updates overwrite earlier ones
+        captures.update(get_closure(fn))
+        captures.update(get_type_hint_captures(fn))
+
+    def lookup_in_class(key):
+        if key in captures:
+            return captures[key]
+        else:
+            return getattr(builtins, key, None)  # unknown names resolve to None
+
+    return lookup_in_class
+
+
+def boolean_dispatch(
+    arg_name, arg_index, default, if_true, if_false, module_name, func_name
+):
+    """
+    Dispatches to either of 2 script functions based on a boolean argument.
+    In TorchScript, the boolean argument must be constant so that the correct
+    function to use can be determined at compile time.
+    """
+
+    def fn(*args, **kwargs):
+        dispatch_flag = default  # used when the flag is passed neither by keyword nor by position
+        if arg_name in kwargs:  # the keyword form is checked before the positional form
+            dispatch_flag = kwargs[arg_name]
+        elif arg_index < len(args):
+            dispatch_flag = args[arg_index]
+
+        if dispatch_flag:
+            return if_true(*args, **kwargs)
+        else:
+            return if_false(*args, **kwargs)
+
+    if if_true.__doc__ is None and if_false.__doc__ is not None:
+        doc = if_false.__doc__
+        if_true.__doc__ = doc  # copy the only docstring onto the undocumented function
+    elif if_false.__doc__ is None and if_true.__doc__ is not None:
+        doc = if_true.__doc__
+        if_false.__doc__ = doc
+    elif if_false.__doc__ is None and if_true.__doc__ is None:
+        # neither function has a docstring
+        doc = None
+    else:
+        raise RuntimeError("only one function can have a docstring")  # both are documented
+    fn.__doc__ = doc
+
+    if module_name is not None:
+        fn.__module__ = module_name
+    if func_name is not None:
+        fn.__name__ = func_name
+
+    boolean_dispatched[fn] = {  # record the dispatch metadata, keyed on the wrapper
+        "if_true": if_true,
+        "if_false": if_false,
+        "index": arg_index,
+        "default": default,
+        "arg_name": arg_name,
+    }
+    return fn
+
+
+class FunctionModifiers:
+    """
+    Used to denote the behavior of a function in TorchScript. See export() and
+    ignore() for details.
+
+    The decorators in this module store one of these constants on a function as
+    its ``_torchscript_modifier`` attribute; helpers such as ``should_drop`` and
+    ``is_ignored_fn`` then compare that attribute against the constants by
+    identity (``is``).
+    """
+
+    # Set by `unused()` and by `ignore()` when its drop flag is truthy.
+    UNUSED = "unused (ignored and replaced with raising of an exception)"
+    # Set by `ignore()` when nothing is dropped.
+    IGNORE = "ignore (leave as a call to Python, cannot be torch.jit.save'd)"
+    # Set by `export()`.
+    EXPORT = "export (compile this function even if nothing calls it)"
+    # Returned by `get_torchscript_modifier` for callables that carry no modifier.
+    DEFAULT = "default (compile if called from a exported function / forward)"
+    # Set by `_copy_to_script_wrapper()`.
+    COPY_TO_SCRIPT_WRAPPER = (
+        "if this method is not scripted, copy the python method onto the scripted model"
+    )
+    # Set by `_drop()`.
+    _DROP = "_drop (function is fully ignored, declaration can be unscriptable)"
+
+
+def export(fn):
+    """
+    This decorator indicates that a method on an ``nn.Module`` is used as an entry point into a
+    :class:`ScriptModule` and should be compiled.
+
+    ``forward`` implicitly is assumed to be an entry point, so it does not need this decorator.
+    Functions and methods called from ``forward`` are compiled as they are seen
+    by the compiler, so they do not need this decorator either.
+
+    The decorator only tags ``fn`` (its ``_torchscript_modifier`` attribute is set to
+    ``FunctionModifiers.EXPORT``) and returns the same object.
+
+    Example (using ``@torch.jit.export`` on a method):
+
+    .. testcode::
+
+        import torch
+        import torch.nn as nn
+
+        class MyModule(nn.Module):
+            def implicitly_compiled_method(self, x):
+                return x + 99
+
+            # `forward` is implicitly decorated with `@torch.jit.export`,
+            # so adding it here would have no effect
+            def forward(self, x):
+                return x + 10
+
+            @torch.jit.export
+            def another_forward(self, x):
+                # When the compiler sees this call, it will compile
+                # `implicitly_compiled_method`
+                return self.implicitly_compiled_method(x)
+
+            def unused_method(self, x):
+                return x - 20
+
+        # `m` will contain compiled methods:
+        #     `forward`
+        #     `another_forward`
+        #     `implicitly_compiled_method`
+        # `unused_method` will not be compiled since it was not called from
+        # any compiled methods and wasn't decorated with `@torch.jit.export`
+        m = torch.jit.script(MyModule())
+    """
+    # Tag in place; the function object itself is returned unchanged otherwise.
+    fn._torchscript_modifier = FunctionModifiers.EXPORT
+    return fn
+
+
+def unused(fn):
+    """
+    This decorator indicates to the compiler that a function or method should
+    be ignored and replaced with the raising of an exception. This allows you
+    to leave code in your model that is not yet TorchScript compatible and still
+    export your model.
+
+    When applied to a ``property``, the getter (and the setter, if there is
+    one) is tagged and the property object itself is returned.
+
+        Example (using ``@torch.jit.unused`` on a method)::
+
+            import torch
+            import torch.nn as nn
+
+            class MyModule(nn.Module):
+                def __init__(self, use_memory_efficient):
+                    super().__init__()
+                    self.use_memory_efficient = use_memory_efficient
+
+                @torch.jit.unused
+                def memory_efficient(self, x):
+                    import pdb
+                    pdb.set_trace()
+                    return x + 10
+
+                def forward(self, x):
+                    # Use not-yet-scriptable memory efficient mode
+                    if self.use_memory_efficient:
+                        return self.memory_efficient(x)
+                    else:
+                        return x + 10
+
+            m = torch.jit.script(MyModule(use_memory_efficient=False))
+            m.save("m.pt")
+
+            m = torch.jit.script(MyModule(use_memory_efficient=True))
+            # exception raised
+            m(torch.rand(100))
+    """
+    marker = FunctionModifiers.UNUSED
+
+    if not isinstance(fn, property):
+        fn._torchscript_modifier = marker
+        return fn
+
+    # Properties cannot carry the attribute themselves, so tag their accessors.
+    fn.fget._torchscript_modifier = marker
+    if fn.fset:
+        fn.fset._torchscript_modifier = marker
+    return fn
+
+
+# Context manager that does nothing on the Python side.
+class _IgnoreContextManager(contextlib.AbstractContextManager):
+    def __init__(self, **kwargs):
+        """Accept and discard arbitrary keyword arguments."""
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        """Do nothing on exit; returning None lets any exception propagate."""
+        return None
+
+
+def ignore(drop=False, **kwargs):
+    """
+    This decorator indicates to the compiler that a function or method should
+    be ignored and left as a Python function. This allows you to leave code in
+    your model that is not yet TorchScript compatible. If called from TorchScript,
+    ignored functions will dispatch the call to the Python interpreter. Models with ignored
+    functions cannot be exported; use :func:`@torch.jit.unused <torch.jit.unused>` instead.
+
+    Example (using ``@torch.jit.ignore`` on a method)::
+
+        import torch
+        import torch.nn as nn
+
+        class MyModule(nn.Module):
+            @torch.jit.ignore
+            def debugger(self, x):
+                import pdb
+                pdb.set_trace()
+
+            def forward(self, x):
+                x += 10
+                # The compiler would normally try to compile `debugger`,
+                # but since it is `@ignore`d, it will be left as a call
+                # to Python
+                self.debugger(x)
+                return x
+
+        m = torch.jit.script(MyModule())
+
+        # Error! The call `debugger` cannot be saved since it calls into Python
+        m.save("m.pt")
+
+    Example (using ``@torch.jit.ignore(drop=True)`` on a method):
+
+    .. testcode::
+
+        import torch
+        import torch.nn as nn
+
+        class MyModule(nn.Module):
+            @torch.jit.ignore(drop=True)
+            def training_method(self, x):
+                import pdb
+                pdb.set_trace()
+
+            def forward(self, x):
+                if self.training:
+                    self.training_method(x)
+                return x
+
+        m = torch.jit.script(MyModule())
+
+        # This is OK since `training_method` is not saved, the call is replaced
+        # with a `raise`.
+        m.save("m.pt")
+
+    .. testcleanup::
+
+        import os
+        os.remove('m.pt')
+
+    Raises:
+        RuntimeError: if ``drop`` is neither a callable nor a ``bool``.
+    """
+
+    if callable(drop):
+        # used without any args, so drop is actually a function
+        #   @torch.jit.ignore
+        #   def fn(...):
+        fn = drop
+        fn._torchscript_modifier = FunctionModifiers.IGNORE
+        return fn
+
+    if not isinstance(drop, bool):
+        raise RuntimeError(
+            "Argument to @torch.jit.ignore must be a bool or "
+            f"a function but got {drop}"
+        )
+
+    # for backwards compat
+    drop_on_export = kwargs.pop("drop_on_export", None)
+    if drop_on_export:
+        # The message previously ended with a literal, never-formatted "{}".
+        warnings.warn(
+            "ignore(drop_on_export=True) has been deprecated. TorchScript will now drop the function "
+            "call on compilation. Use torch.jit.unused now.",
+            category=FutureWarning,
+        )
+
+        drop = drop_on_export
+    elif drop:
+        warnings.warn(
+            "ignore(True) has been deprecated. TorchScript will now drop the function "
+            "call on compilation. Use torch.jit.unused now.",
+            category=FutureWarning,
+        )
+
+    def decorator(fn):
+        # A truthy drop flag makes this equivalent to `unused`.
+        if drop:
+            fn._torchscript_modifier = FunctionModifiers.UNUSED
+        else:
+            fn._torchscript_modifier = FunctionModifiers.IGNORE
+        return fn
+
+    return decorator
+
+
+def _drop(fn):
+    """Tag ``fn`` with ``FunctionModifiers._DROP`` and hand the same object back."""
+    setattr(fn, "_torchscript_modifier", FunctionModifiers._DROP)  # noqa: B010
+    return fn
+
+
+def _copy_to_script_wrapper(fn):
+    """Tag ``fn`` with ``FunctionModifiers.COPY_TO_SCRIPT_WRAPPER`` and return it."""
+    setattr(  # noqa: B010
+        fn, "_torchscript_modifier", FunctionModifiers.COPY_TO_SCRIPT_WRAPPER
+    )
+    return fn
+
+
+def module_has_exports(mod):
+    """Return True if any callable attribute of ``mod`` is tagged ``FunctionModifiers.EXPORT``."""
+    for attr_name in dir(mod):
+        # `dir` can list names that fail on access; skip those.
+        if not hasattr(mod, attr_name):
+            continue
+        member = getattr(mod, attr_name)
+        if not callable(member):
+            continue
+        if get_torchscript_modifier(member) is FunctionModifiers.EXPORT:
+            return True
+    return False
+
+
+# WARNING: should_drop is currently being used by our JIT code coverage plug-in to mark JIT'd code as covered. If you
+# rename this function, please update references in tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py to
+# allow JIT'd code to still be covered.
+def should_drop(fn) -> bool:
+    """Return True when ``fn`` is tagged ``UNUSED`` or ``_DROP``; False otherwise (including non-callables)."""
+    modifier = get_torchscript_modifier(fn)
+    if modifier is not None:
+        return (
+            modifier is FunctionModifiers.UNUSED or modifier is FunctionModifiers._DROP
+        )
+    return False
+
+
+def is_ignored_fn(fn) -> bool:
+    """Return True when ``fn`` is tagged ``UNUSED``, ``IGNORE`` or ``_DROP``."""
+    modifier = get_torchscript_modifier(fn)
+    ignored_kinds = (
+        FunctionModifiers.UNUSED,
+        FunctionModifiers.IGNORE,
+        FunctionModifiers._DROP,
+    )
+    # Identity comparison, exactly as the modifiers are compared elsewhere.
+    return any(modifier is kind for kind in ignored_kinds)
+
+
+def _is_drop_fn(fn) -> bool:
+    """Return True when ``fn`` is tagged ``FunctionModifiers._DROP``."""
+    return get_torchscript_modifier(fn) is FunctionModifiers._DROP
+
+
+def is_static_fn(cls, fn) -> bool:
+    """Return True when attribute ``fn`` (a name) of ``cls`` is declared as a ``staticmethod``."""
+    # Static lookup bypasses the descriptor protocol, so the raw wrapper is seen.
+    raw_attr = inspect.getattr_static(cls, fn, default=None)
+    return isinstance(raw_attr, staticmethod)
+
+
+def get_static_fn(cls, fn):
+    """Return the plain function wrapped by the ``staticmethod`` named ``fn`` on ``cls``."""
+    static_wrapper = inspect.getattr_static(cls, fn)
+    return static_wrapper.__func__
+
+
+def get_torchscript_modifier(fn):
+    """
+    Return the TorchScript modifier stored on ``fn``.
+
+    Non-callables yield ``None``. Objects exposing ``__func__`` are unwrapped
+    first; callables without a stored modifier yield ``FunctionModifiers.DEFAULT``.
+    """
+    if not callable(fn):
+        return None
+    target = fn.__func__ if hasattr(fn, "__func__") else fn
+    return getattr(target, "_torchscript_modifier", FunctionModifiers.DEFAULT)
+
+
+def copy_torchscript_modifier(orig, new) -> None:
+    """Copy the modifier of ``orig`` onto ``new``; do nothing when ``orig`` has none (is not callable)."""
+    modifier = get_torchscript_modifier(orig)
+    if modifier is not None:
+        new._torchscript_modifier = modifier
+
+
+# overloading registration
+# overloads get registered in this file, and compiled in torch/jit/__init__.py
+# so that they can be imported in nn/functional.py without an import cycle
+
+# qualified_name => list[overload_functions]
+# Filled by `_overload`, read by `_get_fn_overloads`, and cleared per name by
+# `_clear_fn_overloads`.
+_overloaded_fns: Dict[str, List[Callable]] = {}  # noqa: T484
+
+
+# Usage example appended to the overload-related error messages built in
+# `get_overload_no_implementation_error_message` and `_check_overload_body`.
+_OVERLOAD_EXAMPLE = """
+Example usage of overload function:
+@torch.jit._overload
+def my_function(x: type0) -> type0: # decl 1
+    pass
+
+@torch.jit._overload
+def my_function(x: type1) -> type1: # decl 2
+    pass
+
+def my_function(x):                 # implementation
+    if isinstance(x, type0):
+        return x
+    elif isinstance(x, type1):
+        return x
+"""
+
+
+def get_overload_no_implementation_error_message(kind, obj):
+    """
+    Build the error text reported when overload declarations of ``obj`` have no
+    implementation: a headline naming ``kind`` and the qualified name, the file
+    and line of ``obj``, its source lines, and the overload usage example.
+    """
+    sourcelines, file_lineno, filename = get_source_lines_and_file(obj)
+    headline = (
+        f'Implementation for the {kind} "{_qualified_name(obj)}" is missing. Please make '
+        "sure a definition is provided and defined after all overload declarations.\n"
+    )
+    location = f'File "{filename}", line {file_lineno}:\n'
+    pieces = [headline, location, *sourcelines, "\n", _OVERLOAD_EXAMPLE]
+    return "".join(pieces)
+
+
+def _check_overload_body(func):
+    """
+    Verify that an overload declaration has a body of exactly one ``pass`` or ``...``.
+
+    If the source of ``func`` cannot be retrieved (``OSError`` from ``parse_def``),
+    a warning is emitted and the check is skipped.
+
+    Raises:
+        RuntimeError: if the body is anything other than a single ``pass`` / ``...``.
+    """
+    try:
+        parsed_def = parse_def(func)
+    except OSError:
+        # Parsing the function definition can raise an OSError if source is unavailable.
+        # Since this is just an initial check, just raise a warning if this is the case.
+        warnings.warn(
+            f"Unable to retrieve source for @torch.jit._overload function: {func}."
+        )
+        return
+
+    body = parsed_def.ast.body[0].body
+
+    def is_pass(x):
+        return isinstance(x, ast.Pass)
+
+    def is_ellipsis(x):
+        # `...` parses to `ast.Constant(value=Ellipsis)`; the dedicated
+        # `ast.Ellipsis` node class is deprecated, so match on the constant.
+        return (
+            isinstance(x, ast.Expr)
+            and isinstance(x.value, ast.Constant)
+            and x.value.value is Ellipsis
+        )
+
+    if len(body) != 1 or not (is_pass(body[0]) or is_ellipsis(body[0])):
+        msg = (
+            "Only `pass` statement or `...` can be the body of overload declaration:\n"
+        )
+        # Show at most the first three source lines of the offending declaration.
+        msg += "\n".join(parsed_def.source.split("\n")[:3])
+        msg += " <- Expecting `pass` or `...` here!\n" + _OVERLOAD_EXAMPLE
+        raise RuntimeError(msg)
+
+
+def _overload(func):
+    """
+    Register ``func`` as an overload declaration under its qualified name.
+
+    The body is validated first; ``func`` is returned unchanged so this can be
+    used as a decorator.
+    """
+    _check_overload_body(func)
+    qual_name = _qualified_name(func)
+    # Create the per-name list on first use, then append to it.
+    _overloaded_fns.setdefault(qual_name, []).append(func)
+    return func
+
+
+def _get_fn_overloads(qual_name):
+    """Return the overloads registered under ``qual_name``, or ``None`` if there are none."""
+    return _overloaded_fns.get(qual_name, None)
+
+
+def _clear_fn_overloads(qual_name) -> None:
+    """Forget the overloads registered under ``qual_name``; ``KeyError`` if none exist."""
+    _overloaded_fns.pop(qual_name)
+
+
+def get_class_name_lineno(method) -> Tuple[str, int]:
+    """
+    Return the code-object name and first line number of the frame two levels
+    above this function.
+
+    ``method`` itself is not inspected; only the call stack is used.
+    """
+    frame = inspect.currentframe()
+
+    # Step out twice: once past this function, once past its direct caller
+    # (`_overload_method`).
+    for _ in range(2):
+        assert frame is not None  # narrows Optional[FrameType]
+        frame = frame.f_back
+
+    assert frame is not None  # same narrowing for the final frame
+    code = frame.f_code
+    return code.co_name, code.co_firstlineno
+
+
+# At the point the decorator is applied to class methods the method
+# has no reference to its owning class. _qualified_name would not include
+# the class it is defined in, so any methods with the same name in the same file
+# would have the same _qualified_name, even if they were defined in different
+# classes. This problem only exists in python 2.
+# We get around this problem by looking at the stack frame and identifying
+# the class name, and throwing an error whenever overloads are used
+# when modules of the same name are in the same file
+
+# qualified_name => class name => list[overload_functions]
+_overloaded_methods: Dict[str, Dict[str, List[Callable]]] = {}  # noqa: T484
+
+
+# (qualified_name, class name) => first line number of that class body (despite the "fileno" in the name)
+_overloaded_method_class_fileno: Dict[Tuple[str, str], int] = {}
+
+
+def _overload_method(func):
+    """
+    Register ``func`` as an overload declaration of a method.
+
+    Overloads are grouped by qualified name and then by the name of the
+    enclosing class, which is read from the call stack. The class's first line
+    number is remembered so that a second class with the same name can be
+    detected.
+
+    Raises:
+        RuntimeError: if the same method name is overloaded in two classes that
+            share a name but start on different lines.
+    """
+    _check_overload_body(func)
+    qual_name = _qualified_name(func)
+    per_class = _overloaded_methods.setdefault(qual_name, {})
+
+    # Must be called directly from this function: it walks a fixed number of frames.
+    class_name, line_no = get_class_name_lineno(func)
+    registry_key = (qual_name, class_name)
+
+    if per_class.get(class_name, None) is None:
+        per_class[class_name] = []
+        _overloaded_method_class_fileno[registry_key] = line_no
+    elif _overloaded_method_class_fileno[registry_key] != line_no:
+        raise RuntimeError(
+            "Cannot currently overload the same method name in two different"
+            " classes with the same name in the same module"
+        )
+
+    per_class[class_name].append(func)
+    return func
+
+
+def _get_overloaded_methods(method, mod_class):
+    """
+    Return the overload declarations registered for ``method`` in ``mod_class``.
+
+    Returns ``None`` when ``method`` has no ``__name__`` or when nothing is
+    registered under its qualified name / the class name.
+
+    Raises:
+        Exception: if the method's first source line lies outside the source
+            span of ``mod_class`` (the class was redeclared in the same file).
+    """
+    # TODO: __name__ not set for submodules in recursive script
+    if not hasattr(method, "__name__"):
+        return None
+    qual_name = _qualified_name(method)
+    class_name_map = _overloaded_methods.get(qual_name, None)
+    if class_name_map is None:
+        return None
+    overloads = class_name_map.get(mod_class.__name__, None)
+    if overloads is None:
+        return None
+
+    method_line_no = get_source_lines_and_file(method)[1]
+    # Fetch the class source once and reuse it for both the start line and the length.
+    class_sourcelines, mod_class_fileno = get_source_lines_and_file(mod_class)[:2]
+    mod_end_fileno = mod_class_fileno + len(class_sourcelines)
+    if not (mod_class_fileno <= method_line_no <= mod_end_fileno):
+        raise Exception(
+            "Overloads are not useable when a module is redeclared within the same file: "
+            + str(method)
+        )
+    return overloads
+
+
+def is_tuple(ann) -> bool:
+    """
+    Return True when ``ann`` is a parameterized tuple annotation.
+
+    Bare ``typing.Tuple`` is rejected through
+    ``raise_error_container_parameter_missing``.
+    """
+    if ann is Tuple:
+        raise_error_container_parameter_missing("Tuple")
+
+    # For some reason Python 3.7 violates the Type[A, B].__origin__ == Type rule
+    if not hasattr(ann, "__module__"):
+        return False
+
+    origin = get_origin(ann)
+    owner = ann.__module__
+    if IS_PY39_PLUS and owner == "builtins" and origin is tuple:
+        # builtin generic alias, e.g. tuple[int, str]
+        return True
+    if owner != "typing":
+        return False
+    return origin is Tuple or origin is tuple
+
+
+def is_list(ann) -> bool:
+    """
+    Return True when ``ann`` is a parameterized list annotation.
+
+    Bare ``typing.List`` is rejected through
+    ``raise_error_container_parameter_missing``.
+    """
+    if ann is List:
+        raise_error_container_parameter_missing("List")
+
+    if not hasattr(ann, "__module__"):
+        return False
+
+    origin = get_origin(ann)
+    owner = ann.__module__
+    if IS_PY39_PLUS and owner == "builtins" and origin is list:
+        # builtin generic alias, e.g. list[int]
+        return True
+    if owner != "typing":
+        return False
+    return origin is List or origin is list
+
+
+def is_dict(ann) -> bool:
+    """
+    Return True when ``ann`` is a parameterized dict annotation.
+
+    Bare ``typing.Dict`` is rejected through
+    ``raise_error_container_parameter_missing``.
+    """
+    if ann is Dict:
+        raise_error_container_parameter_missing("Dict")
+
+    if not hasattr(ann, "__module__"):
+        return False
+
+    origin = get_origin(ann)
+    owner = ann.__module__
+    if IS_PY39_PLUS and owner == "builtins" and origin is dict:
+        # builtin generic alias, e.g. dict[str, int]
+        return True
+    if owner != "typing":
+        return False
+    return origin is Dict or origin is dict
+
+
+def is_union(ann):
+    """
+    Return True when ``ann`` is a union annotation: an instance of
+    ``BuiltinUnionType``, or a ``typing`` annotation whose origin is ``Union``.
+
+    Bare ``typing.Union`` is rejected through
+    ``raise_error_container_parameter_missing``.
+    """
+    if ann is Union:
+        raise_error_container_parameter_missing("Union")
+
+    if isinstance(ann, BuiltinUnionType):
+        return True
+    from_typing = hasattr(ann, "__module__") and ann.__module__ == "typing"
+    return from_typing and get_origin(ann) is Union
+
+
+def is_optional(ann):
+    """
+    Return True when ``ann`` is an optional annotation: either its origin is
+    ``Optional``, or it is a two-member union with ``None`` / ``NoneType`` as
+    one of the members.
+
+    Bare ``typing.Optional`` is rejected through
+    ``raise_error_container_parameter_missing``.
+    """
+    if ann is Optional:
+        raise_error_container_parameter_missing("Optional")
+
+    spelled_as_optional = (
+        hasattr(ann, "__module__")
+        and ann.__module__ == "typing"
+        and get_origin(ann) is Optional
+    )
+    if spelled_as_optional:
+        return True
+
+    if not is_union(ann):
+        return False
+
+    members = get_args(ann)
+    return len(members) == 2 and (None in members or type(None) in members)
+
+
+def is_future(ann) -> bool:
+    """
+    Return True when the origin of ``ann`` is ``Future``.
+
+    Raises:
+        RuntimeError: if ``ann`` is the bare ``Future`` class without a contained type.
+    """
+    if ann is not Future:
+        return get_origin(ann) is Future
+
+    raise RuntimeError(
+        "Attempted to use Future without a "
+        "contained type. Please add a contained type, e.g. "
+        "Future[int]"
+    )
+
+
+def is_await(ann) -> bool:
+    """Return True for the bare ``_Await`` class as well as for annotations whose origin is ``_Await``."""
+    return ann is _Await or get_origin(ann) is _Await
+
+
+# The RRef helpers depend on the RPC module; `is_rref` is only defined when RPC
+# is available, while `is_rref_instance` is defined in both branches.
+if torch.distributed.rpc.is_available():
+    from torch._C._distributed_rpc import PyRRef
+    from torch.distributed.rpc import RRef
+
+    def is_rref(ann) -> bool:
+        # Bare `RRef` (no contained type) is an error; otherwise the annotation
+        # qualifies when its origin is `RRef`.
+        if ann is RRef:
+            raise RuntimeError(
+                "Attempted to use RRef without a "
+                "contained type. Please add a contained type, e.g. "
+                "RRef[int]"
+            )
+        return get_origin(ann) is RRef
+
+    def is_rref_instance(obj) -> bool:
+        # Runtime check against the `PyRRef` class imported above.
+        return isinstance(obj, PyRRef)
+
+else:
+
+    def is_rref_instance(obj) -> bool:
+        # If the RPC module doesn't exist then RRefs don't exist either.
+        return False
+
+
+def is_final(ann) -> bool:
+    """
+    Return True when ``ann`` comes from ``typing`` / ``typing_extensions`` and
+    either has ``Final`` as its origin or is an instance of ``type(Final)``.
+    """
+    if not hasattr(ann, "__module__"):
+        return False
+    if ann.__module__ not in {"typing", "typing_extensions"}:
+        return False
+    return get_origin(ann) is Final or isinstance(ann, type(Final))
+
+
+# allows BroadcastingList instance to be subscriptable
+class BroadcastingListCls:
+    # Subscripting is accepted for any `types` and evaluates to None.
+    def __getitem__(self, types):
+        return
+
+
+# mypy doesn't support parameters on types, so we have to explicitly type each
+# list size
+BroadcastingList1 = BroadcastingListCls()
+# BroadcastingList2 .. BroadcastingList6 are created as module globals that all
+# alias the same instance as BroadcastingList1.
+for i in range(2, 7):
+    globals()[f"BroadcastingList{i}"] = BroadcastingList1
+
+
+def is_scripting() -> bool:
+    r"""
+    Function that returns True when in compilation and False otherwise. This
+    is useful especially with the @unused decorator to leave code in your
+    model that is not yet TorchScript compatible.
+
+    This Python implementation always returns False.
+
+    .. testcode::
+
+        import torch
+
+        @torch.jit.unused
+        def unsupported_linear_op(x):
+            return x
+
+        def linear(x):
+           if torch.jit.is_scripting():
+              return torch.linear(x)
+           else:
+              return unsupported_linear_op(x)
+    """
+    return False
+
+
+# Retrieves a fully-qualified name (module hierarchy + classname) for a given obj.
+def _qualified_name(obj, mangle_name=True) -> str:
+    """
+    Return the qualified name of ``obj`` as ``<module name>.<name>``.
+
+    Resolution order:
+      1. ``obj._jit_override_qualname`` if present.
+      2. ``obj.qualified_name`` for ``torch._C.ScriptFunction`` instances.
+      3. Otherwise the name is ``obj.__name__`` (or ``obj.name`` for enum
+         members), with ``<lambda>`` rewritten to ``_lambda``, and the module
+         is ``obj.__module__``. Objects from ``torch._classes`` short-circuit
+         to ``obj.qualified_name``.
+
+    When ``mangle_name`` is true, ``__main__`` becomes ``__torch__`` and every
+    other module name gets a ``__torch__.`` prefix.
+
+    Raises:
+        RuntimeError: if no name can be determined, if ``__module__`` is None,
+            or if the name contains a ``.``.
+    """
+    # This special case allows us to override the qualified name on a type.
+    # It's currently used in conjunction with tracing, where we create a
+    # fake module to filter only supported attributes. However, since this
+    # new type is defined as a local class, we need a mechanism to override
+    # its qualname so it appears correctly in the TorchScript system. Thus,
+    # we set '_jit_override_qualname' with the original traced module's
+    # qualified name, which is picked up here
+    if hasattr(obj, "_jit_override_qualname"):
+        return obj._jit_override_qualname
+    # short-circuit in cases where the object already has a known qualified name
+    if isinstance(obj, torch._C.ScriptFunction):
+        return obj.qualified_name
+
+    if getattr(obj, "__name__", None):
+        name = obj.__name__
+    # Enum members do not have a `__name__` attr, instead they have `name`.
+    elif isinstance(obj, enum.Enum):
+        name = obj.name
+    else:
+        raise RuntimeError("Could not get name of python class object")
+
+    if name == "<lambda>":
+        name = "_lambda"  # make name a valid identifier
+
+    module_name = obj.__module__
+
+    # If the module is actually a torchbind module, then we should short circuit
+    if module_name == "torch._classes":
+        return obj.qualified_name
+
+    # The Python docs are very clear that `__module__` can be None, but I can't
+    # figure out when it actually would be.
+    if module_name is None:
+        raise RuntimeError(
+            f"Could not get qualified name for class '{name}': "
+            "__module__ can't be None."
+        )
+
+    # if getattr(sys.modules[module_name], name) is not obj:
+    #     raise RuntimeError(f"Could not get qualified name for class '{name}': "
+    #                        f"the attr {name} on module {module_name} is not the class")
+
+    # torch.package and TorchScript have separate mangling schemes to avoid
+    # name collisions from multiple packages. To avoid them interfering with
+    # each other, normalize the package mangling here.
+    if package_mangling.is_mangled(module_name):
+        module_name = module_name.replace("<", "_")
+        module_name = module_name.replace(">", "_")
+
+    # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h
+    # does not need the python class name to be mangled.
+    if mangle_name:
+        # __main__ is a builtin module, so rewrite it to "__torch__".
+        if module_name == "__main__":
+            module_name = "__torch__"
+        else:
+            # Everything else gets a "__torch__" prefix to avoid name collisions
+            # with the names of user values.
+            module_name = "__torch__." + module_name
+
+    # A dotted name could not be told apart from the module path in the result.
+    if "." in name:
+        raise RuntimeError(
+            f"Could not get qualified name for class '{name}': "
+            f"'{name}' is not a valid identifier"
+        )
+
+    return module_name + "." + name
+
+
+def _try_get_dispatched_fn(fn):
+    """Return the ``boolean_dispatched`` entry for ``fn``, or ``None`` when ``fn`` is not callable or not registered."""
+    return boolean_dispatched.get(fn) if callable(fn) else None
+
+
+def _get_named_tuple_properties(
+    obj, loc: Optional[torch._C._jit_tree_views.SourceRange] = None, rcb=None
+):
+    """
+    Collect the pieces needed to describe a NamedTuple class to TorchScript.
+
+    Args:
+        obj: a ``tuple`` subclass with a ``_fields`` attribute (asserted below).
+        loc: source range used for annotation conversion and error reporting;
+            ``fake_range()`` is used when omitted.
+        rcb: optional resolution callback used to resolve ``ForwardRef`` field
+            annotations and passed on to ``ann_to_type``.
+
+    Returns:
+        A 4-tuple ``(name, fields, annotations, defaults)`` where ``annotations``
+        holds one converted type per field (an inferred tensor type for fields
+        without an annotation) and ``defaults`` lists the default values of the
+        fields that have one, in field order.
+
+    Raises:
+        ValueError: if a ``ForwardRef`` annotation cannot be resolved by ``rcb``.
+    """
+    if loc is None:
+        loc = fake_range()
+
+    assert issubclass(obj, tuple) and hasattr(obj, "_fields")
+    if hasattr(obj, "_field_defaults"):
+        defaults = [
+            obj._field_defaults[field]
+            for field in obj._fields
+            if field in obj._field_defaults
+        ]
+    else:
+        defaults = []
+    # In 3.10 recommended way to get annotations is to call `inspect.get_annotations` function
+    # Also, annotations from base class are not inherited so they need to be queried explicitly
+    if sys.version_info[:2] < (3, 10):
+        obj_annotations = getattr(obj, "__annotations__", {})
+    else:
+        obj_annotations = inspect.get_annotations(obj)
+        if len(obj_annotations) == 0 and hasattr(obj, "__base__"):
+            obj_annotations = inspect.get_annotations(obj.__base__)
+
+    annotations = []
+    for field in obj._fields:
+        if field in obj_annotations:
+            field_type = obj_annotations[field]
+            # [Note: ForwardRef annotations in NamedTuple attributes]
+            # NamedTuple types are slightly different from normal types.
+            #
+            # Normally, annotations are evaluated like this (during jit.script):
+            # 1. Load strings of python code into c++ and parse.
+            # 2. Get annotations as strings
+            # 3. Use the PythonResolver's resolution callback (rcb) to convert
+            #    the string into a python object
+            # 4. We call into annotations.py:ann_to_type to convert python obj
+            #    from step 3 into a type that torchscript understands.
+            #
+            # NamedTuples are more complicated, because it has sub-types.
+            # Normally, once we have the NamedTuple type object from #3,
+            # we can just look at the annotation literal values and use
+            # ann_to_type directly on them.
+            #
+            # But sometimes, users will annotate with string literals, e.g.
+            #    x: 'int'
+            # This also happens with PEP563 (from __future__ import annotations)
+            #
+            # These annotations appear in the annotation dict as ForwardRef('int').
+            #
+            # Then, we need to convert the string into a python object. This
+            # requires having local context for custom objects or imported types.
+            # rcb() is what gives us this. So, we plumb rcb through the stack so
+            # it can be used in this context for the if block below.
+            #
+            # FAQ:
+            # - Why do we need this special handling for NamedTuple but string
+            #   annotations work fine for normal types? Normally, we parse the
+            #   string directly and then call rcb() directly from C++.
+            # - Why not use ForwardRef._evaluate? For that, we need globals()
+            #   and locals() for the local context where the NamedTuple was defined.
+            #   rcb is what lets us look up into these. So, basically rcb does the
+            #   hard work for us.
+            if isinstance(field_type, ForwardRef) and rcb is not None:
+                rcb_type = rcb(field_type.__forward_arg__)
+                # rcb returns None if it can't find anything.
+                if rcb_type is None:
+                    raise ValueError(
+                        f"Unknown type annotation: '{field_type}' in NamedTuple {obj.__name__}."
+                        f" Likely due to partial support for ForwardRef parameters in NamedTuples, see #95858."
+                        f" Issue occurred at {loc.highlight()}"
+                    )
+                field_type = rcb_type
+            the_type = torch.jit.annotations.ann_to_type(field_type, loc, rcb)
+            annotations.append(the_type)
+        else:
+            # Fields without an annotation fall back to the inferred tensor type.
+            annotations.append(torch._C.TensorType.getInferred())
+    # NOTE(review): `obj` is asserted to be a class above, so `type(obj)` is its
+    # metaclass and the first element is the metaclass name rather than
+    # `obj.__name__` -- confirm this is what callers expect.
+    return type(obj).__name__, obj._fields, annotations, defaults
+
+
+def _create_named_tuple(
+    t, unqual_name: str, field_names: List[str], defaults: Tuple[Any, ...]
+):
+    """
+    Create a fresh namedtuple class called ``unqual_name`` with the given
+    fields and defaults, and return an instance of it filled from ``t``.
+    """
+    tuple_cls = collections.namedtuple(  # type: ignore[call-arg, no-redef, misc]
+        unqual_name, field_names, defaults=defaults
+    )
+    instance = tuple_cls(*t)
+    return instance
+
+
+@contextlib.contextmanager
+def _disable_emit_hooks():
+    """
+    Context manager that clears the JIT emit hooks for the duration of the
+    ``with`` body and reinstalls the previously registered pair afterwards,
+    even if the body raises.
+    """
+    # Remember the current pair of hooks before clearing them.
+    hooks = torch._C._jit_get_emit_hooks()
+    torch._C._jit_set_emit_hooks(None, None)
+    try:
+        yield
+    finally:
+        torch._C._jit_set_emit_hooks(hooks[0], hooks[1])
+
+
+# NOTE(review): as written this is a function taking one argument; the nested
+# `__enter__` / `__exit__` are defined as locals but never returned or attached
+# to anything, so calling it has no effect and returns None. It looks like it
+# was meant to be a class deriving from `_DecoratorContextManager` -- confirm.
+def _disable_emit_hooks_decorator(_DecoratorContextManager) -> None:  # noqa: F811
+    def __enter__(self) -> None:
+        # Save the current hooks on `self`, then clear them.
+        self.hooks = torch._C._jit_get_emit_hooks()
+        torch._C._jit_set_emit_hooks(None, None)
+
+    def __exit__(self, *args) -> None:
+        # Reinstall the hooks saved by `__enter__`.
+        torch._C._jit_set_emit_hooks(self.hooks[0], self.hooks[1])
+
+
+def _is_exception(obj) -> bool:
+    """Return True only when ``obj`` is a class deriving from ``Exception``."""
+    return inspect.isclass(obj) and issubclass(obj, Exception)
+
+
+def raise_error_container_parameter_missing(target_type) -> None:
+    """
+    Always raise ``RuntimeError`` explaining that the container type named by
+    ``target_type`` (a string such as ``"List"``) was used without contained
+    types. ``"Dict"`` gets its own wording with a two-parameter example.
+    """
+    if target_type == "Dict":
+        message = (
+            "Attempted to use Dict without "
+            "contained types. Please add contained type, e.g. "
+            "Dict[int, int]"
+        )
+    else:
+        message = (
+            f"Attempted to use {target_type} without a "
+            "contained type. Please add a contained type, e.g. "
+            f"{target_type}[int]"
+        )
+    raise RuntimeError(message)
+
+
+def check_args_exist(target_type) -> None:
+    """
+    Reject container types given without contained types.
+
+    Bare ``List``/``list``, ``Tuple``/``tuple``, ``Dict``/``dict`` and
+    ``Optional`` (as well as ``None``) are reported through
+    ``raise_error_container_parameter_missing``; anything else passes silently.
+    """
+    bare_forms = (
+        ("List", (List, list)),
+        ("Tuple", (Tuple, tuple)),
+        ("Dict", (Dict, dict)),
+        ("Optional", (None, Optional)),
+    )
+    for label, forms in bare_forms:
+        # Identity comparison on purpose: only the exact bare objects match.
+        if any(target_type is form for form in forms):
+            raise_error_container_parameter_missing(label)
+            return
+
+
+def check_empty_containers(obj) -> None:
+    """Warn when ``obj`` compares equal to an empty list, dict or tuple."""
+    empties = ([], {}, ())
+    if not any(obj == empty for empty in empties):
+        return
+    warnings.warn(
+        "The inner type of a container is lost when "
+        "calling torch.jit.isinstance in eager mode. For "
+        "example, List[int] would become list and "
+        "therefore falsely return True for List[float] or"
+        " List[str]."
+    )
+
+
+# supports List/Dict/Tuple and Optional types
+# TODO support future
+def container_checker(obj, target_type) -> bool:
+    """
+    Check ``obj`` against a parameterized container annotation.
+
+    Handles ``List[...]``, ``Dict[..., ...]``, ``Tuple[...]`` and
+    ``Union[...]`` / ``Optional[...]`` (including nested combinations) by
+    recursing into the contained types. Bare container types are rejected by
+    ``check_args_exist``; annotations without an origin, or with an
+    unsupported origin, yield False.
+
+    For unions, ``obj`` matches when it is ``None`` or when it matches *any*
+    member; parameterized members are checked recursively and a failed member
+    does not prevent later members from being tried.
+    """
+    origin_type = get_origin(target_type)
+    check_args_exist(target_type)
+    if origin_type is None:
+        return False
+    elif origin_type is list or origin_type is List:
+        check_empty_containers(obj)
+        if not isinstance(obj, list):
+            return False
+        arg_type = get_args(target_type)[0]
+        arg_origin = get_origin(arg_type)
+        for el in obj:
+            # check if nested container, ex: List[List[str]]
+            if arg_origin:  # processes nested container, ex: List[List[str]]
+                if not container_checker(el, arg_type):
+                    return False
+            elif not isinstance(el, arg_type):
+                return False
+        return True
+    elif origin_type is Dict or origin_type is dict:
+        check_empty_containers(obj)
+        if not isinstance(obj, dict):
+            return False
+        key_type = get_args(target_type)[0]
+        val_type = get_args(target_type)[1]
+        for key, val in obj.items():
+            # check if keys are of right type
+            if not isinstance(key, key_type):
+                return False
+            val_origin = get_origin(val_type)
+            if val_origin:
+                if not container_checker(val, val_type):
+                    return False
+            elif not isinstance(val, val_type):
+                return False
+        return True
+    elif origin_type is Tuple or origin_type is tuple:
+        check_empty_containers(obj)
+        if not isinstance(obj, tuple):
+            return False
+        arg_types = get_args(target_type)
+        if len(obj) != len(arg_types):
+            return False
+        for el, el_type in zip(obj, arg_types):
+            el_origin = get_origin(el_type)
+            if el_origin:
+                if not container_checker(el, el_type):
+                    return False
+            elif not isinstance(el, el_type):
+                return False
+        return True
+    elif origin_type is Union or issubclass(
+        origin_type, BuiltinUnionType
+    ):  # also handles Optional
+        if obj is None:  # check before recursion because None is always fine
+            return True
+        inner_types = get_args(target_type)
+        for t in inner_types:
+            t_origin = get_origin(t)
+            if t_origin:
+                # Only a successful match ends the search: returning the
+                # result of the first parameterized member unconditionally
+                # would reject e.g. a str for Union[List[int], str].
+                if container_checker(obj, t):
+                    return True
+            elif isinstance(obj, t):
+                return True
+    return False
+
+
+def _isinstance(obj, target_type) -> bool:
+    # Helper for `torch.jit.isinstance` (named in the error below): tests obj
+    # against one type, a parameterized generic, or a tuple of either.
+    if isinstance(target_type, collections.abc.Container):
+        if not isinstance(target_type, tuple):
+            raise RuntimeError(
+                "The second argument to "
+                "`torch.jit.isinstance` must be a type "
+                "or a tuple of types"
+            )
+        # a tuple matches as soon as any one of its members matches
+        return any(_isinstance(obj, candidate) for candidate in target_type)
+
+    generic_origin = get_origin(target_type)
+    if generic_origin:  # parameterized generic, ex: List[int]
+        return container_checker(obj, target_type)
+
+    # Check to handle non-typed optional origin returns as none instead
+    #    of as optional in 3.7-3.8
+    check_args_exist(target_type)
+
+    # handle non-containers
+    return isinstance(obj, target_type)
+
+
+class _TensorExtractor(pickle.Pickler):
+    def __init__(self, *args, tensors: List[torch.Tensor], **kwargs):
+        super().__init__(*args, **kwargs)
+        self.tensors = tensors  # caller-owned list that persistent_id appends to
+
+    def persistent_id(self, obj):
+        if isinstance(obj, torch.Tensor):
+            self.tensors.append(obj)
+            return ""  # a non-None persistent id stops pickle from serializing obj itself
+        # Since we just want to extract tensors, we don't mind if an object is
+        # unpicklable if it doesn't contain tensors, as we can just ignore/skip
+        # it. To play it safe, we only do so for common objects that we're sure
+        # don't contain tensors. Feel free to add new types here. Note also that
+        # even if a type isn't listed here this won't block users, since they
+        # can just add a __getstate__ or __reduce__ method to their class.
+        if isinstance(obj, LockType):
+            return ""
+        # Futures and RRefs don't technically contain a value, they just offer
+        # the means to access a value.
+        if isinstance(obj, CFuture) or is_rref_instance(obj):
+            return ""
+        if isinstance(obj, CAwait):
+            return ""
+        if isinstance(obj, torch.cuda.Event):
+            return ""
+        if isinstance(obj, threading.Thread):
+            return ""
+        return None  # None tells pickle to serialize obj as usual
+
+
+def _extract_tensors(obj):
+    r"""
+    This function is exclusively called from C++.
+    See ``torch/csrc/jit/python/python_ivalue.h``.
+
+    It extracts the tensors contained in the given object, through pickling.
+    """
+    tensors: List[torch.Tensor] = []  # filled in place by _TensorExtractor.persistent_id
+    extractor = _TensorExtractor(io.BytesIO(), protocol=-1, tensors=tensors)  # negative protocol selects the highest one
+    extractor.dump(obj)  # the bytes written to the BytesIO are never read back
+    return tensors
+
+
+# In Python 3.11+ typed enums (IntEnum, for example) retain a number of base class methods in the subclass
+# that were previously dropped. To preserve the old behavior, explicitly drop them here.
+
+if sys.version_info >= (3, 11):
+    _drop(enum.Enum.__new__)
+    _drop(enum.Enum.__format__)
+    _drop(enum.Enum.__repr__)
+    _drop(enum.Enum.__str__)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_meta_registrations.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_meta_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63e990d95141a64c7e8bd753b59a2d3219d9c2b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_meta_registrations.py
@@ -0,0 +1,6253 @@
+import math
+from enum import Enum
+from functools import partial
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch._prims_common as utils
+from torch import SymBool, SymFloat, Tensor
+from torch._decomp import (
+    _add_op_to_registry,
+    _convert_out_params,
+    global_decomposition_table,
+    meta_table,
+)
+from torch._ops import OpOverload
+from torch._prims import _prim_elementwise_meta, ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND
+from torch._prims_common import (
+    corresponding_complex_dtype,
+    corresponding_real_dtype,
+    elementwise_dtypes,
+    ELEMENTWISE_TYPE_PROMOTION_KIND,
+    IntLike,
+    make_contiguous_strides_for,
+    TensorLike,
+)
+
+from torch._prims_common.wrappers import (
+    _maybe_convert_to_dtype,
+    _maybe_resize_out,
+    _resize_output_check,
+    _safe_copy_out,
+    out_wrapper,
+)
+from torch._refs import _broadcast_shapes, _maybe_broadcast
+from torch.utils import _pytree as pytree
+
+
+aten = torch.ops.aten
+
+_meta_lib_dont_use_me_use_register_meta = torch.library.Library("aten", "IMPL", "Meta")
+
+
+def register_meta(op):  # decorator factory: adds the decorated function to meta_table for `op`
+    def wrapper(fn):
+        fn = _convert_out_params(fn)
+
+        def register(op):
+            _add_op_to_registry(meta_table, op, fn)
+
+        pytree.tree_map_(register, op)  # `op` may be one op or a pytree (e.g. a list) of ops
+        return fn  # the _convert_out_params result, not the function originally passed in
+
+    return wrapper
+
+
+def elementwise_meta(
+    *args,
+    type_promotion: ELEMENTWISE_TYPE_PROMOTION_KIND,
+):
+    # Compute the promoted result dtype for `args` and convert every arg to it
+    _, result_dtype = utils.elementwise_dtypes(
+        *args,
+        type_promotion_kind=type_promotion,
+    )
+    args = [_maybe_convert_to_dtype(x, result_dtype) for x in args]
+
+    # Broadcast the converted args against each other
+    args = _maybe_broadcast(*args)
+
+    # Perform prim checks; the prim always receives DEFAULT, not `type_promotion`
+    return _prim_elementwise_meta(
+        *args, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT
+    )
+
+
+def toRealValueType(dtype):
+    # Complex dtypes map to their real counterpart listed below; any other
+    # dtype is returned unchanged.
+    complex_keys = (torch.complex32, torch.cfloat, torch.cdouble)
+    real_values = (torch.half, torch.float, torch.double)
+    real_of = dict(zip(complex_keys, real_values))
+    return real_of.get(dtype, dtype)
+
+
+def check_inplace_broadcast(self_shape, *args_shape):  # fails unless broadcasting all shapes yields self_shape itself
+    broadcasted_shape = tuple(_broadcast_shapes(self_shape, *args_shape))
+    torch._check(
+        broadcasted_shape == self_shape,  # NOTE(review): tuple comparison, so a plain-list self_shape never matches
+        lambda: f"output with shape {self_shape} doesn't match the broadcast shape {broadcasted_shape}",
+    )
+
+
+@register_meta([aten.linspace, aten.logspace])
+@out_wrapper()
+def meta_linspace_logspace(
+    start,
+    end,
+    steps,
+    base=None,
+    dtype=None,
+    device=None,
+    layout=torch.strided,
+    pin_memory=False,
+    requires_grad=False,
+):
+    if isinstance(start, torch.Tensor):
+        torch._check(
+            start.dim() == 0,
+            lambda: "linspace only supports 0-dimensional start and end tensors",
+        )
+    if isinstance(end, torch.Tensor):
+        torch._check(
+            end.dim() == 0,
+            lambda: "linspace only supports 0-dimensional start and end tensors",
+        )
+
+    if any(isinstance(arg, complex) for arg in (start, end, steps)):
+        default_complex_dtype = utils.corresponding_complex_dtype(
+            torch.get_default_dtype()
+        )
+        if dtype is None:
+            dtype = default_complex_dtype
+        else:
+            torch._check(
+                utils.is_complex_dtype(dtype),
+                lambda: f"linspace(): inferred dtype {default_complex_dtype} can't be safely cast to passed dtype {dtype}",
+            )
+    else:
+        dtype = dtype or torch.get_default_dtype()
+    assert isinstance(dtype, torch.dtype)
+
+    # steps does not participate in the computation of the dtype
+    torch._check_type(
+        isinstance(steps, IntLike),
+        lambda: f"received an invalid combination of arguments - got \
+({type(start).__name__}, {type(end).__name__}, {type(steps).__name__})",
+    )
+    assert isinstance(steps, IntLike)  # for mypy
+    torch._check(steps >= 0, lambda: "number of steps must be non-negative")
+
+    return torch.empty(
+        (steps,),  # type: ignore[arg-type]
+        dtype=dtype,
+        layout=layout,
+        device="meta",
+        pin_memory=pin_memory,
+        requires_grad=requires_grad,
+    )
+
+
+@register_meta([aten.take.default, aten.take.out])
+@out_wrapper()
+def meta_take(self, index):
+    # Type check: index must be torch.long (no device check is performed here)
+    torch._check(
+        index.dtype == torch.long,
+        lambda: f"take(): Expected a long tensor for index, but got {index.dtype}",
+    )
+    # Index check: an empty self may only be indexed by an empty index
+    torch._check_index(
+        not (self.numel() == 0 and index.numel() != 0),
+        lambda: "take(): tried to take from an empty tensor",
+    )
+    return self.new_empty(index.shape)  # result has index's shape, allocated from self
+
+
+@register_meta([aten.linalg_cross.default, aten.linalg_cross.out])
+@out_wrapper()
+def linalg_cross(self, other, *, dim=-1):
+    x_d = self.ndim
+    y_d = other.ndim
+    torch._check(
+        x_d == y_d,
+        lambda: "linalg.cross: inputs must have the same number of dimensions.",
+    )
+    torch._check(
+        self.size(dim) == 3 and other.size(dim) == 3,
+        lambda: (
+            f"linalg.cross: inputs dimension {dim} must have length 3. "
+            f"Got {self.size(dim)} and {other.size(dim)}"
+        ),
+    )
+    out_shape = _broadcast_shapes(self.shape, other.shape)
+    return self.new_empty(out_shape)
+
+
+@register_meta(aten.linalg_matrix_exp)
+@out_wrapper()
+def linalg_matrix_exp(self):
+    squareCheckInputs(self, "linalg.matrix_exp")
+    checkFloatingOrComplex(self, "linalg.matrix_exp")
+    return torch.empty_like(self, memory_format=torch.contiguous_format)
+
+
+@register_meta(
+    [aten.cummax.default, aten.cummax.out, aten.cummin.default, aten.cummin.out]
+)
+@out_wrapper("values", "indices")
+def cummaxmin(self, dim):
+    values = torch.empty(self.shape, device=self.device, dtype=self.dtype)
+    indices = torch.empty(self.shape, device=self.device, dtype=torch.int64)
+    if self.numel() != 0 and self.ndim != 0:
+        # Checks that dim is within bounds
+        maybe_wrap_dim(dim, self.ndim)
+    return values, indices
+
+
+@register_meta([aten.logcumsumexp.default, aten.logcumsumexp.out])
+@out_wrapper()
+def logcumsumexp(self, dim):
+    # Checks that dim is within bounds
+    maybe_wrap_dim(dim, self.ndim)
+    return torch.empty_like(self).contiguous()
+
+
+# Stride-related code from _exec_fft in aten/src/ATen/native/cuda/SpectralOps.cpp
+def _exec_fft(out, self, out_sizes, dim, forward):
+    ndim = self.ndim
+    signal_ndim = len(dim)
+    batch_dims = ndim - signal_ndim
+
+    # Permute dimensions so batch dimensions come first, and in stride order
+    dim_permute = list(range(ndim))
+
+    is_transformed_dim = [False for _ in range(ndim)]
+    for d in dim:
+        is_transformed_dim[d] = True
+
+    # std::partition
+    left, right = [], []
+    for d in dim_permute:
+        if not is_transformed_dim[d]:
+            left.append(d)
+        else:
+            right.append(d)
+    dim_permute = left + right
+    batch_end = len(left)
+
+    self_strides = self.stride()
+    tmp = dim_permute[:batch_end]
+    tmp.sort(key=lambda x: self_strides[x], reverse=True)
+    dim_permute = tmp + dim_permute[batch_end:]
+    input = self.permute(dim_permute)
+
+    # Collapse batch dimensions into a single dimension
+    batched_sizes = [-1] + list(input.shape[batch_dims:])
+    input = input.reshape(batched_sizes)
+
+    batch_size = input.size(0)
+    batched_sizes[0] = batch_size
+    batched_out_sizes = batched_sizes
+    for i in range(len(dim)):
+        batched_out_sizes[i + 1] = out_sizes[dim[i]]
+    out = out.reshape(batched_out_sizes)
+
+    # Reshaping to original batch shape and inverting the dimension permutation
+    out_strides = [0 for _ in range(ndim)]
+    batch_numel = 1
+    i = batch_dims - 1
+    while i >= 0:
+        out_strides[dim_permute[i]] = batch_numel * out.stride(0)
+        batch_numel *= out_sizes[dim_permute[i]]
+        i -= 1
+    for i in range(batch_dims, ndim):
+        out_strides[dim_permute[i]] = out.stride(1 + (i - batch_dims))
+    return out.as_strided(out_sizes, out_strides, out.storage_offset())
+
+
+# See _fft_c2c_cufft in aten/src/ATen/native/cuda/SpectralOps.cpp
+# and _fft_c2c_mkl in aten/src/ATen/native/mkl/SpectralOps.cpp
+@register_meta([aten._fft_c2c.default, aten._fft_c2c.out])
+@out_wrapper()
+def meta_fft_c2c(self, dim, normalization, forward):
+    assert self.dtype.is_complex
+
+    out_sizes = self.shape
+    output = self.new_empty(out_sizes)
+
+    if not dim:
+        return output
+
+    sorted_dims = dim[:]
+    self_strides = self.stride()
+    sorted_dims.sort(key=lambda x: self_strides[x], reverse=True)
+    output = _exec_fft(output, self, out_sizes, sorted_dims, forward)
+
+    return output
+
+
+@register_meta([aten._fft_r2c.default, aten._fft_r2c.out])
+@out_wrapper()
+def meta_fft_r2c(self, dim, normalization, onesided):
+    assert self.dtype.is_floating_point
+    output_sizes = list(self.size())
+
+    if onesided:
+        last_dim = dim[-1]
+        last_dim_halfsize = (output_sizes[last_dim] // 2) + 1
+        output_sizes[last_dim] = last_dim_halfsize
+
+    return self.new_empty(
+        output_sizes, dtype=utils.corresponding_complex_dtype(self.dtype)
+    )
+
+
+@register_meta(aten.randperm.generator_out)
+def meta_randperm(n, *, generator=None, out):
+    return _maybe_resize_out(out, torch.Size([n]))
+
+
+@register_meta(aten.randperm.default)
+def meta_randperm_default(
+    n, *, dtype=torch.long, layout=None, device=None, pin_memory=None
+):
+    return torch.empty(
+        n, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
+
+
+@register_meta(aten.randint.default)
+def meta_randint(
+    high, size, *, dtype=torch.long, layout=None, device=None, pin_memory=None
+):
+    return torch.empty(
+        size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
+
+
+@register_meta(aten.randint.low)
+def meta_randint_low(
+    low,
+    high,
+    size,
+    *,
+    dtype=torch.long,
+    layout=None,
+    device=None,
+    pin_memory=None,
+):
+    return torch.empty(
+        size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
+
+
+@register_meta(aten.rand.default)
+def meta_rand_default(size, *, dtype=None, layout=None, device=None, pin_memory=None):
+    return torch.empty(
+        size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
+
+
+@register_meta([aten._fft_c2r.default, aten._fft_c2r.out])
+@out_wrapper()
+def meta_fft_c2r(self, dim, normalization, lastdim):
+    assert self.dtype.is_complex
+    output_sizes = list(self.size())
+    output_sizes[dim[-1]] = lastdim
+    return self.new_empty(output_sizes, dtype=toRealValueType(self.dtype))
+
+
+@register_meta(aten.copy_.default)
+def meta_copy_(self, src, non_blocking=False):
+    # This code simulates the original decomp from inductor,
+    # which runs most of the meta checks that we care about.
+    # In theory, we should make this more robust by carefully
+    # auditing our C++ copy_() kernel and copying the checks here.
+
+    if torch._debug_has_internal_overlap(self) == 1:  # 1 == MemOverlap::Yes
+        raise RuntimeError(
+            "more than one element of the written-to tensor refers to a single memory location"
+        )
+
+    if isinstance(src, Tensor):  # non-Tensor sources skip the checks below
+        intermediate = src.to(self, non_blocking)
+        if self.size() != intermediate.size():
+            aten.expand_copy.default(intermediate, self.size())  # result discarded; presumably called only for its shape checks
+    return self  # the destination tensor itself is returned
+
+
+def inferUnsqueezeGeometry(tensor, dim):
+    # Returns (sizes, strides) lists for `tensor` with a size-1 dim inserted at `dim`.
+    sizes, strides = list(tensor.size()), list(tensor.stride())
+    # at or past the last dim the new stride is 1; otherwise size * stride of the displaced dim
+    inserted_stride = 1 if dim >= tensor.dim() else sizes[dim] * strides[dim]
+    sizes[dim:dim], strides[dim:dim] = [1], [inserted_stride]
+    return sizes, strides
+
+
+@register_meta(aten.unsqueeze_.default)
+def meta_unsqueeze_(self, dim):
+    dim = maybe_wrap_dim(dim, self.dim() + 1)
+    g_sizes, g_strides = inferUnsqueezeGeometry(self, dim)
+    self.as_strided_(g_sizes, g_strides)
+    return self
+
+
+@register_meta(aten._sparse_semi_structured_linear)
+def meta_sparse_structured_linear(
+    input: Tensor,
+    weight: Tensor,
+    _meta: Tensor,
+    bias: Optional[Tensor] = None,
+    _activation_opt: Optional[str] = None,
+    out_dtype: Optional[torch.dtype] = None,
+):
+    output_sizes = list(input.shape)
+    if bias is not None:
+        assert weight.size(0) == bias.size(0), "output size mismatch"
+    assert weight.size(1) == input.size(-1) / 2
+    output_sizes[-1] = weight.size(0)
+
+    # see: https://github.com/pytorch/pytorch/pull/114477#issuecomment-1830121375
+    # We assume that we have already squashed the inputs into a 2-D tensor
+    # Then, as the output is transposed, we need to propagate the transposed
+    # stride information to the output tensor
+    assert len(input.shape) == 2, "we can only handle the squashed input case"
+    transposed_strides = (1, input.size(0))
+
+    if out_dtype is not None:
+        assert (
+            input.dtype == torch.int8 and out_dtype == torch.int32
+        ), "out_dtype is only supported for i8i8->i32 linear operator"
+    output = input.new_empty(
+        output_sizes,
+        dtype=input.dtype if out_dtype is None else out_dtype,
+    ).as_strided(output_sizes, transposed_strides)
+
+    return output
+
+
+@register_meta(aten._cslt_sparse_mm)
+def meta__cslt_sparse_mm(
+    compressed_A: torch.Tensor,
+    dense_B: torch.Tensor,
+    bias: Optional[Tensor] = None,
+    alpha: Optional[Tensor] = None,  # accepted but never read by this meta function
+    out_dtype: Optional[torch.dtype] = None,
+    transpose_result: bool = False,
+):
+    assert dense_B.dtype in {
+        torch.float32,  # NOTE(review): accepted here but absent from the message below; confirm which is intended
+        torch.float16,
+        torch.bfloat16,
+        torch.int8,
+    }, "_cslt_sparse_mm only supports fp16, bf16, and int8"
+    assert compressed_A.dtype == dense_B.dtype, "inputs must have the same dtype"
+    assert len(dense_B.shape) == 2, "_cslt_sparse_mm only supports 2d inputs"
+
+    is_int8_input_type = compressed_A.dtype == torch.int8
+    compression_factor = 10 if is_int8_input_type else 9  # NOTE(review): constants 10/9/16 are not explained here
+    k = dense_B.size(0)
+    n = dense_B.size(1)
+    m = (compressed_A.numel() * 16) // (compression_factor * k)  # presumably the row count of the uncompressed A; confirm
+    if bias is not None:
+        assert m == bias.size(0)
+
+    if out_dtype is not None:
+        assert is_int8_input_type and out_dtype in {
+            torch.float16,
+            torch.bfloat16,
+            torch.int32,
+        }, "out_dtype is only supported for i8i8->fp16, bf16, or i32 matmul"
+    output_shape = (n, m) if transpose_result else (m, n)
+    result = dense_B.new_empty(output_shape, dtype=out_dtype)  # dtype=None keeps dense_B's dtype
+    return result
+
+
+@register_meta(aten.index_reduce.default)
+def meta_index_reduce(
+    self: Tensor,
+    dim: int,
+    index: Tensor,
+    source: torch.Tensor,
+    reduce: str,
+    *,
+    include_self: bool = True,
+) -> Tensor:
+    return torch.empty_like(self, memory_format=torch.contiguous_format)
+
+
+@register_meta(aten.index_reduce_.default)
+def meta_index_reduce_(
+    self: Tensor,
+    dim: int,
+    index: Tensor,
+    source: torch.Tensor,
+    reduce: str,
+    *,
+    include_self: bool = True,
+) -> Tensor:
+    return self
+
+
+# Implementations below are taken from https://github.com/albanD/subclass_zoo/blob/main/python_meta_tensor.py
+@out_wrapper()  # NOTE(review): applied after register_meta, so the registered function is the un-wrapped one; confirm intended
+@register_meta(aten.index_select.default)
+def meta_index_select(self, dim, index):
+    result_size = list(self.size())
+    if self.dim() > 0:  # a 0-dim self has no size entry to overwrite
+        result_size[dim] = index.numel()  # the selected dim becomes as long as the index
+    return self.new_empty(result_size)
+
+
+@register_meta(aten.segment_reduce.default)
+def meta_segment_reduce(
+    data: Tensor,
+    reduce: str,
+    *,
+    lengths: Optional[Tensor] = None,
+    indices: Optional[Tensor] = None,
+    offsets: Optional[Tensor] = None,
+    axis: int = 0,
+    unsafe: bool = False,
+    initial=None,
+) -> Tensor:
+    if indices is not None:
+        raise NotImplementedError(
+            "segment_reduce(): indices based reduction is not supported yet."
+        )
+
+    def segment_reduce_lengths_tensor(lengths_shape):
+        return torch.empty(
+            lengths_shape + data.shape[axis + 1 :],
+            dtype=data.dtype,
+            device="meta",
+            memory_format=torch.contiguous_format,
+        )
+
+    if lengths is not None:
+        return segment_reduce_lengths_tensor(lengths.shape)
+    # FIXME should probably check that lengths and offset aren't both set, but
+    # the ATen implementation neglects this too
+    if offsets is not None:
+        # lengths == torch.diff(offsets)
+        lengths_shape = offsets.shape[:-1] + (offsets.shape[-1] - 1,)
+        return segment_reduce_lengths_tensor(lengths_shape)
+    raise RuntimeError("segment_reduce(): Either lengths or offsets must be defined.")
+
+
+@register_meta([aten.max.default, aten.max.unary_out])
+@out_wrapper()
+def meta_max(self):
+    return self.new_empty(())
+
+
+@register_meta(aten.max.dim)
+def meta_max_dim(self, dim, keepdim=False):
+    dim = utils.reduction_dims(self.shape, (dim,))
+    output_shape = _compute_reduction_shape(self, dim, keepdim)
+    return (
+        self.new_empty(output_shape),
+        self.new_empty(output_shape, dtype=torch.long),
+    )
+
+
+@register_meta([aten.min.default, aten.min.unary_out])
+@out_wrapper()
+def meta_min(self):
+    return self.new_empty(())
+
+
+@register_meta(aten.min.dim)
+def meta_min_dim(self, dim, keepdim=False):
+    dim = utils.reduction_dims(self.shape, (dim,))
+    output_shape = _compute_reduction_shape(self, dim, keepdim)
+    return (
+        self.new_empty(output_shape),
+        self.new_empty(output_shape, dtype=torch.long),
+    )
+
+
+@register_meta(aten.angle.default)
+def meta_angle(self):
+    if self.is_complex():
+        result_dtype = corresponding_real_dtype(self.dtype)
+    else:
+        _, result_dtype = elementwise_dtypes(
+            self,
+            type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+        )
+    return torch.empty_like(self, dtype=result_dtype)
+
+
+@register_meta(aten.angle.out)
+def meta_angle_out(self, out):
+    torch._resize_output_(out, self.size(), self.device)
+    return out.copy_(torch.angle(self))
+
+
+@register_meta(aten._assert_async.default)
+def assert_async(val):
+    return
+
+
+@register_meta(aten._assert_async.msg)
+def assert_async_meta(val, assert_msg):
+    return
+
+
+@register_meta(aten._print.default)
+def print_meta(s):
+    return
+
+
+@register_meta(aten._make_dep_token.default)
+def make_dep_token(
+    *,
+    dtype=None,
+    layout=None,
+    device=None,
+    pin_memory=None,
+    memory_format=None,
+):
+    return torch.empty([], device="meta")
+
+
+@register_meta(aten.sym_constrain_range.default)
+def sym_constrain_range(size, min=None, max=None):
+    # Avoid importing sympy at a module level
+    from torch.fx.experimental.symbolic_shapes import constrain_range
+
+    if isinstance(size, (SymFloat, SymBool)):
+        raise ValueError("Constraining SymFloat or Symbool is nyi")
+    constrain_range(size, min=min, max=max)
+
+
+@register_meta(aten._functional_sym_constrain_range.default)
+def functional_sym_constrain_range(size, min=None, max=None, dep_token=None):
+    aten.sym_constrain_range(size, min=min, max=max)
+    return dep_token
+
+
+@register_meta(aten.sym_constrain_range_for_size.default)
+def sym_constrain_range_for_size(size, min=None, max=None):
+    # Avoid importing sympy at a module level
+    from torch.fx.experimental.symbolic_shapes import _constrain_range_for_size
+
+    if isinstance(size, (SymFloat, SymBool)):
+        raise ValueError("Constraining SymFloat or Symbool is nyi")
+    _constrain_range_for_size(size, min=min, max=max)
+
+
+@register_meta(aten._functional_sym_constrain_range_for_size.default)
+def functional_sym_constrain_range_for_size(size, min, max, dep_token):
+    aten.sym_constrain_range_for_size(size, min=min, max=max)
+    return dep_token
+
+
+@register_meta(aten._functional_assert_async.msg)
+def functional_assert_async_meta(val, assert_msg, dep_token):
+    return dep_token
+
+
+# From aten/src/ATen/native/LinearAlgebraUtils.h
+def squareCheckInputs(self: Tensor, f_name: str):
+    assert (
+        self.dim() >= 2
+    ), f"{f_name}: The input tensor must have at least 2 dimensions."
+    assert self.size(-1) == self.size(
+        -2
+    ), f"{f_name}: A must be batches of square matrices, but they are {self.size(-2)} by {self.size(-1)} matrices"
+
+
+# Validates input shapes and devices
+# for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve)
+# From aten/src/ATen/native/LinearAlgebraUtils.h
+def linearSolveCheckInputs(
+    self: Tensor,
+    A: Tensor,
+    name: str,
+):
+    torch._check(
+        self.device == A.device,
+        lambda: (
+            f"Expected b and A to be on the same device, but found b on "
+            f"{self.device} and A on {A.device} instead."
+        ),
+    )
+
+    torch._check(
+        self.dtype == A.dtype,
+        lambda: (
+            f"Expected b and A to have the same dtype, but found b of type "
+            f"{self.dtype} and A of type {A.dtype} instead."
+        ),
+    )
+
+    torch._check(
+        A.size(-1) == A.size(-2),
+        lambda: (
+            f"A must be batches of square matrices, "
+            f"but they are {A.size(-2)} by {A.size(-1)} matrices"
+        ),
+    )
+
+    torch._check(
+        A.size(-1) == self.size(-2),
+        lambda: (
+            f"Incompatible matrix sizes for {name}: each A "
+            f"matrix is {A.size(-1)} by {A.size(-1)}"
+            f" but each b matrix is {self.size(-2)} by {self.size(-1)}"
+        ),
+    )
+
+
+# From aten/src/ATen/native/LinearAlgebraUtils.h
+def checkFloatingOrComplex(
+    t: Tensor, f_name: str, allow_low_precision_dtypes: bool = True
+):
+    dtype = t.dtype
+    torch._check(
+        t.is_floating_point() or t.is_complex(),
+        lambda: f"{f_name}: Expected a floating point or complex tensor as input. Got {dtype}",
+    )
+    if not allow_low_precision_dtypes:
+        torch._check(
+            dtype in (torch.float, torch.double, torch.cfloat, torch.cdouble),
+            lambda: f"{f_name}: Low precision dtypes not supported. Got {dtype}",
+        )
+
+
+# From aten/src/ATen/native/LinearAlgebraUtils.h
+def checkIsMatrix(A: Tensor, f_name: str, arg_name: str = "A"):
+    torch._check(
+        A.dim() >= 2,
+        lambda: f"{f_name}: The input tensor {arg_name} must have at least 2 dimensions.",
+    )
+
+
+def checkInputsSolver(
+    A: Tensor,
+    B: Tensor,
+    left: bool,
+    f_name: str,
+):
+    squareCheckInputs(A, f_name)
+    checkIsMatrix(B, f_name)
+    torch._check(
+        A.size(-2) == B.size(-2) if left else A.size(-1) == B.size(-1),
+        lambda: (
+            f"{f_name}: Incompatible shapes of A and B for the equation "
+            f"{'AX = B' if left else 'XA = B'}"
+            f" ({A.size(-2)}x{A.size(-1)} and {B.size(-2)}x{B.size(-1)})"
+        ),
+    )
+
+
+def checkSameDevice(
+    fn_name: str, result: Tensor, input: Tensor, result_name: str = "result"
+):
+    torch._check(
+        result.device == input.device,
+        lambda: (
+            f"{fn_name}: Expected {result_name} and input tensors to be on the same device, but got "
+            f"{result_name} on {result.device} and input on {input.device}"
+        ),
+    )
+
+
+def checkUplo(UPLO: str):
+    normalized = UPLO.upper()  # the letter is compared case-insensitively
+    torch._check(
+        len(UPLO) == 1 and normalized in ("U", "L"),
+        lambda: f"Expected UPLO argument to be 'L' or 'U', but got {UPLO}",
+    )
+
+
+@register_meta([aten._linalg_eigh.default, aten._linalg_eigh.eigenvalues])
+@out_wrapper("eigenvalues", "eigenvectors")
+def meta__linalg_eigh(
+    A: Tensor,
+    UPLO: str = "L",
+    compute_v: bool = True,
+):
+    squareCheckInputs(A, "linalg.eigh")
+    checkUplo(UPLO)
+
+    shape = list(A.shape)
+    if compute_v:
+        vecs = A.new_empty(shape)
+        vecs.as_strided_(shape, make_contiguous_strides_for(shape, row_major=False))
+    else:
+        vecs = A.new_empty([0])
+
+    shape.pop()
+    vals = A.new_empty(shape, dtype=toRealValueType(A.dtype))
+
+    return vals, vecs
+
+
+@register_meta([aten._linalg_eigvals.default, aten.linalg_eigvals.out])
+@out_wrapper()
+def meta__linalg_eigvals(input: Tensor) -> Tensor:
+    squareCheckInputs(input, "linalg.eigvals")
+    complex_dtype = (
+        input.dtype
+        if utils.is_complex_dtype(input.dtype)
+        else utils.corresponding_complex_dtype(input.dtype)
+    )
+    return input.new_empty(input.shape[:-1], dtype=complex_dtype)
+
+
+@register_meta([aten.linalg_eig])
+@out_wrapper("eigenvalues", "eigenvectors")
+def meta_linalg_eig(input: Tensor):
+    squareCheckInputs(input, "linalg.eig")
+    complex_dtype = (
+        input.dtype
+        if utils.is_complex_dtype(input.dtype)
+        else utils.corresponding_complex_dtype(input.dtype)
+    )
+    values = input.new_empty(input.shape[:-1], dtype=complex_dtype)
+    vectors = input.new_empty(input.shape, dtype=complex_dtype)
+    return values, vectors
+
+
+def cloneBatchedColumnMajor(src: Tensor) -> Tensor:
+    return src.mT.clone(memory_format=torch.contiguous_format).transpose(-2, -1)
+
+
+@register_meta(aten._cholesky_solve_helper)
+@out_wrapper()
+def _cholesky_solve_helper(self: Tensor, A: Tensor, upper: bool) -> Tensor:
+    return cloneBatchedColumnMajor(self)
+
+
+@register_meta(aten.cholesky_solve)
+@out_wrapper()
+def cholesky_solve(self: Tensor, A: Tensor, upper: bool = False) -> Tensor:
+    torch._check(
+        self.ndim >= 2,
+        lambda: f"b should have at least 2 dimensions, but has {self.ndim} dimensions instead",
+    )
+    torch._check(
+        A.ndim >= 2,
+        lambda: f"u should have at least 2 dimensions, but has {A.ndim} dimensions instead",
+    )
+    self_broadcasted, A_broadcasted = _linalg_broadcast_batch_dims_name(
+        self, A, "cholesky_solve"
+    )
+    return _cholesky_solve_helper(self_broadcasted, A_broadcasted, upper)
+
+
+@register_meta(aten.cholesky)
+@out_wrapper()
+def cholesky(self: Tensor, upper: bool = False) -> Tensor:
+    if self.numel() == 0:
+        return torch.empty_like(self, memory_format=torch.legacy_contiguous_format)
+    squareCheckInputs(self, "cholesky")
+    return cloneBatchedColumnMajor(self)
+
+
+@register_meta(aten.cholesky_inverse)
+@out_wrapper()
+def cholesky_inverse(self: Tensor, upper: bool = False) -> Tensor:
+    squareCheckInputs(self, "cholesky_inverse")
+    return cloneBatchedColumnMajor(self)
+
+
+# From aten/src/ATen/native/BatchLinearAlgebra.cpp
+@register_meta(aten.linalg_cholesky_ex.default)
+def linalg_cholesky_ex(A: Tensor, upper: bool = False, check_errors: bool = False):
+    squareCheckInputs(A, "linalg.cholesky")
+    checkFloatingOrComplex(A, "linalg.cholesky")
+
+    A_shape = A.shape
+    ndim = len(A_shape)
+
+    # L
+    L_strides = make_contiguous_strides_for(A_shape, False)
+    L = A.new_empty(A_shape)
+    L.as_strided_(A_shape, L_strides)
+
+    # infos
+    infos = A.new_empty(A_shape[0 : ndim - 2], dtype=torch.int32)
+    return L, infos
+
+
+@register_meta(
+    [aten.linalg_householder_product.default, aten.linalg_householder_product.out]
+)
+@out_wrapper()
+def linalg_householder_product(input: Tensor, tau: Tensor) -> Tensor:
+    """Meta implementation for ``aten.linalg_householder_product``.
+
+    Checks, in order: ``input`` has at least 2 dims; ``input.shape[-2] >=
+    input.shape[-1] >= tau.shape[-1]``; ``tau`` has exactly one dim fewer than
+    ``input``; batch dims agree (only when ``input.ndim > 2``); dtypes match;
+    and ``checkSameDevice`` passes.  The result has ``input``'s shape, dtype
+    and device with strides from ``make_contiguous_strides_for(...,
+    row_major=False)``.
+    """
+    torch._check(
+        input.ndim >= 2,
+        lambda: "torch.linalg.householder_product: input must have at least 2 dimensions.",
+    )
+    torch._check(
+        input.size(-2) >= input.size(-1),
+        lambda: "torch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]",
+    )
+    torch._check(
+        input.size(-1) >= tau.size(-1),
+        lambda: "torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]",
+    )
+
+    torch._check(
+        input.ndim - tau.ndim == 1,
+        lambda: (
+            f"torch.linalg.householder_product: Expected tau to have one dimension less than input, "
+            f"but got tau.ndim equal to {tau.ndim} and input.ndim is equal to {input.ndim}"
+        ),
+    )
+    if input.ndim > 2:
+        # Batched case: tau's leading dims must equal input's leading dims.
+        input_batch = input.shape[:-2]
+        tau_batch = tau.shape[:-1]
+        torch._check(
+            tau_batch == input_batch,
+            lambda: (
+                f"torch.linalg.householder_product: Expected batch dimensions of tau to be "
+                f"equal to input.shape[:-2], but got {tau_batch}"
+            ),
+        )
+
+    torch._check(
+        tau.dtype == input.dtype,
+        lambda: (
+            f"torch.linalg.householder_product: tau dtype {tau.dtype}"
+            f" does not match input dtype {input.dtype}"
+        ),
+    )
+    checkSameDevice("torch.linalg.householder_product", tau, input, "tau")
+
+    out_shape = input.shape
+    out_strides = make_contiguous_strides_for(out_shape, row_major=False)
+    return torch.empty_strided(
+        size=out_shape,
+        stride=out_strides,
+        dtype=input.dtype,
+        device=input.device,
+    )
+
+
+# From aten/src/ATen/native/BatchLinearAlgebra.cpp
+@register_meta(aten.linalg_inv_ex.default)
+def linalg_inv_ex_meta(A: Tensor, check_errors: bool = False):
+    """Meta implementation registered for ``aten.linalg_inv_ex.default``.
+
+    Returns a pair: a tensor shaped like ``A`` that is re-strided using
+    ``make_contiguous_strides_for(..., row_major=False)``, and an int32 tensor
+    shaped like ``A`` minus its last two dims.  ``check_errors`` is not read.
+    """
+    squareCheckInputs(A, "linalg.inv_ex")
+    checkFloatingOrComplex(A, "linalg.inv_ex", allow_low_precision_dtypes=False)
+
+    shape = A.shape
+    inverse = A.new_empty(shape)
+    inverse.as_strided_(shape, make_contiguous_strides_for(shape, row_major=False))
+
+    infos = A.new_empty(shape[:-2], dtype=torch.int32)
+    return inverse, infos
+
+
+@register_meta([aten.linalg_ldl_factor_ex.default, aten.linalg_ldl_factor_ex.out])
+@out_wrapper("LD", "pivots", "info")
+def linalg_ldl_factor_ex_meta(
+    self: Tensor,
+    *,
+    hermitian: bool = False,
+    check_errors: bool = False,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """Meta implementation for ``aten.linalg_ldl_factor_ex``.
+
+    Returns ``(LD, pivots, info)``.  ``LD`` matches ``self`` in shape, dtype
+    and device, with strides from ``make_contiguous_strides_for(...,
+    row_major=False)``.  ``pivots`` drops the last dim of ``self``; ``info``
+    drops the last two.  Both are ``torch.int``.  The keyword flags are not
+    read here.
+    """
+    squareCheckInputs(self, "torch.linalg.ldl_factor_ex")
+    checkFloatingOrComplex(self, "torch.linalg.ldl_factor_ex")
+
+    full_shape = self.shape
+    LD = torch.empty_strided(
+        size=full_shape,
+        stride=make_contiguous_strides_for(full_shape, row_major=False),
+        dtype=self.dtype,
+        device=self.device,
+    )
+    pivots = self.new_empty(full_shape[:-1], dtype=torch.int)
+    info = self.new_empty(full_shape[:-2], dtype=torch.int)
+    return LD, pivots, info
+
+
+@register_meta([aten.linalg_ldl_solve.default, aten.linalg_ldl_solve.out])
+@out_wrapper()
+def linalg_ldl_solve_meta(
+    LD: Tensor, pivots: Tensor, B: Tensor, *, hermitian: bool = False
+) -> Tensor:
+    """Meta implementation for ``aten.linalg_ldl_solve``.
+
+    After validating ``LD``, ``pivots`` and ``B`` (shape, integer pivots,
+    matching dtypes), returns an empty tensor whose size is ``B`` with its
+    batch dims broadcast against ``LD`` and whose strides come from
+    ``make_contiguous_strides_for(..., row_major=False)``.  ``hermitian`` is
+    not read here.
+    """
+    squareCheckInputs(LD, "torch.linalg.ldl_solve")
+    checkFloatingOrComplex(LD, "torch.linalg.ldl_solve")
+    linearSolveCheckInputs(B, LD, "torch.linalg.ldl_solve")
+    torch._check(
+        B.ndim >= 2,
+        lambda: (
+            f"torch.linalg.ldl_solve: Expected B to have at least 2 dimensions, "
+            f"but it has {B.ndim} dimensions instead"
+        ),
+    )
+
+    # pivots must carry one entry per row of every matrix in LD.
+    pivots_shape_wanted = LD.shape[:-1]
+    torch._check(
+        pivots_shape_wanted == pivots.shape,
+        lambda: (
+            f"torch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, "
+            f"but got pivots with shape {pivots.shape} instead"
+        ),
+    )
+    torch._check(
+        utils.is_integer_dtype(pivots.dtype),
+        lambda: f"torch.linalg.ldl_solve: Expected pivots to be integers. Got {pivots.dtype}",
+    )
+    torch._check(
+        LD.dtype == B.dtype,
+        lambda: f"torch.linalg.ldl_solve: LD dtype {LD.dtype} does not match b dtype {B.dtype}",
+    )
+
+    out_size, _ = _linalg_broadcast_batch_dims(B, LD)
+    out_strides = make_contiguous_strides_for(out_size, row_major=False)
+    return torch.empty_strided(
+        size=out_size,
+        stride=out_strides,
+        dtype=B.dtype,
+        device=B.device,
+    )
+
+
+@register_meta([aten.linalg_lu.default, aten.linalg_lu.out])
+@out_wrapper("P", "L", "U")
+def linalg_lu_meta(A: Tensor, *, pivot: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
+    """Meta implementation for ``aten.linalg_lu``.
+
+    With ``A`` of shape ``(*batch, m, n)`` and ``k = min(m, n)`` the outputs
+    are ``P: (*batch, m, m)`` (or a 1-D tensor of size 0 when ``pivot`` is
+    false), ``L: (*batch, m, k)`` and ``U: (*batch, k, n)``, all allocated
+    with ``A.new_empty``.
+    """
+    torch._check(
+        A.ndim >= 2,
+        lambda: f"linalg.lu: Expected tensor with 2 or more dimensions. Got size: {A.shape} instead",
+    )
+
+    full_shape = list(A.shape)
+    batch = full_shape[:-2]
+    m = full_shape[-2]
+    n = full_shape[-1]
+    k = min(m, n)
+
+    P = A.new_empty(batch + [m, m]) if pivot else A.new_empty([0])
+    L = A.new_empty(batch + [m, k])
+    U = A.new_empty(batch + [k, n])
+    return P, L, U
+
+
+@register_meta([aten.linalg_lu_factor_ex.default, aten.linalg_lu_factor_ex.out])
+@out_wrapper("LU", "pivots", "info")
+def linalg_lu_factor_ex_meta(
+    A: Tensor, *, pivot: bool = True, check_errors: bool = False
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """Meta implementation for ``aten.linalg_lu_factor_ex``.
+
+    For ``A`` of shape ``(*batch, m, n)`` returns ``LU`` (same shape/dtype/
+    device as ``A``, strides from ``make_contiguous_strides_for(...,
+    row_major=False)``), ``pivots`` of shape ``(*batch, min(m, n))`` and
+    ``info`` of shape ``batch``; the latter two are ``torch.int``.  ``pivot``
+    and ``check_errors`` are not read here.
+    """
+    torch._check(
+        A.ndim >= 2,
+        lambda: f"torch.lu_factor: Expected tensor with 2 or more dimensions. Got size: {A.shape} instead",
+    )
+
+    lu_shape = list(A.shape)
+    rows = lu_shape[-2]
+    cols = lu_shape[-1]
+    batch = lu_shape[:-2]
+
+    LU = torch.empty_strided(
+        size=lu_shape,
+        stride=make_contiguous_strides_for(lu_shape, row_major=False),
+        dtype=A.dtype,
+        device=A.device,
+    )
+
+    # One pivot index per factored row/column, per batch entry.
+    pivots = A.new_empty(batch + [min(rows, cols)], dtype=torch.int)
+
+    # One status value per batch entry.
+    info = A.new_empty(batch, dtype=torch.int)
+
+    return LU, pivots, info
+
+
+@register_meta([aten.linalg_lu_solve.default, aten.linalg_lu_solve.out])
+@out_wrapper()
+def linalg_lu_solve_meta(
+    LU: Tensor,
+    pivots: Tensor,
+    B: Tensor,
+    *,
+    left: bool = True,
+    adjoint: bool = False,
+) -> Tensor:
+    """Meta implementation for ``aten.linalg_lu_solve``.
+
+    Validates dtypes (``LU`` floating/complex, ``B`` same dtype as ``LU``,
+    ``pivots`` ``torch.int``), matrix shapes and batch shapes, then returns an
+    empty tensor sized like ``B`` broadcast against ``LU``.  Strides are
+    computed with ``row_major=not left``.  When ``left`` is false and the
+    result is non-empty and complex, ``.conj()`` of the result is returned.
+    ``adjoint`` is not read here.
+    """
+    # --- dtype checks ---
+    checkFloatingOrComplex(LU, "torch.linalg.lu_solve")
+    torch._check(
+        LU.dtype == B.dtype,
+        lambda: (
+            f"linalg.lu_solve: Expected LU and B to have the same dtype, "
+            f"but found LU of type {LU.dtype} and B of type {B.dtype} instead"
+        ),
+    )
+    torch._check(
+        pivots.dtype == torch.int,
+        lambda: "linalg.lu_solve: pivots should be a Tensor of scalar type torch.int32",
+    )
+
+    # --- per-matrix shape checks ---
+    squareCheckInputs(LU, "torch.linalg.lu_solve")
+    checkInputsSolver(LU, B, left, "linalg.lu_solve")
+    torch._check(
+        LU.size(-1) == pivots.size(-1),
+        lambda: "linalg.lu_solve: Number of pivots per batch should be same as the dimension of the matrix",
+    )
+
+    # --- batch shape check ---
+    torch._check(
+        LU.shape[:-1] == pivots.shape,
+        lambda: (
+            f"linalg.lu_solve: Expected LU.shape[:-1] and pivots.shape to be the same, "
+            f"but got pivots with shape {pivots.shape} instead"
+        ),
+    )
+
+    out_size, _ = _linalg_broadcast_batch_dims(B, LU)
+    solution = torch.empty_strided(
+        size=out_size,
+        stride=make_contiguous_strides_for(out_size, row_major=not left),
+        dtype=B.dtype,
+        device=B.device,
+    )
+
+    if solution.numel() != 0 and not left and solution.is_complex():
+        solution = solution.conj()
+
+    return solution
+
+
+@register_meta(aten.lu_unpack)
+@out_wrapper("P", "L", "U")
+def lu_unpack_meta(
+    LU: Tensor,
+    pivots: Tensor,
+    unpack_data: bool = True,
+    unpack_pivots: bool = True,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """Meta implementation registered for ``aten.lu_unpack``.
+
+    With ``LU`` of shape ``(*batch, m, n)`` and ``k = min(m, n)``:
+    ``P`` is ``(*batch, m, m)`` when ``unpack_pivots`` else a size-0 1-D
+    tensor; ``L``/``U`` are ``(*batch, m, k)``/``(*batch, k, n)`` when
+    ``unpack_data`` else size-0 1-D tensors.  The pivots dtype is only checked
+    when ``unpack_pivots`` is true.
+    """
+    torch._check(
+        LU.ndim >= 2,
+        lambda: f"torch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: {LU.shape} instead",
+    )
+    if unpack_pivots:
+        torch._check(
+            pivots.dtype == torch.int32,
+            lambda: (
+                "torch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.\n"
+                "Note: this function is intended to be used with the output produced by torch.linalg.lu_factor"
+            ),
+        )
+
+    full_shape = list(LU.shape)
+    batch = full_shape[:-2]
+    m = full_shape[-2]
+    n = full_shape[-1]
+    k = min(m, n)
+
+    P = LU.new_empty(batch + [m, m]) if unpack_pivots else LU.new_empty([0])
+
+    if not unpack_data:
+        L = LU.new_empty([0])
+        U = LU.new_empty([0])
+        return P, L, U
+
+    L = LU.new_empty(batch + [m, k])
+    U = LU.new_empty(batch + [k, n])
+    return P, L, U
+
+
+# parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced)
+def _parse_qr_mode(mode: str) -> Tuple[bool, bool]:
+    """Translate a QR ``mode`` string into ``(compute_q, reduced)``.
+
+    ``"reduced"`` -> ``(True, True)``, ``"complete"`` -> ``(True, False)``,
+    ``"r"`` -> ``(False, True)`` (the second flag is irrelevant in that mode).
+    Any other value fails a ``torch._check`` with a message listing the
+    accepted modes.
+    """
+    flags_by_mode = {
+        "reduced": (True, True),
+        "complete": (True, False),
+        "r": (False, True),  # `reduced` is actually irrelevant in this mode
+    }
+    if mode not in flags_by_mode:
+        torch._check(
+            False,
+            lambda: (
+                f"qr received unrecognized mode '{mode}' "
+                f"but expected one of 'reduced' (default), 'r', or 'complete'"
+            ),
+        )
+    compute_q, reduced = flags_by_mode[mode]
+    return compute_q, reduced
+
+
+@register_meta([aten.linalg_qr.default, aten.linalg_qr.out])
+@out_wrapper("Q", "R")
+def linalg_qr_meta(
+    A: Tensor,
+    mode: str = "reduced",
+) -> Tuple[Tensor, Tensor]:
+    """Meta implementation for ``aten.linalg_qr``.
+
+    For ``A`` of shape ``(*batch, m, n)`` with ``k = min(m, n)``:
+    ``Q`` is ``(*batch, m, k)`` in reduced mode, ``(*batch, m, m)`` in
+    complete mode, and a size-0 1-D tensor when Q is not computed;
+    ``R`` is ``(*batch, m, n)`` only when Q is computed in complete mode and
+    ``(*batch, k, n)`` otherwise.  Non-empty outputs are re-strided with
+    ``make_contiguous_strides_for(..., row_major=False)``.
+    """
+    checkIsMatrix(A, "linalg.qr")
+    checkFloatingOrComplex(A, "linalg.qr")
+
+    compute_q, reduced_mode = _parse_qr_mode(mode)
+
+    rows = A.shape[-2]
+    cols = A.shape[-1]
+    inner = min(rows, cols)
+
+    if not compute_q:
+        Q = A.new_empty([0])
+    else:
+        q_shape = list(A.shape)
+        q_shape[-1] = inner if reduced_mode else rows
+        Q = A.new_empty(q_shape)
+        Q.as_strided_(q_shape, make_contiguous_strides_for(q_shape, row_major=False))
+
+    # R keeps all `rows` only for a computed-Q, non-reduced factorization.
+    r_shape = list(A.shape)
+    r_shape[-2] = rows if (compute_q and not reduced_mode) else inner
+    R = A.new_empty(r_shape)
+    R.as_strided_(r_shape, make_contiguous_strides_for(r_shape, row_major=False))
+    return Q, R
+
+
+@register_meta([aten._linalg_slogdet.default, aten._linalg_slogdet.sign])
+@out_wrapper("sign", "logabsdet", "LU", "pivots")
+def _linalg_slogdet(A: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+    """Meta implementation for ``aten._linalg_slogdet``.
+
+    Returns ``(sign, logabsdet, LU, pivots)``.  ``sign`` and ``logabsdet`` are
+    shaped like ``A`` without its last two dims (``logabsdet`` uses
+    ``toRealValueType(A.dtype)``); ``LU`` matches ``A`` with strides from
+    ``make_contiguous_strides_for(shape, False)``; ``pivots`` is int32 and
+    drops only the last dim.
+    """
+    squareCheckInputs(A, "linalg.slogdet")
+    checkFloatingOrComplex(A, "linalg.slogdet", False)
+
+    full_shape = A.shape
+    batch_shape = full_shape[:-2]
+
+    sign = A.new_empty(batch_shape)
+    logabsdet = A.new_empty(batch_shape, dtype=toRealValueType(A.dtype))
+    lu_strides = make_contiguous_strides_for(full_shape, False)
+    LU = torch.empty_strided(
+        size=full_shape,
+        stride=lu_strides,
+        dtype=A.dtype,
+        device=A.device,
+    )
+    pivots = A.new_empty(full_shape[:-1], dtype=torch.int32)
+    return sign, logabsdet, LU, pivots
+
+
+# From aten/src/ATen/native/BatchLinearAlgebra.cpp
+# NOTE: matching defaults in aten/src/ATen/native/native_functions.yaml
+@register_meta(aten._linalg_svd.default)
+def _linalg_svd_meta(
+    A: Tensor,
+    full_matrices: bool = False,
+    compute_uv: bool = True,
+    driver: Optional[str] = None,
+):
+    """Meta implementation registered for ``aten._linalg_svd.default``.
+
+    For ``A`` of shape ``(*batch, m, n)`` with ``k = min(m, n)`` returns
+    ``(U, S, V)``.  When ``compute_uv`` is true, ``U`` is ``(*batch, m, m|k)``
+    and ``V`` is ``(*batch, n|k, n)`` (the larger size when ``full_matrices``);
+    otherwise both are size-0 1-D placeholders.  ``S`` is ``(*batch, k)`` with
+    dtype ``toRealValueType(A.dtype)``.  ``driver`` is not read here.
+    """
+    checkIsMatrix(A, "linalg.svd")
+    checkFloatingOrComplex(A, "linalg.svd")
+
+    batch = list(A.shape[:-2])
+    rows = A.shape[-2]
+    cols = A.shape[-1]
+    rank = min(rows, cols)
+
+    if not compute_uv:
+        # Placeholders only; their contents/shape do not matter.
+        U = A.new_empty([0])
+        V = A.new_empty([0])
+    else:
+        u_shape = batch + [rows, rows if full_matrices else rank]
+        U = A.new_empty(u_shape)
+        U.as_strided_(u_shape, make_contiguous_strides_for(u_shape, row_major=False))
+
+        v_shape = batch + [cols if full_matrices else rank, cols]
+        V = A.new_empty(v_shape)
+        # NB: CUDA is used as a stand-in because cuSolver itself cannot be
+        # detected.  This may be wrong on CPU when fake_device is unavailable,
+        # since device_hint then defaults to CUDA.  See _linalg_svd meta in core.
+        on_cuda = device_hint(A) == "cuda"
+        V.as_strided_(v_shape, make_contiguous_strides_for(v_shape, row_major=on_cuda))
+
+    # Singular values are real even for complex A.
+    S = A.new_empty(batch + [rank], dtype=toRealValueType(A.dtype))
+    return U, S, V
+
+
+def _linalg_broadcast_batch_dims(
+    arg1: Tensor, arg2: Tensor
+) -> Tuple[List[int], List[int]]:
+    """Compute the sizes of ``arg1`` and ``arg2`` after batch broadcasting.
+
+    Everything except the last two dims of each argument is broadcast
+    together via ``_broadcast_shapes``; each returned list is that common
+    batch portion followed by the argument's own last two sizes.
+    """
+    common_batch = _broadcast_shapes(arg1.shape[:-2], arg2.shape[:-2])
+
+    size1 = list(common_batch)
+    size1 += [arg1.size(-2), arg1.size(-1)]
+
+    size2 = list(common_batch)
+    size2 += [arg2.size(-2), arg2.size(-1)]
+    return size1, size2
+
+
+def _linalg_broadcast_batch_dims_name(
+    arg1: Tensor, arg2: Tensor, name: Optional[str]
+) -> Tuple[Tensor, Tensor]:
+    """Return ``arg1`` and ``arg2`` expanded to their batch-broadcast sizes.
+
+    A truthy ``name`` additionally runs ``linearSolveCheckInputs`` first; a
+    falsy one skips validation.  Each argument is returned as-is when its
+    target size compares equal to its current shape and is ``expand``-ed
+    otherwise.
+    """
+    # No name means the caller does not want input errors checked here.
+    if name:
+        linearSolveCheckInputs(arg1, arg2, name)
+
+    size1, size2 = _linalg_broadcast_batch_dims(arg1, arg2)
+
+    # NOTE(review): size1/size2 are lists while `.shape` is presumably a
+    # torch.Size (tuple subclass); list == tuple is False in Python, so the
+    # expand branch may always be taken -- confirm before relying on identity.
+    if size1 == arg1.shape:
+        first = arg1
+    else:
+        first = arg1.expand(size1)
+
+    if size2 == arg2.shape:
+        second = arg2
+    else:
+        second = arg2.expand(size2)
+    return first, second
+
+
+def linalg_solve_is_vector_rhs(input: Tensor, other: Tensor) -> bool:
+    """Tell whether ``other`` is treated as a (batch of) vector right-hand side.
+
+    True when ``other`` is 1-D, or when it has exactly one dim fewer than
+    ``input`` and its shape equals ``input.shape[:-1]``.
+    """
+    if other.ndim == 1:
+        return True
+    batched_vector_shape = input.shape[:-1]
+    return input.ndim - 1 == other.ndim and other.shape == batched_vector_shape
+
+
+@register_meta(aten._linalg_solve_ex)
+def _linalg_solve_ex(
+    A: Tensor,
+    B: Tensor,
+    *,
+    left: bool = True,
+    check_errors: bool = False,
+    result: Optional[Tensor] = None,
+    LU: Optional[Tensor] = None,
+    pivots: Optional[Tensor] = None,
+    info: Optional[Tensor] = None,
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+    """Meta implementation registered for ``aten._linalg_solve_ex``.
+
+    Validates ``A``/``B`` and builds four fresh outputs
+    ``(result, LU, pivots, info)``:
+
+    * ``result``: ``B`` broadcast against ``A`` (the trailing unit dim is
+      dropped again in the vector-RHS case), strides computed with
+      ``row_major=not left``;
+    * ``LU``: same shape/dtype/device as ``A``, strides for ``row_major=False``;
+    * ``pivots``: int32, ``A.shape[:-1]``;
+    * ``info``: int32, ``A.shape[:-2]``.
+
+    When *all four* out tensors are supplied, each is resized, re-strided and
+    copied into in place.  The freshly built tensors are what is returned in
+    every case.  ``check_errors`` is not read here.
+    """
+    checkFloatingOrComplex(A, "linalg.solve")
+    torch._check(
+        A.dtype == B.dtype,
+        lambda: (
+            f"linalg.solve: Expected A and B to have the same dtype, but found A of type "
+            f"{A.dtype} and B of type {B.dtype} instead"
+        ),
+    )
+    vector_case = linalg_solve_is_vector_rhs(A, B)
+    # Treat a vector right-hand side as a single-column matrix for the checks.
+    B_ = B.unsqueeze(-1) if vector_case else B
+    checkInputsSolver(A, B_, left, "linalg.solve")
+    B_broad_shape, _ = _linalg_broadcast_batch_dims(B_, A)
+    torch._check(
+        left or not vector_case,
+        lambda: (
+            "linalg.solve: Vector broadcasting of the left hand side is not supported for left=False. "
+            "In this case linalg.solve is equivalent to B / A.squeeze(-1)"
+        ),
+    )
+    result_shape = B_broad_shape[:-1] if vector_case else B_broad_shape
+    result_ = torch.empty_strided(
+        size=result_shape,
+        stride=make_contiguous_strides_for(result_shape, not left),
+        dtype=B.dtype,
+        device=B.device,
+    )
+    shape = A.shape
+    LU_ = torch.empty_strided(
+        size=shape,
+        stride=make_contiguous_strides_for(shape, False),
+        dtype=A.dtype,
+        device=A.device,
+    )
+    pivots_ = A.new_empty(shape[:-1], dtype=torch.int32)
+    info_ = A.new_empty(shape[:-2], dtype=torch.int32)
+    out = (result, LU, pivots, info)
+    res = (result_, LU_, pivots_, info_)
+    # Out tensors are only honoured when every one of them was provided.
+    if all(x is not None for x in out):
+        for r, o in zip(res, out):
+            # resize and copy operations are done in-place
+            _maybe_resize_out(o, r.shape)  # type: ignore[arg-type]
+            # strides are not copied in out_wrapper
+            o.as_strided_(r.shape, r.stride())  # type: ignore[union-attr]
+            _safe_copy_out(copy_from=r, copy_to=o, exact_dtype=False)  # type: ignore[arg-type]
+    return res
+
+
+@register_meta([aten.linalg_solve_triangular.default, aten.linalg_solve_triangular.out])
+def linalg_solve_triangular_meta(
+    A: Tensor,
+    B: Tensor,
+    *,
+    upper: bool,
+    left: bool = True,
+    unitriangular: bool = False,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    """Meta implementation for ``aten.linalg_solve_triangular``.
+
+    Creates a size-0 ``out`` when none is given, validates the inputs with
+    ``checkInputsSolver`` and sizes ``out`` like ``B`` broadcast against
+    ``A``.  How ``out`` is resized depends on whether the broadcast ``A`` is
+    contiguous after transposing its last two dims *and* is conjugated.
+    ``upper`` and ``unitriangular`` are not read here.
+    """
+    if out is None:
+        out = A.new_empty([0])
+    assert isinstance(out, TensorLike)
+
+    checkInputsSolver(A, B, left, "linalg.solve_triangular")
+    rhs, coeff = _linalg_broadcast_batch_dims_name(B, A, None)
+
+    avoid_copy_A = coeff.transpose(-2, -1).is_contiguous() and coeff.is_conj()
+    if avoid_copy_A:
+        out = _maybe_resize_out(out, rhs.shape)
+    elif _resize_output_check(out, rhs.shape):
+        # Re-implementation of resize_output that leaves the result F-contig:
+        # resize to the transposed shape, then transpose back in place.
+        out.resize_(rhs.transpose(-2, -1).shape)
+        out.transpose_(-2, -1)
+    return out  # type: ignore[return-value]
+
+
+@register_meta(aten.triangular_solve)
+@out_wrapper("solution", "cloned_coefficient")
+def triangular_solve_meta(
+    self: Tensor,
+    A: Tensor,
+    upper: bool = True,
+    transpose: bool = False,
+    unitriangular: bool = False,
+) -> Tuple[Tensor, Tensor]:
+    """Meta implementation registered for ``aten.triangular_solve``.
+
+    Returns ``(solution, cloned_coefficient)``.  For a strided ``A`` both
+    outputs take the batch-broadcast sizes of ``self`` and ``A`` with strides
+    from ``make_contiguous_strides_for(..., row_major=False)``.  For sparse
+    CSR/BSR ``A`` the solution is ``empty_like(self)`` and the coefficient is
+    a size-0 tensor.  Any other layout fails a ``torch._check``.  The boolean
+    flags are not read here.
+    """
+    torch._check(
+        self.ndim >= 2,
+        lambda: (
+            f"torch.triangular_solve: Expected b to have at least 2 dimensions, "
+            f"but it has {self.ndim} dimensions instead"
+        ),
+    )
+    torch._check(
+        A.ndim >= 2,
+        lambda: (
+            f"torch.triangular_solve: Expected A to have at least 2 dimensions, "
+            f"but it has {A.ndim} dimensions instead"
+        ),
+    )
+
+    linearSolveCheckInputs(self, A, "triangular_solve")
+
+    if A.layout == torch.strided:
+        rhs_size, coeff_size = _linalg_broadcast_batch_dims(self, A)
+        solution = torch.empty_strided(
+            size=rhs_size,
+            stride=make_contiguous_strides_for(rhs_size, row_major=False),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        cloned_coefficient = torch.empty_strided(
+            size=coeff_size,
+            stride=make_contiguous_strides_for(coeff_size, row_major=False),
+            dtype=A.dtype,
+            device=A.device,
+        )
+    elif A.layout in (torch.sparse_csr, torch.sparse_bsr):
+        solution = torch.empty_like(self)
+        cloned_coefficient = self.new_empty([0])
+    else:
+        torch._check(False, lambda: "triangular_solve: Got an unexpected layout.")
+    return solution, cloned_coefficient  # type: ignore[possibly-undefined]
+
+
+# From aten/src/ATen/native/LinearAlgebra.cpp
+@register_meta(aten._linalg_det.default)
+def _linalg_det_meta(A):
+    """Meta implementation registered for ``aten._linalg_det.default``.
+
+    Returns ``(det, LU, pivots)``: ``det`` is shaped like ``A`` without its
+    last two dims, ``LU`` is shaped like ``A`` and re-strided with
+    ``make_contiguous_strides_for(..., row_major=False)``, and ``pivots`` is
+    int32 with shape ``A.shape[:-1]``.
+    """
+    squareCheckInputs(A, "linalg.det")
+    checkFloatingOrComplex(A, "linalg.det")
+
+    full_shape = A.shape
+    det = A.new_empty(full_shape[:-2])
+
+    LU = A.new_empty(full_shape)
+    LU.as_strided_(full_shape, make_contiguous_strides_for(full_shape, row_major=False))
+
+    pivots = A.new_empty(full_shape[:-1], dtype=torch.int32)
+    return det, LU, pivots
+
+
+@register_meta(aten.ormqr)
+@out_wrapper()
+def ormqr(
+    input: Tensor,
+    tau: Tensor,
+    other: Tensor,
+    left: bool = True,
+    transpose: bool = False,
+) -> Tensor:
+    """Meta implementation registered for ``aten.ormqr``.
+
+    Runs the shape, batch, dtype and device checks below (in that order) on
+    ``input``, ``tau`` and ``other`` and returns an empty tensor with
+    ``other``'s shape, dtype and device whose strides come from
+    ``make_contiguous_strides_for(..., row_major=False)``.  ``left`` selects
+    which dim of ``other`` (-2 or -1) is compared against ``tau``/``input``;
+    ``transpose`` is not read here.
+    """
+    torch._check(
+        input.ndim >= 2, lambda: "torch.ormqr: input must have at least 2 dimensions."
+    )
+    torch._check(
+        other.ndim >= 2, lambda: "torch.ormqr: other must have at least 2 dimensions."
+    )
+
+    # Dim of `other` that has to line up with the reflectors.
+    match_dim = -2 if left else -1
+    torch._check(
+        other.shape[match_dim] >= tau.shape[-1],
+        lambda: f"torch.ormqr: other.shape[{match_dim}] must be greater than or equal to tau.shape[-1]",
+    )
+    torch._check(
+        other.shape[match_dim] == input.shape[-2],
+        lambda: f"torch.ormqr: other.shape[{match_dim}] must be equal to input.shape[-2]",
+    )
+
+    torch._check(
+        tau.shape[-1] <= input.shape[-1],
+        lambda: "torch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]",
+    )
+
+    torch._check(
+        input.ndim - tau.ndim == 1,
+        lambda: (
+            f"torch.ormqr: Expected tau to have one dimension less than input, "
+            f"but got tau.ndim equal to {tau.ndim} and input.ndim is equal to {input.ndim}"
+        ),
+    )
+    torch._check(
+        input.ndim == other.ndim,
+        lambda: (
+            f"torch.ormqr: Expected other to have the same number of dimensions as input, "
+            f"but got other.ndim equal to {other.ndim} and input.ndim is equal to {input.ndim}"
+        ),
+    )
+
+    if input.ndim > 2:
+        # Batched call: tau and other must share input's leading dims.
+        input_batch = input.shape[:-2]
+
+        tau_batch = tau.shape[:-1]
+        torch._check(
+            tau_batch == input_batch,
+            lambda: (
+                f"torch.ormqr: Expected batch dimensions of tau to be "
+                f"equal to input.shape[:-2], but got {tau_batch}"
+            ),
+        )
+
+        other_batch = other.shape[:-2]
+        torch._check(
+            other_batch == input_batch,
+            lambda: (
+                f"torch.ormqr: Expected batch dimensions of other to be "
+                f"equal to input.shape[:-2], but got {other_batch}"
+            ),
+        )
+
+    torch._check(
+        tau.dtype == input.dtype,
+        lambda: (
+            f"torch.ormqr: Expected input and tau to have the same dtype, "
+            f"but input has dtype {input.dtype} and tau has dtype {tau.dtype}"
+        ),
+    )
+    torch._check(
+        other.dtype == input.dtype,
+        lambda: (
+            f"torch.ormqr: Expected input and other to have the same dtype, "
+            f"but input has dtype {input.dtype} and other has dtype {other.dtype}"
+        ),
+    )
+
+    checkSameDevice("torch.ormqr", tau, input, "tau")
+    checkSameDevice("torch.ormqr", other, input, "other")
+
+    out_shape = other.shape
+    out_strides = make_contiguous_strides_for(out_shape, row_major=False)
+    return torch.empty_strided(
+        size=out_shape,
+        stride=out_strides,
+        dtype=other.dtype,
+        device=other.device,
+    )
+
+
+def _padding_check_valid_input(input, padding, *, dim):
+    """Validate ``padding`` length and ``input`` sizes for ``dim``-D padding.
+
+    ``padding`` must hold ``2 * dim`` entries.  ``input`` counts as batched
+    when it has ``dim + 2`` dims; in that case every dim except dim 0 must be
+    non-zero, otherwise every dim must be non-zero.  A failed condition is
+    reported through ``torch._check``.
+    """
+    torch._check(
+        len(padding) == 2 * dim,
+        lambda: f"padding size is expected to be {2 * dim}, but got: {len(padding)}",
+    )
+
+    rank = input.ndim
+    batched = rank == (dim + 2)
+
+    if batched:
+        batched_ok, unbatched_ok = batched, not batched
+        # Dim 0 (the batch) is skipped so that an empty batch is accepted.
+        for axis in range(1, rank):
+            batched_ok = batched_ok and input.size(axis) != 0
+    else:
+        batched_ok, unbatched_ok = batched, not batched
+        for axis in range(rank):
+            unbatched_ok = unbatched_ok and input.size(axis) != 0
+
+    # Only the batch size may be empty; every other dim must be non-zero.
+    torch._check(
+        batched_ok or unbatched_ok,
+        lambda: (
+            f"Expected {dim + 1}D or {dim + 2}D (batch mode) tensor with possibly 0 batch size "
+            f"and other non-zero dimensions for input, but got: {input.shape}"
+        ),
+    )
+
+
+def _pad1d_common(input, padding, *, is_reflection):
+    """Shared shape logic for the 1-D reflection/replication padding metas.
+
+    Plane and width dims are 0/1, shifted to 1/2 for a 3-D input whose dim 0
+    is then the batch.  The output width is the input width plus both pads
+    and must be at least 1; with ``is_reflection`` each pad must also be
+    smaller than the input width.  Returns ``input.new_empty`` of
+    ``(nplane, output_w)`` for 2-D input and ``(nbatch, nplane, output_w)``
+    otherwise.
+    """
+    has_batch = input.ndim == 3
+    nbatch = input.size(0) if has_batch else 1
+    dim_plane = 1 if has_batch else 0
+    dim_w = 2 if has_batch else 1
+
+    _padding_check_valid_input(input, padding, dim=1)
+
+    pad_l, pad_r = padding
+
+    nplane = input.size(dim_plane)
+    input_w = input.size(dim_w)
+    output_w = input_w + pad_l + pad_r
+
+    if is_reflection:
+        torch._check(
+            pad_l < input_w and pad_r < input_w,
+            lambda: (
+                f"Argument #4: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}"
+            ),
+        )
+
+    torch._check(
+        output_w >= 1,
+        lambda: f"input (W: {input_w}) is too small. Calculated output W: {output_w}",
+    )
+
+    if input.ndim == 2:
+        out_shape = (nplane, output_w)
+    else:
+        out_shape = (nbatch, nplane, output_w)
+    return input.new_empty(out_shape)
+
+
+@register_meta(aten.reflection_pad1d)
+@out_wrapper()
+def meta_reflection_pad1d(input, padding):
+    """Meta for ``aten.reflection_pad1d``: ``_pad1d_common`` in reflection mode."""
+    padded = _pad1d_common(input, padding, is_reflection=True)
+    return padded
+
+
+@register_meta(aten.replication_pad1d)
+@out_wrapper()
+def meta_replication_pad1d(input, padding):
+    """Meta for ``aten.replication_pad1d``: ``_pad1d_common`` without reflection checks."""
+    padded = _pad1d_common(input, padding, is_reflection=False)
+    return padded
+
+
+def _pad1d_backward_common(grad_output, input, padding, *, is_reflection):
+    """Shared shape logic for the 1-D padding backward metas.
+
+    The width dim is 1, or 2 for a 3-D ``input``.  The padded width
+    (input width plus both pads) must equal ``grad_output``'s width.  The
+    explicit ``len(padding) == 2`` check runs only when ``is_reflection`` is
+    false; the "pad smaller than input width" check runs only when it is
+    true.  Returns an empty tensor shaped like ``input``.
+    """
+    if not is_reflection:
+        torch._check(len(padding) == 2, lambda: "padding size is expected to be 2")
+
+    dim_w = 2 if input.ndim == 3 else 1
+
+    pad_l, pad_r = padding
+
+    input_w = input.size(dim_w)
+    output_w = input_w + pad_l + pad_r
+
+    if is_reflection:
+        torch._check(
+            pad_l < input_w and pad_r < input_w,
+            lambda: (
+                f"Argument #4: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}"
+            ),
+        )
+
+    torch._check(
+        output_w == grad_output.size(dim_w),
+        lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}",
+    )
+
+    grad_input = input.new_empty(input.shape)
+    return grad_input
+
+
+@register_meta(aten.reflection_pad1d_backward)
+@out_wrapper("grad_input")
+def meta_reflection_pad1d_backward(grad_output, input, padding):
+    """Meta for ``aten.reflection_pad1d_backward``; defers to ``_pad1d_backward_common``."""
+    grad_input = _pad1d_backward_common(
+        grad_output, input, padding, is_reflection=True
+    )
+    return grad_input
+
+
+@register_meta(aten.replication_pad1d_backward)
+@out_wrapper("grad_input")
+def meta_replication_pad1d_backward(grad_output, input, padding):
+    """Meta for ``aten.replication_pad1d_backward``; defers to ``_pad1d_backward_common``."""
+    grad_input = _pad1d_backward_common(
+        grad_output, input, padding, is_reflection=False
+    )
+    return grad_input
+
+
+def _pad2d_common(input, padding, *, is_reflection):
+    """Shared shape logic for the 2-D reflection/replication padding metas.
+
+    Plane/height/width dims are 0/1/2, each shifted by one for a 4-D input
+    whose dim 0 is then the batch.  ``padding`` unpacks as
+    ``(left, right, top, bottom)``.  With ``is_reflection`` each pad must be
+    smaller than the matching input size.  Returns ``input.new_empty`` of
+    ``(nplane, out_h, out_w)`` for 3-D input and
+    ``(nbatch, nplane, out_h, out_w)`` otherwise.
+    """
+    _padding_check_valid_input(input, padding, dim=2)
+
+    has_batch = input.ndim == 4
+    nbatch = input.size(0) if has_batch else 1
+    offset = 1 if has_batch else 0
+    dim_slices = 0 + offset
+    dim_h = 1 + offset
+    dim_w = 2 + offset
+
+    pad_l, pad_r, pad_t, pad_b = padding
+
+    nplane = input.size(dim_slices)
+    input_h = input.size(dim_h)
+    input_w = input.size(dim_w)
+    output_h = input_h + pad_t + pad_b
+    output_w = input_w + pad_l + pad_r
+
+    if is_reflection:
+        torch._check(
+            pad_l < input_w and pad_r < input_w,
+            lambda: (
+                f"Argument #4: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}"
+            ),
+        )
+        torch._check(
+            pad_t < input_h and pad_b < input_h,
+            lambda: (
+                f"Argument #6: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_t}, {pad_b}) at dimension {dim_h} of input {input.shape}"
+            ),
+        )
+
+    # NOTE(review): this passes when *either* output extent is >= 1; the
+    # message suggests both may have been intended -- confirm against the
+    # native kernel before changing.
+    torch._check(
+        output_w >= 1 or output_h >= 1,
+        lambda: (
+            f"input (H: {input_h} W: {input_w}) is too small. "
+            f"Calculated output H: {output_h} W: {output_w}"
+        ),
+    )
+
+    if input.ndim == 3:
+        out_shape = (nplane, output_h, output_w)
+    else:
+        out_shape = (nbatch, nplane, output_h, output_w)
+    return input.new_empty(out_shape)
+
+
+@register_meta(aten.reflection_pad2d)
+@out_wrapper()
+def meta_reflection_pad2d(input, padding):
+    """Meta for ``aten.reflection_pad2d``: ``_pad2d_common`` in reflection mode."""
+    padded = _pad2d_common(input, padding, is_reflection=True)
+    return padded
+
+
+@register_meta(aten.replication_pad2d)
+@out_wrapper()
+def meta_replication_pad2d(input, padding):
+    """Meta for ``aten.replication_pad2d``: ``_pad2d_common`` without reflection checks."""
+    padded = _pad2d_common(input, padding, is_reflection=False)
+    return padded
+
+
+@register_meta(
+    [
+        aten.reflection_pad2d_backward.default,
+        aten.reflection_pad2d_backward.grad_input,
+        aten.replication_pad2d_backward.default,
+        aten.replication_pad2d_backward.grad_input,
+    ]
+)
+@out_wrapper("grad_input")
+def meta_pad2d_backward(grad_output, self, padding):
+    """Meta implementation shared by the 2-D padding backward overloads.
+
+    Height/width dims are 1/2, or 2/3 when ``self`` is 4-D.  ``padding``
+    unpacks as ``(left, right, top, bottom)``.  The padded width and height
+    of ``self`` must equal the corresponding sizes of ``grad_output``;
+    otherwise ``torch._check`` fails.  Returns an empty tensor shaped like
+    ``self``.
+    """
+    dim_w = 2
+    dim_h = 1
+
+    self_shape = self.shape
+    if self.dim() == 4:
+        # A leading batch dim shifts the spatial dims by one.
+        dim_w += 1
+        dim_h += 1
+
+    pad_l, pad_r, pad_t, pad_b = padding
+
+    input_h = self_shape[dim_h]
+    input_w = self_shape[dim_w]
+    output_h = input_h + pad_t + pad_b
+    output_w = input_w + pad_l + pad_r
+
+    torch._check(
+        output_w == grad_output.size(dim_w),
+        lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}",
+    )
+    torch._check(
+        output_h == grad_output.size(dim_h),
+        lambda: f"grad_output height unexpected. Expected: {output_h}, Got: {grad_output.size(dim_h)}",
+    )
+    return self.new_empty(self.shape)
+
+
+def _pad3d_common(input, padding, *, is_reflection):
+    """Shared shape logic for the 3-D reflection/replication padding metas.
+
+    Plane/depth/height/width dims are 0/1/2/3, each shifted by one for a 5-D
+    input whose dim 0 is then the batch.  ``padding`` unpacks as
+    ``(left, right, top, bottom, front, back)``.  With ``is_reflection`` each
+    pad must be smaller than the matching input size.  Returns
+    ``input.new_empty`` of ``(nplane, out_d, out_h, out_w)``, prefixed with
+    the batch size for 5-D input.
+    """
+    _padding_check_valid_input(input, padding, dim=3)
+
+    batch_mode = input.ndim == 5
+    offset = 0
+    if batch_mode:
+        nbatch = input.size(0)
+        offset = 1
+    dim_plane = 0 + offset
+    dim_d = 1 + offset
+    dim_h = 2 + offset
+    dim_w = 3 + offset
+
+    pad_l, pad_r, pad_t, pad_b, pad_f, pad_bk = padding
+
+    nplane = input.size(dim_plane)
+    input_d = input.size(dim_d)
+    input_h = input.size(dim_h)
+    input_w = input.size(dim_w)
+    output_d = input_d + pad_f + pad_bk
+    output_h = input_h + pad_t + pad_b
+    output_w = input_w + pad_l + pad_r
+
+    if is_reflection:
+        torch._check(
+            pad_l < input_w and pad_r < input_w,
+            lambda: (
+                f"Argument #4: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}"
+            ),
+        )
+        torch._check(
+            pad_t < input_h and pad_b < input_h,
+            lambda: (
+                f"Argument #6: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_t}, {pad_b}) at dimension {dim_h} of input {input.shape}"
+            ),
+        )
+        torch._check(
+            pad_f < input_d and pad_bk < input_d,
+            lambda: (
+                f"Argument #8: Padding size should be less than the corresponding input dimension, "
+                f"but got: padding ({pad_f}, {pad_bk}) at dimension {dim_d} of input {input.shape}"
+            ),
+        )
+
+    # NOTE(review): this passes when *any* output extent is >= 1; the message
+    # suggests all of them may have been intended -- confirm against the
+    # native kernel before changing.
+    torch._check(
+        output_w >= 1 or output_h >= 1 or output_d >= 1,
+        lambda: (
+            f"input (D: {input_d} H: {input_h} W: {input_w}) is too small. "
+            f"Calculated output D: {output_d} H: {output_h} W: {output_w}"
+        ),
+    )
+
+    if not batch_mode:
+        return input.new_empty((nplane, output_d, output_h, output_w))
+    return input.new_empty((nbatch, nplane, output_d, output_h, output_w))  # type: ignore[possibly-undefined]
+
+
+@register_meta(aten.reflection_pad3d)
+@out_wrapper()
+def meta_reflection_pad3d(input, padding):
+    """Meta for ``aten.reflection_pad3d``: ``_pad3d_common`` in reflection mode."""
+    padded = _pad3d_common(input, padding, is_reflection=True)
+    return padded
+
+
+@register_meta(aten.replication_pad3d)
+@out_wrapper()
+def meta_replication_pad3d(input, padding):
+    """Meta for ``aten.replication_pad3d``: ``_pad3d_common`` without reflection checks."""
+    padded = _pad3d_common(input, padding, is_reflection=False)
+    return padded
+
+
+@register_meta(
+    [
+        aten.reflection_pad3d_backward.default,
+        aten.reflection_pad3d_backward.grad_input,
+        aten.replication_pad3d_backward.default,
+        aten.replication_pad3d_backward.grad_input,
+    ]
+)
+@out_wrapper("grad_input")
+def meta_pad3d_backward(grad_output, input, padding):
+    """Meta implementation shared by the 3-D padding backward overloads.
+
+    Requires six padding values, ``input.ndim > 3`` and equal ranks for
+    ``grad_output`` and ``input`` (the latter two via ``assert``).
+    Depth/height/width dims are 1/2/3, or 2/3/4 for 5-D input.  Each padded
+    extent of ``input`` must equal the matching size of ``grad_output``.
+    Returns an empty tensor shaped like ``input``.
+    """
+    torch._check(len(padding) == 6, lambda: "padding size is expected to be 6")
+    assert input.ndim > 3
+    assert grad_output.ndim == input.ndim
+
+    # A 5-D input carries a leading batch dim that shifts the spatial dims.
+    offset = 1 if input.ndim == 5 else 0
+    dim_d = 1 + offset
+    dim_h = 2 + offset
+    dim_w = 3 + offset
+
+    pad_l, pad_r, pad_t, pad_b, pad_f, pad_bk = padding
+
+    input_d = input.size(dim_d)
+    input_h = input.size(dim_h)
+    input_w = input.size(dim_w)
+    output_d = input_d + pad_f + pad_bk
+    output_h = input_h + pad_t + pad_b
+    output_w = input_w + pad_l + pad_r
+
+    torch._check(
+        output_w == grad_output.size(dim_w),
+        lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}",
+    )
+    torch._check(
+        output_h == grad_output.size(dim_h),
+        lambda: f"grad_output height unexpected. Expected: {output_h}, Got: {grad_output.size(dim_h)}",
+    )
+    torch._check(
+        output_d == grad_output.size(dim_d),
+        lambda: f"grad_output depth unexpected. Expected: {output_d}, Got: {grad_output.size(dim_d)}",
+    )
+
+    grad_input = input.new_empty(input.shape)
+    return grad_input
+
+
+@register_meta(aten._pdist_forward)
+@out_wrapper()
+def meta__pdist_forward(self: Tensor, p: float = 2) -> Tensor:
+    torch._check(
+        self.is_contiguous(), lambda: "_pdist_forward requires contiguous input"
+    )
+    n = self.size(0)  # extent of the first axis; the result has n * (n - 1) // 2 entries
+    if n <= 1:  # zero or one row: the result is empty
+        return self.new_empty([0]).to(memory_format=torch.legacy_contiguous_format)  # type: ignore[call-overload]
+    else:
+        return self.new_empty((n * (n - 1) // 2,)).to(
+            memory_format=torch.legacy_contiguous_format
+        )  # type: ignore[call-overload]
+
+
+@register_meta(aten._pdist_backward)
+@out_wrapper()
+def meta__pdist_backward(grad: Tensor, self: Tensor, p: float, pdist: Tensor) -> Tensor:
+    torch._check(
+        self.is_contiguous(), lambda: "_pdist_backward requires self to be contiguous"
+    )
+    torch._check(
+        pdist.is_contiguous(), lambda: "_pdist_backward requires pdist to be contiguous"
+    )
+    return torch.empty_like(self, memory_format=torch.legacy_contiguous_format)  # result mirrors self
+
+
+@register_meta([aten.baddbmm.default, aten.baddbmm.out])
+@out_wrapper()
+def meta_baddbmm(self, batch1, batch2, *, beta=1, alpha=1):
+    """Meta kernel for baddbmm: checks operand ranks, dtypes and sizes, then returns
+    an empty tensor shaped (batch1.size(0), batch1.size(1), batch2.size(2)).
+    """
+    torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor")  # rank first:
+    torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor")  # lookups below need 3 dims
+    batch1_sizes = batch1.shape
+    batch2_sizes = batch2.shape
+    bs = batch1_sizes[0]
+    contraction_size = batch1_sizes[2]
+    self = self.expand((bs, batch1_sizes[1], batch2_sizes[2]))
+    torch._check(
+        self.dtype == batch1.dtype == batch2.dtype,
+        lambda: f"Input dtypes must be the same, got: input: {self.dtype}, batch1: {batch1.dtype}, batch2: {batch2.dtype}",
+    )
+    torch._check(
+        batch2_sizes[0] == bs and batch2_sizes[1] == contraction_size,
+        lambda: (
+            f"Expected size for first two dimensions of batch2 tensor to be: "
+            f"[{bs}, {contraction_size}] but got: [{batch2_sizes[0]}, {batch2_sizes[1]}]."
+        ),
+    )
+    return self.new_empty(self.size())
+
+
+@register_meta([aten.bernoulli.default, aten.bernoulli.out])
+@out_wrapper()
+def meta_bernoulli(self, *, generator=None):  # generator is accepted but not used here
+    # https://github.com/pytorch/pytorch/issues/88612
+    return torch.empty_like(self).contiguous()
+
+
+@register_meta(aten.bernoulli_.float)
+def meta_bernoulli_(self, p=0.5, generator=None):  # p and generator are not used here
+    return self  # the input tensor object itself is returned
+
+
+@register_meta(aten.bernoulli.p)
+def meta_bernoulli_p(self, p=0.5, generator=None):  # p and generator are not used here
+    # https://github.com/pytorch/pytorch/issues/88612
+    return torch.empty_like(self).contiguous()
+
+
+@register_meta(aten._fused_moving_avg_obs_fq_helper.default)
+def meta__fused_moving_avg_obs_fq_helper(
+    self,
+    observer_on,
+    fake_quant_on,
+    running_min,
+    running_max,
+    scale,
+    zero_point,
+    averaging_const,
+    quant_min,
+    quant_max,
+    ch_axis,
+    per_row_fake_quant=False,
+    symmetric_quant=False,
+):
+    torch._check(  # only self and ch_axis are inspected; the other arguments are unused here
+        ch_axis < self.dim(),
+        lambda: "Error in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()",
+    )
+    mask = torch.empty_like(self, dtype=torch.bool)  # boolean tensor shaped like self
+    return (torch.empty_like(self), mask)  # (tensor like self, bool mask like self)
+
+
+@register_meta(aten.mm)
+@out_wrapper()
+def meta_mm(a, b):
+    torch._check(a.dim() == 2, lambda: "a must be 2D")
+    torch._check(b.dim() == 2, lambda: "b must be 2D")
+    N, M1 = a.shape  # a is (N, M1)
+    M2, P = b.shape  # b is (M2, P); M1 and M2 must agree
+    torch._check(
+        M1 == M2,
+        lambda: f"a and b must have same reduction dim, but got [{N}, {M1}] X [{M2}, {P}].",
+    )
+    return a.new_empty(N, P)  # result is (N, P), allocated from a
+
+
+def _compute_reduction_shape(self, dims, keepdim):
+    if keepdim:  # axes listed in dims are kept with extent 1
+        return tuple(self.shape[i] if i not in dims else 1 for i in range(self.ndim))
+    # keepdim is falsy: delegate the shape computation to utils.compute_reduction_output_shape.
+    return utils.compute_reduction_output_shape(self.shape, dims)
+
+
+# FakeTensors (meta tensors with a device) will report device as meta
+# when running meta kernels. Here, access the "fake device" of FakeTensor if it
+# exists so meta kernels whose behavior diverges per device will be more
+# accurate when run with FakeTensors.
+def device_hint(tensor) -> "str":
+    if isinstance(tensor, torch._subclasses.FakeTensor):
+        return tensor.fake_device.type  # device type string of the FakeTensor's fake device
+    else:
+        return "cuda"  # default to cuda
+
+
+def calc_conv_nd_return_shape(
+    input_tensor: torch.Tensor,
+    weight: torch.Tensor,
+    stride: Union[List[int], int],
+    padding: Union[List[int], int],
+    dilation: Union[List[int], int],
+    is_transposed: bool,
+    groups: int,
+    output_padding: Optional[Union[List[int], int]] = None,
+):
+    def _formula(ln: int, p: int, d: int, k: int, s: int) -> int:
+        """
+        Formula to apply to calculate the length of some dimension of the output
+
+        See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+
+        Args:
+            ln: length of the dimension
+            p: padding in that dim
+            d: dilation in that dim
+            k: kernel size in that dim
+            s: stride in that dim
+        Returns:
+            The output length
+        """
+        return (ln + 2 * p - d * (k - 1) - 1) // s + 1
+
+    def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int:
+        """
+        Formula to apply to calculate the length of some dimension of the output
+        if transposed convolution is used.
+        See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
+
+        Args:
+            ln: length of the dimension
+            p: padding in that dim
+            d: dilation in that dim
+            k: kernel size in that dim
+            s: stride in that dim
+            op: output padding in that dim
+
+        Returns:
+            The output length
+        """
+        return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1
+    # Spatial extents are everything after the first two axes of weight / input_tensor.
+    kernel_size = weight.shape[2:]
+    dims = input_tensor.shape[2:]
+    if is_transposed:
+        out_channels = groups * weight.shape[1]
+    else:
+        out_channels = weight.shape[0]
+        if weight.shape[1] * groups != input_tensor.shape[1]:
+            raise RuntimeError("Invalid channel dimensions")
+    # A scalar or length-1 stride/padding/dilation below is repeated once per spatial dimension.
+    ret_shape = [input_tensor.shape[0], out_channels]
+    if isinstance(stride, IntLike):
+        stride = [stride] * len(dims)
+    elif len(stride) == 1:
+        stride = [stride[0]] * len(dims)
+
+    if isinstance(padding, IntLike):
+        padding = [padding] * len(dims)
+    elif len(padding) == 1:
+        padding = [padding[0]] * len(dims)
+
+    if isinstance(dilation, IntLike):
+        dilation = [dilation] * len(dims)
+    elif len(dilation) == 1:
+        dilation = [dilation[0]] * len(dims)
+    # NOTE(review): a falsy output_padding (None, 0, empty list) leaves output_padding_list as None.
+    output_padding_list: Optional[List[int]] = None
+    if output_padding:
+        if isinstance(output_padding, IntLike):
+            output_padding_list = [output_padding] * len(dims)
+        elif len(output_padding) == 1:
+            output_padding_list = [output_padding[0]] * len(dims)
+        else:
+            output_padding_list = output_padding
+    # NOTE(review): the formula is picked by output_padding_list, not by is_transposed -- confirm intended.
+    for i in range(len(dims)):
+        # If output_padding is present, we are dealing with a transposed convolution
+        if output_padding_list:
+            ret_shape.append(
+                _formula_transposed(
+                    dims[i],
+                    padding[i],
+                    dilation[i],
+                    kernel_size[i],
+                    stride[i],
+                    output_padding_list[i],
+                )
+            )
+        else:
+            ret_shape.append(
+                _formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i])
+            )
+
+    return ret_shape
+
+
+def is_channels_last(ten):  # True when the suggested memory format of ten is torch.channels_last
+    return torch._prims_common.suggest_memory_format(ten) == torch.channels_last
+
+
+@register_meta(aten.convolution.default)
+def meta_conv(
+    input_tensor: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    stride: List[int],
+    padding: List[int],
+    dilation: List[int],
+    is_transposed: bool,
+    output_padding: List[int],
+    groups: int,
+):
+    def pick_memory_format():
+        if device_hint(input_tensor) == "cuda":  # on "cuda" the weight's layout is consulted too
+            if is_channels_last(input_tensor) or is_channels_last(weight):
+                return torch.channels_last
+        else:
+            if is_channels_last(input_tensor):
+                return torch.channels_last
+        if input_tensor.is_contiguous(memory_format=torch.contiguous_format):
+            return torch.contiguous_format
+        elif input_tensor.is_contiguous(memory_format=torch.preserve_format):
+            return torch.preserve_format
+        # NOTE(review): implicitly returns None when none of the branches above match.
+    shape_out = calc_conv_nd_return_shape(
+        input_tensor,
+        weight,
+        stride,
+        padding,
+        dilation,
+        is_transposed,
+        groups,
+        output_padding if is_transposed else None,
+    )
+    # An input with zero channels yields an output with zero channels.
+    input_channels_dim = 1
+    output_channels_dim = 1
+    if input_tensor.size(input_channels_dim) == 0:
+        shape_out[output_channels_dim] = 0
+
+    out = input_tensor.new_empty(shape_out)
+    out = out.to(memory_format=pick_memory_format())  # type: ignore[call-overload]
+    return out
+
+
+if torch._C._has_mkldnn:
+    _meta_lib_dont_use_me_use_register_meta_for_mkldnn = torch.library.Library(
+        "mkldnn", "IMPL", "Meta"
+    )
+    # Everything in this block is only defined/registered when torch._C._has_mkldnn is truthy.
+    @register_meta(torch.ops.mkldnn._convolution_pointwise.default)
+    def meta_mkldnn_convolution_default(
+        input_tensor,
+        weight,
+        bias,
+        padding,
+        stride,
+        dilation,
+        groups,
+        attr,
+        scalars,
+        algorithm,
+    ):
+        shape_out = calc_conv_nd_return_shape(
+            input_tensor, weight, stride, padding, dilation, False, groups, []
+        )
+        out = input_tensor.new_empty(shape_out)
+        out_memory_format = torch.channels_last  # output is always converted to channels_last
+        out = out.to(memory_format=out_memory_format)  # type: ignore[call-overload]
+        return out
+
+    @register_meta(torch.ops.mkldnn._linear_pointwise.default)
+    def meta_linear_pointwise_default(
+        input_tensor, weight, bias, attr, scalars, algorithm
+    ):
+        return input_tensor.new_empty((*input_tensor.shape[:-1], weight.shape[0]))
+    # The mkl kernel is additionally gated on torch._C.has_mkl.
+    if torch._C.has_mkl:
+        _meta_lib_dont_use_me_use_register_meta_for_mkl = torch.library.Library(
+            "mkl", "IMPL", "Meta"
+        )
+
+        @register_meta(torch.ops.mkl._mkl_linear)
+        def meta_mkl_linear(
+            input_tensor,
+            packed_weight,
+            orig_weight,
+            bias,
+            batch_size,
+        ):
+            return input_tensor.new_empty(  # last extent comes from orig_weight.shape[0]
+                (*input_tensor.shape[:-1], orig_weight.shape[0])
+            )
+
+    _meta_lib_dont_use_me_use_register_meta_for_onednn = torch.library.Library(
+        "onednn", "IMPL", "Meta"
+    )
+
+    @register_meta(torch.ops.onednn.qconv2d_pointwise.default)
+    def meta_qconv2d_pointwise(
+        x,
+        x_scale,
+        x_zp,
+        w,  # prepacked_weight
+        w_scale,
+        w_zp,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        output_scale,
+        output_zero_point,
+        output_dtype,
+        attr,
+        scalars,
+        algorithm,
+    ):
+        shape_out = calc_conv_nd_return_shape(
+            x,
+            w,
+            stride,
+            padding,
+            dilation,
+            False,
+            groups,
+            None,
+        )
+        assert output_dtype in [torch.float32, torch.bfloat16]
+        out = x.new_empty(shape_out, dtype=output_dtype)
+        out = out.to(memory_format=torch.channels_last)
+        return out
+
+    @register_meta(torch.ops.onednn.qlinear_pointwise.default)
+    @register_meta(torch.ops.onednn.qlinear_pointwise.tensor)
+    def meta_qlinear_pointwise(
+        x,
+        x_scale,
+        x_zp,
+        w,
+        w_scale,
+        w_zp,
+        bias,
+        output_scale,
+        output_zero_point,
+        output_dtype,
+        post_op_name,
+        post_op_args,
+        post_op_algorithm,
+    ):
+        output_shape = list(x.shape)
+        # The weight has been transposed during the qlinear weight prepack process.
+        output_shape[-1] = w.shape[1]
+        assert output_dtype in [torch.float32, torch.bfloat16]
+        out = x.new_empty(output_shape, dtype=output_dtype)
+        return out
+
+    _meta_lib_dont_use_me_use_register_meta_for_quantized = torch.library.Library(
+        "quantized", "IMPL", "Meta"
+    )
+
+    @register_meta(torch.ops.quantized.max_pool2d)
+    def meta_quantized_max_pool2d(
+        input,
+        kernel_size,
+        stride=(),
+        padding=(0,),
+        dilation=(1,),
+        ceil_mode=False,
+    ):
+        (
+            nInputPlane,
+            outputHeight,
+            outputWidth,
+        ) = max_pool2d_checks_and_compute_shape(
+            input, kernel_size, stride, padding, dilation, ceil_mode
+        )
+        nbatch = input.size(-4) if input.dim() == 4 else 1  # 1 when input is not 4-D
+        memory_format = torch.channels_last
+        if input.dim() == 3:
+            size = [nInputPlane, outputHeight, outputWidth]
+        else:
+            size = [nbatch, nInputPlane, outputHeight, outputWidth]
+        return torch.empty(
+            size,
+            dtype=input.dtype,
+            device=input.device,
+            memory_format=memory_format,
+        )
+
+
+# from check_dim_size() in aten/src/ATen/TensorUtils.cpp.
+def check_dim_size(tensor, dim, dim_size, size):  # dim: expected rank; dim_size: axis index; size: expected extent
+    torch._check(
+        tensor.dim() == dim and tensor.shape[dim_size] == size,
+        lambda: f"Expected a tensor of dimension {dim} and tensor.size[{dim_size}] == {size}, "
+        + f"but got : dimension {tensor.dim()} and tensor.size[{dim_size}] = {tensor.shape[dim_size]}",
+    )
+
+
+@register_meta(aten.avg_pool2d.default)
+def meta_avg_pool2d(
+    input,
+    kernel_size,
+    stride=(),
+    padding=(0,),
+    ceil_mode=False,
+    count_include_pad=True,
+    divisor_override=None,
+):
+    def unpack(name, val):  # a length-1 val is used for both H and W
+        torch._check(
+            len(val) in [1, 2],
+            lambda: f"avg_pool2d: {name} must either be a single int, or a tuple of two ints",
+        )
+        H = val[0]
+        W = H if len(val) == 1 else val[1]
+        return H, W
+
+    kH, kW = unpack("kernel_size", kernel_size)
+    torch._check(
+        len(stride) in [0, 1, 2],
+        lambda: "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints",
+    )
+    if len(stride) == 0:
+        dH, dW = kH, kW  # an omitted stride defaults to the kernel size
+    elif len(stride) == 1:
+        dH, dW = stride[0], stride[0]
+    else:
+        dH, dW = unpack("stride", stride)
+
+    padH, padW = unpack("padding", padding)
+
+    torch._check(
+        divisor_override is None or divisor_override != 0,
+        lambda: "divisor must be not zero",
+    )
+    # nbatch falls back to 1 when input is not 4-D.
+    nbatch = input.size(-4) if input.dim() == 4 else 1
+    nInputPlane = input.size(-3)
+    inputHeight = input.size(-2)
+    inputWidth = input.size(-1)
+
+    outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode)
+    outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode)
+
+    memory_format = utils.suggest_memory_format(input)
+    pool2d_shape_check(
+        input,
+        kH,
+        kW,
+        dH,
+        dW,
+        padH,
+        padW,
+        1,
+        1,
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        outputHeight,
+        outputWidth,
+        memory_format,
+    )
+    # The batch axis is only emitted when input is not 3-D.
+    if input.dim() == 3:
+        size = [nInputPlane, outputHeight, outputWidth]
+    else:
+        size = [nbatch, nInputPlane, outputHeight, outputWidth]
+    return torch.empty(
+        size,
+        dtype=input.dtype,
+        device=input.device,
+        memory_format=memory_format,
+    )
+
+
+# from avg_pool2d_backward_shape_check() in aten/src/ATen/native/Pool.h.
+def avg_pool2d_backward_shape_check(
+    input,
+    gradOutput,
+    nbatch,
+    kH,
+    kW,
+    dH,
+    dW,
+    padH,
+    padW,
+    nInputPlane,
+    inputHeight,
+    inputWidth,
+    outputHeight,
+    outputWidth,
+    mem_format,
+):
+    pool2d_shape_check(
+        input,
+        kH,
+        kW,
+        dH,
+        dW,
+        padH,
+        padW,
+        1,
+        1,
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        outputHeight,
+        outputWidth,
+        mem_format,
+    )
+
+    ndim = input.dim()
+    nOutputPlane = nInputPlane
+    # gradOutput must have input's rank and the expected extents on its last three axes.
+    check_dim_size(gradOutput, ndim, ndim - 3, nOutputPlane)
+    check_dim_size(gradOutput, ndim, ndim - 2, outputHeight)
+    check_dim_size(gradOutput, ndim, ndim - 1, outputWidth)
+
+
+# Don't override the C++ registration.
+@register_meta(aten.avg_pool2d_backward.default)
+def meta_avg_pool2d_backward(
+    gradOutput_,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    ceil_mode,
+    count_include_pad,
+    divisor_override,
+):
+    # From aten/src/ATen/native/AveragePool2d.cpp structured kernel meta func.
+    torch._check(
+        len(kernel_size) == 1 or len(kernel_size) == 2,
+        lambda: "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints",
+    )
+    kH = kernel_size[0]
+    kW = kH if len(kernel_size) == 1 else kernel_size[1]
+    torch._check(
+        len(stride) == 0 or len(stride) == 1 or len(stride) == 2,
+        lambda: "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints",
+    )
+    dH = kH if len(stride) == 0 else stride[0]  # an omitted stride defaults to the kernel size
+    dW = kW if len(stride) == 0 else dH if len(stride) == 1 else stride[1]
+    torch._check(
+        len(padding) == 1 or len(padding) == 2,
+        lambda: "avg_pool2d: padding must either be a single int, or a tuple of two ints",
+    )
+    padH = padding[0]
+    padW = padH if len(padding) == 1 else padding[1]
+
+    torch._check(
+        divisor_override is None or divisor_override != 0,
+        lambda: "divisor must be not zero",
+    )
+    # nbatch falls back to 1 when input is not 4-D.
+    input_size = input.shape
+    nbatch = input_size[-4] if input.dim() == 4 else 1
+    nInputPlane = input_size[-3]
+    inputHeight = input_size[-2]
+    inputWidth = input_size[-1]
+
+    outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode)
+    outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode)
+
+    mem_format = utils.suggest_memory_format(input)
+
+    avg_pool2d_backward_shape_check(
+        input,
+        gradOutput_,
+        nbatch,
+        kH,
+        kW,
+        dH,
+        dW,
+        padH,
+        padW,
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        outputHeight,
+        outputWidth,
+        mem_format,
+    )
+    # The result takes input's shape, dtype and device, in input's suggested memory format.
+    return torch.empty(
+        input_size,
+        dtype=input.dtype,
+        device=input.device,
+        memory_format=mem_format,
+    )
+
+
+@register_meta(aten.avg_pool3d)
+@out_wrapper()
+def meta_avg_pool3d(
+    input,
+    kernel_size,
+    stride=(),
+    padding=(0,),
+    ceil_mode=False,
+    count_include_pad=True,
+    divisor_override=None,
+):
+    """Meta kernel for avg_pool3d: validates the arguments and returns an empty output tensor."""
+    torch._check(
+        len(kernel_size) in (1, 3),
+        lambda: "avg_pool3d: kernel_size must be a single int, or a tuple of three ints",
+    )
+    kT = kernel_size[0]
+    kH = kT if len(kernel_size) == 1 else kernel_size[1]
+    kW = kT if len(kernel_size) == 1 else kernel_size[2]
+
+    torch._check(
+        not stride or len(stride) in (1, 3),
+        lambda: "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints",
+    )
+    dT = kT if not stride else stride[0]
+    dH = kH if not stride else (dT if len(stride) == 1 else stride[1])
+    dW = kW if not stride else (dT if len(stride) == 1 else stride[2])
+
+    torch._check(
+        len(padding) in (1, 3),
+        lambda: "avg_pool3d: padding must be a single int, or a tuple of three ints",
+    )
+    padT = padding[0]
+    padH = padT if len(padding) == 1 else padding[1]
+    padW = padT if len(padding) == 1 else padding[2]
+
+    torch._check(
+        input.ndim in (4, 5),
+        lambda: "non-empty 4D or 5D (batch mode) tensor expected for input",
+    )
+    # Compare against None explicitly: a falsy test would let divisor_override == 0 through.
+    torch._check(
+        divisor_override is None or divisor_override != 0,
+        lambda: "divisor must be not zero",
+    )
+
+    nbatch = input.size(0)
+    nslices = input.size(-4)
+    itime = input.size(-3)
+    iheight = input.size(-2)
+    iwidth = input.size(-1)
+
+    otime = pooling_output_shape(itime, kT, padT, dT, 1, ceil_mode)
+    oheight = pooling_output_shape(iheight, kH, padH, dH, 1, ceil_mode)
+    owidth = pooling_output_shape(iwidth, kW, padW, dW, 1, ceil_mode)
+
+    pool3d_shape_check(
+        input,
+        nslices,
+        kT,
+        kH,
+        kW,
+        dT,
+        dH,
+        dW,
+        padT,
+        padH,
+        padW,
+        1,
+        1,
+        1,
+        itime,
+        iheight,
+        iwidth,
+        otime,
+        oheight,
+        owidth,
+        "avg_pool3d()",
+        check_input_size=True,
+    )
+
+    if input.ndim == 4:  # 4-D input has no leading batch axis in the result
+        return input.new_empty((nslices, otime, oheight, owidth))
+    return input.new_empty((nbatch, nslices, otime, oheight, owidth))
+
+
+@register_meta(aten.avg_pool3d_backward)
+@out_wrapper("grad_input")
+def meta_avg_pool3d_backward(
+    grad_output,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    ceil_mode,
+    count_include_pad,
+    divisor_override,
+):
+    """Meta kernel for avg_pool3d_backward: validates the arguments, returns an empty tensor shaped like input."""
+    torch._check(
+        len(kernel_size) in (1, 3),
+        lambda: "avg_pool3d: kernel_size must be a single int, or a tuple of three ints",
+    )
+    kT = kernel_size[0]
+    kH = kT if len(kernel_size) == 1 else kernel_size[1]
+    kW = kT if len(kernel_size) == 1 else kernel_size[2]
+
+    torch._check(
+        not stride or len(stride) in (1, 3),
+        lambda: "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints",
+    )
+    dT = kT if not stride else stride[0]
+    dH = kH if not stride else (dT if len(stride) == 1 else stride[1])
+    dW = kW if not stride else (dT if len(stride) == 1 else stride[2])
+
+    torch._check(
+        len(padding) in (1, 3),
+        lambda: "avg_pool3d: padding must be a single int, or a tuple of three ints",
+    )
+    padT = padding[0]
+    padH = padT if len(padding) == 1 else padding[1]
+    padW = padT if len(padding) == 1 else padding[2]
+
+    torch._check(
+        input.ndim in (4, 5),
+        lambda: "non-empty 4D or 5D (batch mode) tensor expected for input",
+    )
+    # Compare against None explicitly: a falsy test would let divisor_override == 0 through.
+    torch._check(
+        divisor_override is None or divisor_override != 0,
+        lambda: "divisor must be not zero",
+    )
+
+    nslices = input.size(-4)
+    itime = input.size(-3)
+    iheight = input.size(-2)
+    iwidth = input.size(-1)
+
+    otime_for_shape_check = pooling_output_shape(itime, kT, padT, dT, 1, ceil_mode)
+    oheight_for_shape_check = pooling_output_shape(iheight, kH, padH, dH, 1, ceil_mode)
+    owidth_for_shape_check = pooling_output_shape(iwidth, kW, padW, dW, 1, ceil_mode)
+
+    avg_pool3d_backward_shape_check(
+        input,
+        grad_output,
+        nslices,
+        kT,
+        kH,
+        kW,
+        dT,
+        dH,
+        dW,
+        padT,
+        padH,
+        padW,
+        itime,
+        iheight,
+        iwidth,
+        otime_for_shape_check,
+        oheight_for_shape_check,
+        owidth_for_shape_check,
+        "avg_pool3d_backward()",
+    )
+    return input.new_empty(input.shape)
+
+
+@register_meta(aten._adaptive_avg_pool2d.default)
+def meta_adaptive_avg_pool2d(self, output_size):
+    torch._check(
+        self.ndim == 3 or self.ndim == 4,
+        lambda: f"Expected 3D or 4D tensor, but got {self.shape}",
+    )
+    output_shape = self.shape[:-2] + tuple(output_size)  # leading dims of self + output_size
+    memory_format = utils.suggest_memory_format(self)
+    # need to set memory_format to preserve the memory format of the input
+    # channel last input should have channel last output
+    return torch.empty(
+        output_shape,
+        dtype=self.dtype,
+        device=self.device,
+        memory_format=memory_format,
+    )
+
+
+@register_meta(aten._adaptive_avg_pool3d.default)
+def meta_adaptive_avg_pool3d(self, output_size):
+    torch._check(
+        self.ndim == 4 or self.ndim == 5,
+        lambda: f"Expected 4D or 5D tensor, but got {self.shape}",
+    )
+    return self.new_empty(self.shape[:-3] + tuple(output_size))  # last three dims replaced by output_size
+
+
+@register_meta(aten._adaptive_avg_pool2d_backward.default)
+def meta__adaptive_avg_pool2d_backward(grad_out, self):
+    """Validate grad_out against self and return an empty tensor shaped like self."""
+    ndim = grad_out.ndim
+    for i in range(1, ndim):
+        torch._check(
+            grad_out.size(i) > 0,
+            # Adjacent literals: a backslash continuation would embed the indentation in the message.
+            lambda: "adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero "
+            f"size for non-batch dimensions, {grad_out.shape} with dimension {i} being empty",
+        )
+    torch._check(
+        ndim == 3 or ndim == 4,
+        lambda: f"adaptive_avg_pool2d_backward(): Expected 3D or 4D tensor, but got {self.shape}",
+    )
+    torch._check(
+        self.dtype == grad_out.dtype,
+        lambda: f"expected dtype {self.dtype} for `grad_output` but got dtype {grad_out.dtype}",
+    )
+    memory_format = torch.channels_last if is_channels_last(self) else torch.contiguous_format
+    return self.new_empty(self.shape).to(memory_format=memory_format)
+
+
+@register_meta(aten._adaptive_avg_pool3d_backward)
+@out_wrapper("grad_input")
+def meta__adaptive_avg_pool3d_backward(grad_output, self):
+    _adaptive_pool_empty_output_check(grad_output, "adaptive_avg_pool3d_backward")
+    return torch.empty_like(self, memory_format=torch.legacy_contiguous_format)  # result mirrors self
+
+
+def _adaptive_pool_empty_output_check(grad_output: Tensor, arg_name: str):
+    ndim = grad_output.ndim
+    for i in range(1, ndim):  # dimension 0 is exempt from the non-empty check
+        torch._check(
+            grad_output.size(i) > 0,
+            lambda: (
+                f"{arg_name}(): Expected grad_output to have non-zero size for non-batch dimensions, "
+                f"but grad_output has sizes {grad_output.shape} with dimension {i} being empty"
+            ),
+        )
+
+
+@register_meta(aten.adaptive_max_pool2d)
+@out_wrapper("out", "indices")
+def meta_adaptive_max_pool2d(input, output_size):
+    ndim = input.ndim
+    torch._check(
+        ndim in (3, 4),
+        lambda: f"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: {input.shape}",
+    )
+    for i in range(1, ndim):  # dimension 0 is exempt from the non-empty check
+        torch._check(
+            input.size(i) > 0,
+            lambda: (
+                f"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
+                f"but input has sizes {input.shape} with dimension {i} being empty"
+            ),
+        )
+
+    torch._check(
+        len(output_size) == 2,
+        lambda: "adaptive_max_pool2d(): internal error: output_size.size() must be 2",
+    )
+    # dimH ends up as 1 for 3-D input and 2 for 4-D input.
+    dimH = 1
+    sizeB = 1
+    sizeD = 0
+
+    if input.ndim == 4:
+        sizeB = input.size(0)
+        dimH += 1
+    # sizeD is read from the axis immediately before dimH.
+    sizeD = input.size(dimH - 1)
+    osizeH, osizeW = output_size
+
+    if input.ndim == 3:
+        out_shape = (sizeD, osizeH, osizeW)
+        out = input.new_empty(out_shape)
+        indices = input.new_empty(out_shape, dtype=torch.int64)
+        return out, indices
+    else:
+        out_shape = (sizeB, sizeD, osizeH, osizeW)  # type: ignore[assignment]
+        memory_format = utils.suggest_memory_format(input)
+        out = input.new_empty(out_shape).to(memory_format=memory_format)
+        indices = input.new_empty(out_shape, dtype=torch.int64).to(
+            memory_format=memory_format
+        )
+        return out, indices
+
+
+@register_meta(aten.adaptive_max_pool2d_backward)
+@out_wrapper("grad_input")
+def meta_adaptive_max_pool2d_backward(grad_output, input, indices):
+    ndim = grad_output.ndim
+    torch._check(
+        ndim in (3, 4),
+        lambda: f"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: {grad_output.shape}",
+    )
+
+    _adaptive_pool_empty_output_check(grad_output, "adaptive_max_pool2d_backward")
+
+    torch._check(
+        input.dtype == grad_output.dtype,
+        lambda: f"expected dtype {input.dtype} for `grad_output` but got dtype {grad_output.dtype}",
+    )
+    # The result takes input's shape and input's suggested memory format; indices is not inspected.
+    memory_format = utils.suggest_memory_format(input)
+    return input.new_empty(input.shape).to(memory_format=memory_format)
+
+
+@register_meta(aten.adaptive_max_pool3d)
+@out_wrapper("out", "indices")
+def meta_adaptive_max_pool3d(input, output_size):
+    ndim = input.ndim
+    torch._check(
+        ndim in (4, 5),
+        lambda: f"adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: {input.shape}",
+    )
+    for i in range(1, ndim):  # dimension 0 is exempt from the non-empty check
+        torch._check(
+            input.size(i) > 0,
+            lambda: (
+                f"adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, "
+                f"but input has sizes {input.shape} with dimension {i} being empty"
+            ),
+        )
+
+    torch._check(
+        len(output_size) == 3,
+        lambda: "adaptive_max_pool3d(): internal error: output_size.size() must be 3",
+    )
+    # dimD indexes the axis read into sizeD: 0 for 4-D input, 1 for 5-D input.
+    dimD = 0
+    sizeB = 1
+    sizeD = 0
+
+    if ndim == 5:
+        sizeB = input.size(0)
+        dimD += 1
+
+    sizeD = input.size(dimD)
+    osizeT, osizeH, osizeW = output_size
+
+    if ndim == 4:
+        out_shape = (sizeD, osizeT, osizeH, osizeW)
+    else:
+        out_shape = (sizeB, sizeD, osizeT, osizeH, osizeW)  # type: ignore[assignment]
+    # out and indices share out_shape; indices is allocated as int64.
+    out = input.new_empty(out_shape)
+    indices = input.new_empty(out_shape, dtype=torch.int64)
+
+    return out, indices
+
+
+@register_meta(aten.adaptive_max_pool3d_backward)
+@out_wrapper("grad_input")
+def meta_adaptive_max_pool3d_backward(grad_output, input, indices):  # indices is not inspected here
+    _adaptive_pool_empty_output_check(grad_output, "adaptive_max_pool3d_backward")
+    return input.new_empty(input.shape)  # result has input's shape
+
+
+@register_meta(aten.repeat_interleave.Tensor)
+def meta_repeat_interleave_Tensor(repeats, output_size=None):
+    if output_size is None:  # without output_size there is nothing to size the result from
+        raise RuntimeError("cannot repeat_interleave a meta tensor without output_size")
+    return repeats.new_empty(output_size)
+
+
+@register_meta([aten.complex.default, aten.complex.out])
+@out_wrapper()
+def meta_complex(real, imag):
+    assert real.dtype.is_floating_point
+    assert imag.dtype.is_floating_point
+    out_shape = _broadcast_shapes(real.shape, imag.shape)  # combined shape of real and imag
+    return real.new_empty(out_shape, dtype=corresponding_complex_dtype(real.dtype))  # dtype derived from real only
+
+
+@register_meta([aten.nonzero_static.default, aten.nonzero_static.out])
+@out_wrapper()
+def nonzero_static(self, *, size: int, fill_value: int = -1):  # `fill_value` is not used here
+    return self.new_empty((size, self.dim()), dtype=torch.long)  # empty int64 of shape (size, ndim)
+
+
+@register_meta([aten.index.Tensor, aten._unsafe_index.Tensor])
+def meta_index_Tensor(self, indices):
+    """Meta kernel for advanced indexing: returns an empty tensor shaped like `self[indices]`."""
+    torch._check(bool(indices), lambda: "at least one index must be provided")
+    # aten::index is the internal advanced indexing implementation (checkIndexTensorTypes and expandTensors)
+    result: List[Optional[Tensor]] = []
+    for i, index in enumerate(indices):
+        if index is not None:
+            torch._check(
+                index.dtype in [torch.long, torch.int, torch.int8, torch.uint8, torch.bool],
+                lambda: "tensors used as indices must be long, int, byte or bool tensors",
+            )
+            if index.dtype in [torch.int8, torch.uint8, torch.bool]:  # "byte" is uint8; int8 kept for compat
+                nonzero = index.nonzero()
+                k = len(result)
+                torch._check_index(
+                    k + index.ndim <= self.ndim,
+                    lambda: f"too many indices for tensor of dimension {self.ndim}",
+                )
+                for j in range(index.ndim):
+                    torch._check_index(
+                        index.shape[j] == self.shape[k + j],
+                        lambda: f"The shape of the mask {index.shape} at index {i} "
+                        f"does not match the shape of the indexed tensor {self.shape} at index {k + j}",
+                    )
+                    result.append(nonzero.select(1, j))
+            else:
+                result.append(index)
+        else:
+            result.append(index)
+    indices = result
+    torch._check(
+        len(indices) <= self.ndim,
+        lambda: f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})",
+    )
+    # expand_outplace
+    import torch._refs as refs  # avoid import cycle in mypy
+
+    indices = list(refs._maybe_broadcast(*indices))
+    # add missing null tensors
+    while len(indices) < self.ndim:
+        indices.append(None)
+
+    # hasContiguousSubspace
+    #   true if all non-null tensors are adjacent
+    # See:
+    # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
+    # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency
+    state = 0
+    has_contiguous_subspace = False
+    for index in indices:
+        if state == 0:
+            if index is not None:
+                state = 1
+        elif state == 1:
+            if index is None:
+                state = 2
+        else:
+            if index is not None:
+                break
+    else:
+        has_contiguous_subspace = True
+
+    # transposeToFront
+    # This is the logic that causes the newly inserted dimensions to show up
+    # at the beginning of the tensor, if they're not contiguous
+    if not has_contiguous_subspace:
+        dims = []
+        transposed_indices = []
+        for i, index in enumerate(indices):
+            if index is not None:
+                dims.append(i)
+                transposed_indices.append(index)
+        for i, index in enumerate(indices):
+            if index is None:
+                dims.append(i)
+                transposed_indices.append(index)
+        self = self.permute(dims)
+        indices = transposed_indices
+
+    # AdvancedIndex::AdvancedIndex
+    # Now we can assume the indices have contiguous subspace
+    # This is simplified from AdvancedIndex which goes to more effort
+    # to put the input and indices in a form so that TensorIterator can
+    # take them.  If we write a ref for this, probably that logic should
+    # get implemented
+    before_shape: List[int] = []
+    after_shape: List[int] = []
+    replacement_shape: List[int] = []
+    for dim, index in enumerate(indices):
+        if index is None:
+            if replacement_shape:
+                after_shape.append(self.shape[dim])
+            else:
+                before_shape.append(self.shape[dim])
+        else:
+            replacement_shape = list(index.shape)
+    return self.new_empty(before_shape + replacement_shape + after_shape)
+
+
+@register_meta([aten.convolution_backward.default])
+def meta_convolution_backward(
+    grad_output_,
+    input_,
+    weight_,
+    bias_sizes_opt,
+    stride,
+    padding,
+    dilation,
+    transposed,
+    output_padding,
+    groups,
+    output_mask,
+):
+    """Meta kernel for `convolution_backward`.
+
+    Returns `(grad_input, grad_weight, grad_bias)`.  Each entry is an empty tensor
+    created from `grad_output_` with the size of `input_`, `weight_` or
+    `bias_sizes_opt` when the matching `output_mask` flag is truthy, else `None`.
+    The remaining convolution parameters are accepted but not consulted.
+    """
+    # High level logic taken from slow_conv3d_backward_cpu which should
+    # be representative of all convolution_backward impls
+    grad_input = grad_output_.new_empty(input_.size()) if output_mask[0] else None
+    grad_weight = grad_output_.new_empty(weight_.size()) if output_mask[1] else None
+    grad_bias = grad_output_.new_empty(bias_sizes_opt) if output_mask[2] else None
+
+    return (grad_input, grad_weight, grad_bias)
+
+
+@register_meta([aten.addbmm.default, aten.addbmm.out])
+@out_wrapper()
+def meta_addbmm(self, batch1, batch2, *, beta=1, alpha=1):
+    """Meta kernel for `addbmm`: returns an empty (batch1.size(1), batch2.size(2)) tensor."""
+    # Check the ranks first: the size(1)/size(2) reads below raise an opaque IndexError otherwise.
+    torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor")
+    torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor")
+    dim1 = batch1.size(1)
+    dim2 = batch2.size(2)
+    self = self.expand((dim1, dim2))
+    torch._check(
+        batch1.size(0) == batch2.size(0),
+        lambda: f"batch1 and batch2 must have same number of batches, got {batch1.size(0)} and {batch2.size(0)}",
+    )
+    torch._check(
+        batch1.size(2) == batch2.size(1),
+        lambda: f"Incompatible matrix sizes for bmm ({batch1.size(1)}x{batch1.size(2)} "
+        f"and {batch2.size(1)}x{batch2.size(2)})",
+    )
+    torch._check(
+        self.size(0) == dim1 and self.size(1) == dim2,
+        lambda: "self tensor does not match matmul output shape",
+    )
+    return self.new_empty(self.size())
+
+
+def register_meta_foreach(ops):
+    """Decorator factory registering `fn` in `meta_table` for every op in `ops`.
+
+    Each registration binds `_scalar_op` to the `aten` attribute whose name is the
+    foreach op's name with "_foreach_" removed.  The decorated function itself is
+    returned unchanged.
+    """
+
+    def decorate(fn):
+        def bind_one(foreach_op):
+            # str(op) is dotted; its second component is used as the op's name.
+            base_name = str(foreach_op).split(".")[1].replace("_foreach_", "")
+            bound_fn = partial(fn, _scalar_op=getattr(aten, base_name))
+            _add_op_to_registry(meta_table, foreach_op, bound_fn)
+
+        pytree.tree_map_(bind_one, ops)
+        return fn
+
+    return decorate
+
+
+@register_meta_foreach(
+    [
+        aten._foreach_abs,
+        aten._foreach_acos,
+        aten._foreach_asin,
+        aten._foreach_atan,
+        aten._foreach_ceil,
+        aten._foreach_cos,
+        aten._foreach_cosh,
+        aten._foreach_erf,
+        aten._foreach_erfc,
+        aten._foreach_exp,
+        aten._foreach_expm1,
+        aten._foreach_frac,
+        aten._foreach_floor,
+        aten._foreach_lgamma,
+        aten._foreach_log,
+        aten._foreach_log10,
+        aten._foreach_log1p,
+        aten._foreach_log2,
+        aten._foreach_neg,
+        aten._foreach_norm,
+        aten._foreach_reciprocal,
+        aten._foreach_round,
+        aten._foreach_sigmoid,
+        aten._foreach_sign,
+        aten._foreach_sin,
+        aten._foreach_sinh,
+        aten._foreach_sqrt,
+        aten._foreach_tan,
+        aten._foreach_tanh,
+        aten._foreach_trunc,
+        aten._foreach_zero,
+        aten._foreach_add,
+        aten._foreach_sub,
+        aten._foreach_mul,
+        aten._foreach_div,
+        aten._foreach_clamp_min,
+        aten._foreach_clamp_max,
+        aten._foreach_lerp,
+    ],
+)
+def _meta_foreach_out_of_place(*args, _scalar_op=None, **kwargs):
+    """Shared meta implementation for out-of-place `_foreach_*` ops.
+
+    `args[0]` must be a non-empty list.  Following arguments are scanned in order:
+    lists must have the same length as `args[0]`, `Tensor` arguments must be 0-dim
+    single-element tensors, and the scan stops at the first other value.  `_scalar_op`
+    is then called once per position with the per-list elements followed by the
+    arguments after the counted lists; the list of its results is returned.
+    """
+    first = args[0]
+    torch._check(
+        isinstance(first, list),
+        lambda: f"The first argument must be List[Tensor], but got {type(first)}.",
+    )
+    nelem = len(first)
+    torch._check(nelem > 0, lambda: "Tensor list must have at least one tensor.")
+
+    nlists = 1
+    for position, arg in enumerate(args[1:], start=2):
+        if isinstance(arg, list):
+            nlists += 1
+            torch._check(
+                len(arg) == nelem,
+                lambda: f"self and argument-{position} must match in length, "
+                f"but got {nelem} and {len(arg)}.",
+            )
+        elif isinstance(arg, Tensor):
+            torch._check(
+                arg.dim() == 0 and arg.numel() == 1,
+                lambda: "scalar tensor expected to be 0 dim but it has "
+                f"{arg.dim()} dimensions and {arg.numel()} elements.",
+            )
+        else:
+            break
+
+    trailing_args = args[nlists:]
+    return [
+        _scalar_op(*(args[i][elem] for i in range(nlists)), *trailing_args, **kwargs)
+        for elem in range(nelem)
+    ]
+
+
+@register_meta_foreach(
+    [
+        aten._foreach_abs_,
+        aten._foreach_acos_,
+        aten._foreach_asin_,
+        aten._foreach_atan_,
+        aten._foreach_ceil_,
+        aten._foreach_cos_,
+        aten._foreach_cosh_,
+        aten._foreach_erf_,
+        aten._foreach_erfc_,
+        aten._foreach_exp_,
+        aten._foreach_expm1_,
+        aten._foreach_frac_,
+        aten._foreach_floor_,
+        aten._foreach_lgamma_,
+        aten._foreach_log_,
+        aten._foreach_log10_,
+        aten._foreach_log1p_,
+        aten._foreach_log2_,
+        aten._foreach_neg_,
+        aten._foreach_reciprocal_,
+        aten._foreach_round_,
+        aten._foreach_sigmoid_,
+        aten._foreach_sign_,
+        aten._foreach_sin_,
+        aten._foreach_sinh_,
+        aten._foreach_sqrt_,
+        aten._foreach_tan_,
+        aten._foreach_tanh_,
+        aten._foreach_trunc_,
+        aten._foreach_zero_,
+        aten._foreach_add_,
+        aten._foreach_sub_,
+        aten._foreach_mul_,
+        aten._foreach_div_,
+        aten._foreach_clamp_min_,
+        aten._foreach_clamp_max_,
+        aten._foreach_lerp_,
+        aten._foreach_copy_,
+    ]
+)
+def _meta_foreach_inplace(*args, _scalar_op=None, **kwargs):
+    # Reuse the out-of-place checks; the computed list is discarded and None is returned.
+    _meta_foreach_out_of_place(*args, _scalar_op=_scalar_op, **kwargs)
+
+
+@register_meta([aten._foreach_pow.ScalarAndTensor])
+def meta__foreach_pow_scalar_and_tensor(self, exponent):
+    """Return one `torch.empty_like` tensor per entry of `exponent`; `self` is not inspected."""
+    # Only foreach_pow has a ScalarAndTensor method and needs special
+    # handling because it does not work with _meta_foreach_out_of_place.
+    is_tensor_list = isinstance(exponent, List)
+    torch._check(is_tensor_list, lambda: f"exponent must be a tensor list but got {type(exponent)}")
+    outputs = list(map(torch.empty_like, exponent))
+    return outputs
+
+
+def _check_foreach_binop_tensor_lists(self, other):
+    """Validate that `self` and `other` are non-empty lists of equal length.
+
+    Raises through `torch._check` when either argument is not a `List` or when the
+    lengths are zero or differ.
+    """
+    torch._check(
+        isinstance(self, List) and isinstance(other, List),
+        lambda: "The first two arguments must be List[Tensor], "
+        f"but got {type(self)} and {type(other)}.",
+    )
+    torch._check(
+        len(self) > 0 and len(self) == len(other),
+        lambda: f"self and other must be non-empty and match in length, but got {len(self)} and {len(other)}.",
+    )
+
+
+@register_meta([aten._foreach_maximum, aten._foreach_minimum])
+def meta__foreach_binop_scalar(*args):
+    """Meta kernel shared by `_foreach_maximum` and `_foreach_minimum`.
+
+    Both ops are routed through `_meta_foreach_out_of_place` with `aten.clamp_min`
+    as the per-element op.
+    """
+    # aten.maximum(Tensor, Scalar) does not exist.
+    return _meta_foreach_out_of_place(*args, _scalar_op=aten.clamp_min)
+
+
+@register_meta([aten._foreach_maximum_, aten._foreach_minimum_])
+def meta__foreach_binop__scalar(*args):
+    """Meta kernel shared by the in-place `_foreach_maximum_` and `_foreach_minimum_`.
+
+    Both ops are routed through `_meta_foreach_inplace` with `aten.clamp_min_` as the
+    per-element op; nothing is returned.
+    """
+    # aten.maximum(Tensor, Scalar) does not exist
+    _meta_foreach_inplace(*args, _scalar_op=aten.clamp_min_)
+    return None
+
+
+@register_meta([aten._foreach_addcdiv.Scalar, aten._foreach_addcmul.Scalar])
+def meta__foreach_addcop_scalar(self, tensor1, tensor2, scalar=1):
+    """Meta kernel for out-of-place `_foreach_addcdiv` / `_foreach_addcmul` (Scalar overloads).
+
+    Checks that `self`, `tensor1` and `tensor2` are all lists, that `self` is not
+    empty and that the three lists have the same length.
+
+    Returns:
+        A list with one `torch.empty_like` tensor per entry of `self`.  `scalar` is unused.
+    """
+    # foreach_addcdiv and addcdiv have different signatures and
+    # cannot use _meta_foreach_out_of_place.
+    lists_ok = all(isinstance(l, List) for l in [self, tensor1, tensor2])
+    torch._check(
+        lists_ok,
+        lambda: "All arguments must be List[Tensor], "
+        f"but got {type(self)}, {type(tensor1)}, and {type(tensor2)}",
+    )
+    torch._check(len(self) > 0, lambda: "input tensor list must not be empty.")
+    same_length = len(self) == len(tensor1) and len(self) == len(tensor2)
+    torch._check(same_length, lambda: "All input tensor lists must have the same length")
+
+    return [torch.empty_like(s) for s in self]
+
+
+@register_meta([aten._foreach_addcdiv_.Tensor, aten._foreach_addcmul_.Tensor])
+def meta__foreach_addcop_tensor(self, tensor1, tensor2, scalars):
+    """Meta kernel for in-place `_foreach_addcdiv_` / `_foreach_addcmul_` (Tensor overloads).
+
+    Only validates: three equally long lists (`self` non-empty) plus a `torch.Tensor` of scalars.
+    """
+    lists_ok = all(isinstance(l, List) for l in [self, tensor1, tensor2])
+    torch._check(
+        lists_ok and isinstance(scalars, torch.Tensor),
+        lambda: "_foreach_addc*_ op expects arguments of type: List[Tensor], List[Tensor], List[Tensor], tensor, "
+        f"but got: {type(self)}, {type(tensor1)}, {type(tensor2)}, and {type(scalars)}",
+    )
+    torch._check(len(self) > 0, lambda: "input tensor list must not be empty.")
+    same_length = len(self) == len(tensor1) and len(self) == len(tensor2)
+    torch._check(same_length, lambda: "All input tensor lists must have the same length")
+
+
+@register_meta([aten._foreach_addcdiv_.Scalar, aten._foreach_addcmul_.Scalar])
+def meta__foreach_addcop__scalar(self, tensor1, tensor2, scalar=1):
+    """Meta kernel for in-place `_foreach_addcdiv_` / `_foreach_addcmul_` (Scalar overloads).
+
+    Only performs validation:
+      * `self`, `tensor1` and `tensor2` must all be lists;
+      * `self` must not be empty;
+      * all three lists must have the same length.
+    Nothing is returned and `scalar` is unused.
+    """
+    lists_ok = all(isinstance(l, List) for l in [self, tensor1, tensor2])
+    torch._check(
+        lists_ok,
+        lambda: "All arguments of _foreach_addc*_ must be List[Tensor], "
+        f"but got {type(self)}, {type(tensor1)}, and {type(tensor2)}",
+    )
+    torch._check(len(self) > 0, lambda: "input tensor list must not be empty.")
+    same_length = len(self) == len(tensor1) and len(self) == len(tensor2)
+    torch._check(same_length, lambda: "All input tensor lists must have the same length")
+
+
+@register_meta([aten._fused_adam_.default])
+def meta__fused_adam_(
+    self,
+    grads,
+    exp_avgs,
+    exp_avg_sqs,
+    max_exp_avg_sqs,
+    state_steps,
+    *,
+    lr,
+    beta1,
+    beta2,
+    weight_decay,
+    eps,
+    amsgrad,
+    maximize,
+    grad_scale=None,
+    found_inf=None,
+):
+    # In-place variant: validates that every positional argument is a list; returns None.
+    for l in [self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]:
+        torch._check(
+            isinstance(l, List), lambda: f"expected a tensor list but got {type(l)}"
+        )
+
+
+@register_meta([aten._fused_adam.default])
+def meta__fused_adam(
+    self,
+    grads,
+    exp_avgs,
+    exp_avg_sqs,
+    max_exp_avg_sqs,
+    state_steps,
+    *,
+    lr,
+    beta1,
+    beta2,
+    weight_decay,
+    eps,
+    amsgrad,
+    maximize,
+    grad_scale=None,
+    found_inf=None,
+):
+    """Meta kernel for `_fused_adam`: empty copies of every input list except `state_steps`."""
+    for l in [self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]:
+        torch._check(
+            isinstance(l, List), lambda: f"expected a tensor list but got {type(l)}"
+        )
+
+    def empty_like_list(tensor_list):
+        return [torch.empty_like(t) for t in tensor_list]
+
+    return (
+        empty_like_list(self),
+        empty_like_list(grads),
+        empty_like_list(exp_avgs),
+        empty_like_list(exp_avg_sqs),
+        empty_like_list(max_exp_avg_sqs),
+    )
+
+
+@register_meta([aten._int_mm])
+@out_wrapper()
+def meta__int_mm(a, b):
+    """Meta kernel for `_int_mm`.
+
+    Requires two 2D int8 tensors whose inner dimensions agree (`a.size(1) ==
+    b.size(0)`); violations are reported through `torch._check`.
+
+    Returns an empty int32 tensor of shape `(a.size(0), b.size(1))`.
+    """
+    torch._check(a.dim() == 2, lambda: "a must be a 2D tensor")
+    torch._check(b.dim() == 2, lambda: "b must be a 2D tensor")
+    torch._check(a.dtype is torch.int8, lambda: f"expected self to be int8, got {a.dtype}")
+    torch._check(b.dtype is torch.int8, lambda: f"expected mat2 to be int8, got {b.dtype}")
+    torch._check(
+        a.size(1) == b.size(0),
+        lambda: f"Incompatible matrix sizes for _int_mm ({a.size(0)}x{a.size(1)} "
+        f"and {b.size(0)}x{b.size(1)})",
+    )
+    out_shape = (a.size(0), b.size(1))
+    return a.new_empty(out_shape, dtype=torch.int32)
+
+
+@register_meta([aten._convert_weight_to_int4pack])
+def meta__convert_weight_to_int4pack(w, inner_k_tiles):
+    """Meta kernel for `_convert_weight_to_int4pack`.
+
+    `w` must be a 2D int32 tensor of shape (n, k).  The result is an empty int32
+    tensor of shape (n // 8, k // (inner_k_tiles * 16), 32, inner_k_tiles // 2).
+    """
+    torch._check(w.dim() == 2, lambda: "w must be a 2D tensor")
+    torch._check(w.dtype is torch.int32, lambda: f"expected w to be int32, got {w.dtype}")
+
+    n, k = w.size(0), w.size(1)
+    packed_shape = (
+        n // 8,
+        k // (inner_k_tiles * 16),
+        32,
+        inner_k_tiles // 2,
+    )
+    return w.new_empty(packed_shape, dtype=torch.int32)
+
+
+@register_meta([aten._weight_int4pack_mm])
+def meta__weight_int4pack_mm(x, w, q_group_size, q_scale_and_zeros):
+    """Meta kernel for `_weight_int4pack_mm`.
+
+    Expects a 2D bfloat16 `x` and a 4D int32 `w`; returns an empty tensor of shape
+    (x.size(0), w.size(0) * 8) with the dtype of `x`.
+    """
+    torch._check(x.dim() == 2, lambda: "x must be a 2D tensor")
+    torch._check(w.dim() == 4, lambda: "w must be a 4D tensor")
+    torch._check(x.dtype is torch.bfloat16, lambda: f"expected x to be bf16, got {x.dtype}")
+    torch._check(w.dtype is torch.int32, lambda: f"expected w to be int32, got {w.dtype}")
+    out_rows, out_cols = x.size(0), w.size(0) * 8
+    return x.new_empty(out_rows, out_cols, dtype=x.dtype)
+
+
+@register_meta([aten._weight_int8pack_mm])
+def meta__weight_int8pack_mm(x, w, q_scales):
+    """Meta kernel for `_weight_int8pack_mm`.
+
+    Expects a 2D bfloat16 `x` and a 2D int8 `w`; returns an empty tensor of shape
+    (x.size(0), w.size(0)) with the dtype of `x`.  `q_scales` is not inspected.
+    """
+    torch._check(x.dim() == 2, lambda: "x must be a 2D tensor")
+    torch._check(x.dtype is torch.bfloat16, lambda: f"expected x to be bf16, got {x.dtype}")
+    torch._check(w.dim() == 2, lambda: "w must be a 2D tensor")
+    torch._check(w.dtype is torch.int8, lambda: f"expected w to be int8, got {w.dtype}")
+    out_rows, out_cols = x.size(0), w.size(0)
+    return x.new_empty(out_rows, out_cols, dtype=x.dtype)
+
+
+@register_meta(aten._cdist_forward.default)
+def meta_cdist_forward(x1, x2, p, compute_mode):
+    """Meta kernel for `_cdist_forward`.
+
+    Validates the operands (at least 2D, same number of columns, floating-point
+    dtypes, non-negative `p`, `compute_mode` in {None, 1, 2}) and returns an empty
+    tensor shaped `broadcast(batch dims of x1, batch dims of x2) + [r1, r2]`, where
+    `r1` / `r2` are the second-to-last sizes of `x1` / `x2`.
+    """
+    torch._check(
+        x1.dim() >= 2,
+        lambda: f"cdist only supports at least 2D tensors, X1 got: {x1.dim()}D",
+    )
+    torch._check(
+        x2.dim() >= 2,
+        lambda: f"cdist only supports at least 2D tensors, X2 got: {x2.dim()}D",
+    )
+    torch._check(
+        x1.size(-1) == x2.size(-1),
+        lambda: f"X1 and X2 must have the same number of columns. X1: {x1.size(-1)} X2: {x2.size(-1)}",
+    )
+    # The two dtype messages interpolate a value, so they must be f-strings.
+    torch._check(
+        utils.is_float_dtype(x1.dtype),
+        lambda: f"cdist only supports floating-point dtypes, X1 got: {x1.dtype}",
+    )
+    torch._check(
+        utils.is_float_dtype(x2.dtype),
+        lambda: f"cdist only supports floating-point dtypes, X2 got: {x2.dtype}",
+    )
+    torch._check(p >= 0, lambda: "cdist only supports non-negative p values")
+    torch._check(compute_mode in (None, 1, 2), lambda: f"possible modes: None, 1, 2, but was: {compute_mode}")
+    batch_shape = torch.broadcast_shapes(x1.shape[:-2], x2.shape[:-2])
+    return x1.new_empty(list(batch_shape) + [x1.size(-2), x2.size(-2)])
+
+
+@register_meta(aten._cdist_backward)
+@out_wrapper()
+def meta_cdist_backward(grad, x1, x2, p, cdist):
+    """Meta kernel for `_cdist_backward`; the result follows `x1`, expanded to the broadcast batch."""
+    cols1 = x1.shape[-1]
+    rows1, rows2 = x1.shape[-2], x2.shape[-2]
+    batch_dims = list(torch.broadcast_shapes(x1.shape[:-2], x2.shape[:-2]))
+    expanded_x1_shape = batch_dims + [rows1, cols1]
+    batch_numel = math.prod(batch_dims)
+
+    # Any empty extent short-circuits to zeros shaped like the (unexpanded) x1.
+    if rows1 == 0 or rows2 == 0 or cols1 == 0 or batch_numel == 0:
+        return torch.zeros_like(x1)
+    # Otherwise x1 is expanded to the broadcast batch shape when it does not already match.
+    if expanded_x1_shape != list(x1.shape):
+        x1 = x1.expand(expanded_x1_shape)
+    return torch.empty_like(x1, memory_format=torch.contiguous_format)
+
+
+# Meta kernel for aten._embedding_bag; returns (output, offset2bag, bag_size, max_indices).
+# NB: This meta function accepts non-meta arguments!  When this behavior was originally
+# introduced this was accidental, but it is now load bearing as people are using this so that they
+# can conveniently test code involving embeddings (CPU tensor inputs with meta device EmbeddingBag module)
+@register_meta(aten._embedding_bag.default)
+def meta_embedding_bag(
+    weight,
+    indices,
+    offsets,
+    scale_grad_by_freq=False,
+    mode=0,
+    sparse=False,
+    per_sample_weights=None,
+    include_last_offset=False,
+    padding_idx=-1,
+):
+    torch._check(
+        indices.dtype in (torch.long, torch.int),
+        lambda: f"expected indices to be long or int, got {indices.dtype}",
+    )
+    torch._check(
+        offsets.dtype in (torch.long, torch.int),
+        lambda: f"expected offsets to be long or int, got {offsets.dtype}",
+    )
+    torch._check(
+        utils.is_float_dtype(weight.dtype),
+        lambda: f"expected weight to be floating point type, got {weight.dtype}",
+    )
+    # One bag per entry of `offsets`, minus one when include_last_offset is set.
+    num_bags = offsets.size(0)
+    if include_last_offset:
+        torch._check(
+            num_bags >= 1,
+            lambda: "include_last_offset: numBags should be at least 1",
+        )
+        num_bags -= 1
+
+    output = weight.new_empty(num_bags, weight.size(1))  # (num_bags, weight.size(1)), weight's dtype
+    MODE_SUM, MODE_MEAN, MODE_MAX = range(3)  # mode codes 0, 1, 2
+    # Optional per-sample weights: sum mode only, same dtype as weight, 1D, one per index.
+    if per_sample_weights is not None:
+        torch._check(
+            mode == MODE_SUM,
+            lambda: "embedding_bag: per_sample_weights only supported with mode='sum'",
+        )
+        torch._check(
+            per_sample_weights.dtype == weight.dtype,
+            lambda: f"expected weight ({weight.dtype}) and per_sample_weights ({per_sample_weights.dtype}) to have same dtype",
+        )
+        torch._check(
+            per_sample_weights.ndim == 1,
+            lambda: f"expected per_sample_weights to be 1D tensor, got {per_sample_weights.ndim}D",
+        )
+        torch._check(
+            per_sample_weights.numel() == indices.numel(),
+            lambda: (
+                f"expected per_sample_weights.numel() ({per_sample_weights.numel()} "
+                f"to be the same as indices.numel() ({indices.numel()})"
+            ),
+        )
+
+    def is_fast_path_index_select_scale(src, scale, output, padding_idx):
+        return (
+            is_fast_path_index_select(src, output, padding_idx) and scale.stride(0) == 1
+        )
+
+    def is_fast_path_index_select(src, output, padding_idx):
+        return (
+            (src.dtype == torch.float or src.dtype == torch.half)
+            and src.stride(1) == 1
+            and output.stride(1) == 1
+            and padding_idx < 0
+        )
+
+    def is_fast_path(src, scale, output, padding_idx):
+        if scale is not None:
+            return is_fast_path_index_select_scale(src, scale, output, padding_idx)
+        else:
+            return is_fast_path_index_select(src, output, padding_idx)
+    # The auxiliary outputs are allocated differently for non-CPU and CPU `offsets`.
+    if device_hint(offsets) != "cpu":
+        offset2bag = indices.new_empty(indices.size(0))
+        bag_size = indices.new_empty(offsets.size())
+        if mode == MODE_MAX:
+            max_indices = indices.new_empty(num_bags, weight.size(1))
+        else:
+            max_indices = indices.new_empty(0)
+    else:
+        fast_path_sum = is_fast_path(weight, per_sample_weights, output, padding_idx)
+        if mode in (MODE_MEAN, MODE_MAX) or not fast_path_sum:
+            offset2bag = offsets.new_empty(indices.size(0))
+        else:
+            offset2bag = offsets.new_empty(0)  # fast-path sum: zero-length offset2bag
+        bag_size = offsets.new_empty(num_bags)
+        # This part of the logic comes from make_max_indices_out in EmbeddingBag.cpp
+        numBags = offsets.shape[0]
+        if mode == MODE_MAX:
+            if include_last_offset:
+                torch._check(
+                    numBags >= 1,
+                    lambda: "include_last_offset: numBags should be at least 1",
+                )
+                numBags -= 1
+            max_indices = offsets.new_empty(numBags, weight.shape[1])
+        else:
+            max_indices = offsets.new_empty(bag_size.size())  # same size as bag_size
+    return output, offset2bag, bag_size, max_indices
+
+
+@register_meta(aten._embedding_bag_forward_only.default)
+def meta_embedding_bag_forward_only(weight, indices, offsets, *args):
+    """Same as `meta_embedding_bag`, but on CPU `bag_size` is an empty tensor sized like `offsets`."""
+    results = list(meta_embedding_bag(weight, indices, offsets, *args))
+    if device_hint(offsets) == "cpu":
+        # Index 2 is `bag_size` in (output, offset2bag, bag_size, max_indices).
+        results[2] = offsets.new_empty(offsets.size())
+    return tuple(results)
+
+
+def _get_reduction_dtype(input, dtype, promote_int_to_long=True):
+    """Pick the result dtype of a reduction over `input`.
+
+    A truthy `dtype` wins.  Otherwise floating-point / complex inputs keep their
+    dtype, and other inputs become `torch.long` when `promote_int_to_long` is set.
+    """
+    if dtype:
+        return dtype
+    source_dtype = input.dtype
+    keeps_dtype = source_dtype.is_floating_point or source_dtype.is_complex
+    return torch.long if (promote_int_to_long and not keeps_dtype) else source_dtype
+
+
+@register_meta([aten.nansum.default, aten.nansum.out])
+@out_wrapper()
+def meta_nansum(input, dims=None, keepdim=False, *, dtype=None):
+    """Meta kernel for `nansum`: empty tensor with the reduced shape and the reduction dtype."""
+    result_dtype = _get_reduction_dtype(input, dtype, promote_int_to_long=True)
+    reduced_shape = _compute_reduction_shape(input, utils.reduction_dims(input.shape, dims), keepdim)
+    return input.new_empty(reduced_shape, dtype=result_dtype)
+
+
+@register_meta([aten.median.default, aten.nanmedian.default])
+def meta_median(input):
+    """Meta kernel for full `median` / `nanmedian`: reduce over every dimension of `input`."""
+    all_dims = tuple(range(input.dim()))
+    reduced_shape = utils.compute_reduction_output_shape(input.shape, all_dims)
+    return input.new_empty(reduced_shape)
+
+
+@register_meta(
+    [
+        aten.median.dim, aten.median.dim_values,
+        aten.nanmedian.dim, aten.nanmedian.dim_values,
+        aten.mode.default, aten.mode.values,
+    ]
+)
+@out_wrapper("values", "indices")
+def meta_median_mode_dim(input, dim=-1, keepdim=False):
+    """Meta kernel for dim-wise `median` / `nanmedian` / `mode`.
+
+    Returns `(values, indices)`: two empty tensors of the reduced shape, `indices` being int64.
+    """
+    # NOTE(review): presumably reacts to the deterministic-algorithms setting -- confirm in utils.
+    if device_hint(input) == "cuda":
+        utils.alert_not_deterministic("median CUDA with indices output")
+    reduced_shape = _compute_reduction_shape(input, utils.reduction_dims(input.shape, (dim,)), keepdim)
+    values = input.new_empty(reduced_shape)
+    indices = input.new_empty(reduced_shape, dtype=torch.long)
+    return values, indices
+
+
+@register_meta(aten.logical_not_.default)
+def meta_logical_not_(self):
+    return self  # in-place op: the input tensor itself is returned
+
+
+@register_meta(aten.repeat.default)
+def meta_repeat(self, repeats):
+    """Meta kernel for `repeat`: each (left-padded) size of `self` times its repeat count."""
+    extra_dims = len(repeats) - self.dim()
+    torch._check(
+        extra_dims >= 0,
+        lambda: "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor",
+    )
+    # Add new leading dimensions to the tensor if the number of target
+    # dimensions is larger than the number of source dimensions.
+    padded_size = (1,) * extra_dims + tuple(self.shape)
+    target_size = [size * count for size, count in zip(padded_size, repeats)]
+    return self.new_empty(target_size)
+
+
+@register_meta(aten.zero_.default)
+def meta_zero_(self):
+    return self  # in-place op: the input tensor itself is returned
+
+
+@register_meta(
+    [
+        aten.mul_.Scalar, aten.div_.Scalar,
+        aten.mul_.Tensor, aten.div_.Tensor,
+        aten.logical_and_.default, aten.logical_or_.default, aten.logical_xor_.default,
+    ],
+)
+def meta_binop_inplace(self, other):
+    """Meta kernel for in-place binary ops: returns `self` itself.
+
+    A tensor `other` is first passed to `check_inplace_broadcast` with both shapes.
+    """
+    if isinstance(other, torch.Tensor):
+        check_inplace_broadcast(self.shape, other.shape)
+    return self
+
+
+@register_meta(
+    [aten.add_.Scalar, aten.sub_.Scalar, aten.add_.Tensor, aten.sub_.Tensor],
+)
+def meta_binop_inplace_alpha(self, other, alpha=1):
+    """Meta kernel for in-place `add_` / `sub_`: returns `self` itself.
+
+    A tensor `other` is first passed to `check_inplace_broadcast` with both shapes;
+    `alpha` is accepted but not used.
+    """
+    if isinstance(other, torch.Tensor):
+        check_inplace_broadcast(self.shape, other.shape)
+    return self
+
+
+@register_meta([aten.round.default, aten.round.decimals])
+def meta_round(self, **kwargs):
+    """Meta kernel for `round`; keyword arguments are accepted and ignored."""
+    promotion = ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    return elementwise_meta(self, type_promotion=promotion)
+
+
+def shift_dtype_check(fn_name, self, val):
+    """Validate operand dtypes for the shift op `fn_name`: `self` must be integral and
+    `val` must be either an integer-dtype tensor or an `IntLike`.
+    """
+    torch._check(
+        utils.is_integer_dtype(self.dtype),
+        lambda: f"{fn_name}: Expected input tensor to have an integral dtype. Got {self.dtype}",
+    )
+    if not isinstance(val, torch.Tensor):
+        torch._check(isinstance(val, IntLike), lambda: f"{fn_name}: Expected shift value to be an int. Got {val}")
+        return
+    torch._check(
+        utils.is_integer_dtype(val.dtype),
+        lambda: f"{fn_name}: Expected shift value to have an integral dtype. Got {val.dtype}",
+    )
+
+
+@register_meta([aten.__rshift__.Tensor, aten.__rshift__.Scalar])
+def meta_rshifts(self, other):
+    """Meta kernel for `__rshift__`: dtype-check the operands, then defer to `elementwise_meta`."""
+    shift_dtype_check("rshift", self, other)
+    promotion = ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    return elementwise_meta(self, other, type_promotion=promotion)
+
+
+@register_meta([aten.__lshift__.Tensor, aten.__lshift__.Scalar])
+def meta_lshifts(self, other):
+    """Meta kernel for `__lshift__`: dtype-check the operands, then defer to `elementwise_meta`."""
+    shift_dtype_check("lshift", self, other)
+    promotion = ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    return elementwise_meta(self, other, type_promotion=promotion)
+
+
+@register_meta(aten.zero.default)
+def meta_zero(self):
+    return self.new_empty(self.shape)  # out-of-place: new empty tensor with the shape of `self`
+
+
+@register_meta([aten.fill_.Tensor, aten.fill_.Scalar])
+def meta_fill_(self, val):  # `val` is not inspected
+    return self  # in-place op: the input tensor itself is returned
+
+
+@register_meta([aten.fill.Tensor, aten.fill.Scalar])
+def meta_fill(self, val):  # `val` is not inspected
+    return torch.empty_like(self)  # out-of-place: new empty tensor like `self`
+
+
+@register_meta(aten.relu_.default)
+def meta_relu_(self):
+    return self  # in-place op: the input tensor itself is returned
+
+
+@register_meta([aten.index_put.default, aten._unsafe_index_put.default])
+def meta_index_put(self, indices, values, accumulate=False):  # only `self` is used
+    return torch.empty_like(self)  # out-of-place: new empty tensor like `self`
+
+
+@register_meta(aten.masked_fill_.Scalar)
+def meta_masked_fill_(self, mask, value):  # `value` is not inspected
+    check_inplace_broadcast(self.shape, mask.shape)  # helper is defined elsewhere in this file
+    return self  # in-place op: the input tensor itself is returned
+
+
+@register_meta(aten.masked_scatter_)
+def meta_masked_scatter_(self, mask, source):
+    """Meta kernel for `masked_scatter_`: checks the mask / source dtypes, returns `self`."""
+    torch._check(mask.dtype in (torch.bool, torch.uint8), lambda: "Mask must be bool or uint8")
+    torch._check(
+        self.dtype == source.dtype,
+        # The second fragment carries the placeholders, so it has to be an f-string.
+        lambda: "masked_scatter: expected self and source to have same "
+        f"dtypes but got {self.dtype} and {source.dtype}",
+    )
+    return self
+
+
+@register_meta(aten.masked_scatter)
+@out_wrapper()
+def meta_masked_scatter(self, mask, source):  # out-of-place wrapper around meta_masked_scatter_
+    broadcast_self, broadcast_mask = _maybe_broadcast(self, mask)
+    result = torch.empty_like(broadcast_self, memory_format=torch.contiguous_format)
+    return meta_masked_scatter_(result, broadcast_mask, source)
+
+
+@register_meta(aten.masked_scatter_backward)
+def meta_masked_scatter_backward(self, mask, sizes):  # `mask` is not inspected
+    return self.new_empty(sizes)  # new empty tensor of shape `sizes`, created from `self`
+
+
+@register_meta(aten.index_put_.default)
+def meta_index_put_(self, indices, values, accumulate=False):  # only `self` is used
+    return self  # in-place op: the input tensor itself is returned
+
+
+@register_meta(aten.alias.default)
+def meta_alias(self):
+    return self.view(self.shape)  # a view of `self` with its own shape
+
+
+def common_meta_baddbmm_bmm(batch1, batch2, is_bmm, self_baddbmm=None):
+    """Shared shape checking for the `bmm` / `baddbmm` meta kernels.
+
+    `batch1` is (b, n, m) and `batch2` must be (b, m, p); both must be 3D.  When
+    `is_bmm` is falsy and `self_baddbmm` is given, it must be 3D with shape (b, n, p).
+
+    Returns:
+        An empty tensor of shape (b, n, p) created from `batch2`.
+    """
+    torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor")
+    torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor")
+
+    bs, res_rows, contraction_size = batch1.size()
+    batch2_sizes = batch2.size()
+    output_size = (bs, res_rows, batch2_sizes[2])
+    torch._check(
+        batch2_sizes[0] == bs and batch2_sizes[1] == contraction_size,
+        lambda: f"Expected size for first two dimensions of batch2 tensor to be: [{bs}"
+        f", {contraction_size}] but got: [{batch2_sizes[0]}, {batch2_sizes[1]}].",
+    )
+
+    # TODO: handle out
+    output = batch2.new_empty(output_size)
+    if not is_bmm and self_baddbmm is not None:
+        torch._check(self_baddbmm.dim() == 3, lambda: "self must be a 3D tensor")
+        torch._check(
+            self_baddbmm.size() == output_size,
+            lambda: f"Expected an input tensor shape with shape {output_size} but got shape: {self_baddbmm.size()}",
+        )
+
+    return output
+
+
+@register_meta(aten.bmm.default)
+def meta_bmm(self, mat2):
+    """Meta kernel for ``bmm``: delegates to ``common_meta_baddbmm_bmm`` with ``is_bmm=True``."""
+    return common_meta_baddbmm_bmm(self, mat2, is_bmm=True)
+
+
+def div_rtn(x, y):
+    """Divide ``x`` by ``y`` and return the quotient.
+
+    Starts from ``x // y`` and subtracts one when the remainder ``x % y`` is
+    non-zero and its sign differs from the sign of ``y``.
+    """
+    quotient = x // y
+    remainder = x % y
+    if remainder != 0:
+        # WARNING: explicit bool conversion here is necessary;
+        # would be fixed by SymBool
+        if bool(remainder < 0) != bool(y < 0):
+            return quotient - 1
+    return quotient
+
+
+def pooling_output_shape_pad_lr(
+    inputSize, kernelSize, pad_l, pad_r, stride, dilation, ceil_mode
+):
+    """Compute the pooled output length along one dimension.
+
+    The padded input length minus the dilated kernel extent is divided by
+    ``stride`` with ``div_rtn`` and incremented by one. With ``ceil_mode`` the
+    numerator is first bumped by ``stride - 1``, and the result is reduced by
+    one if the last window would start at or beyond ``inputSize + pad_l``.
+    """
+    ceil_bump = stride - 1 if ceil_mode else 0
+    numerator = (
+        inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 + ceil_bump
+    )
+    outputSize = div_rtn(numerator, stride) + 1
+    if ceil_mode and (outputSize - 1) * stride >= inputSize + pad_l:
+        # Drop a trailing window that would begin past the left-padded input.
+        outputSize -= 1
+    return outputSize
+
+
+def pooling_output_shape(inputSize, kernelSize, pad, stride, dilation, ceil_mode):
+    """Validate symmetric padding/stride and compute the pooled output length.
+
+    Requires a non-zero ``stride``, a non-negative ``pad`` and a ``pad`` of at
+    most half the effective (dilated) kernel size, then defers to
+    ``pooling_output_shape_pad_lr`` with ``pad`` applied on both sides.
+    """
+    torch._check(stride != 0, lambda: "stride should not be zero")
+    torch._check(pad >= 0, lambda: f"pad must be non-negative, but got pad: {pad}")
+    max_pad = ((kernelSize - 1) * dilation + 1) // 2
+    torch._check(
+        pad <= max_pad,
+        lambda: (
+            f"pad should be at most half of effective kernel size, but got pad={pad}, "
+            f"kernel_size={kernelSize} and dilation={dilation}"
+        ),
+    )
+    pad_left = pad_right = pad
+    return pooling_output_shape_pad_lr(
+        inputSize, kernelSize, pad_left, pad_right, stride, dilation, ceil_mode
+    )
+
+
+def pool2d_shape_check(
+    input,
+    kH,
+    kW,
+    dH,
+    dW,
+    padH,
+    padW,
+    dilationH,
+    dilationW,
+    nInputPlane,
+    inputHeight,
+    inputWidth,
+    outputHeight,
+    outputWidth,
+    memory_format,
+):
+    """Validate the arguments of a 2D pooling operation.
+
+    Checks, in order: positive kernel size, stride and dilation; input rank and
+    non-empty non-batch dimensions (4D only when ``memory_format`` is
+    ``torch.channels_last``, 3D or 4D otherwise); padding no larger than half
+    the kernel size; and an output height/width of at least 1.
+
+    ``nInputPlane``, ``inputHeight`` and ``inputWidth`` are only used to build
+    the final error message. Every condition is enforced with ``torch._check``.
+    Returns ``None``.
+    """
+    ndim = input.dim()
+    nOutputPlane = nInputPlane
+
+    # These messages interpolate the offending values, so they must be f-strings.
+    torch._check(
+        kW > 0 and kH > 0,
+        lambda: f"kernel size should be greater than zero, but got kH: {kH}, kW: {kW}",
+    )
+    torch._check(
+        dW > 0 and dH > 0,
+        lambda: f"stride should be greater than zero, but got dH: {dH}, dW: {dW}",
+    )
+    torch._check(
+        dilationH > 0 and dilationW > 0,
+        lambda: f"dilation should be greater than zero, but got dilationH: {dilationH}, dilationW: {dilationW}",
+    )
+
+    valid_dims = input.size(1) != 0 and input.size(2) != 0
+
+    if memory_format == torch.channels_last:
+        torch._check(
+            ndim == 4 and valid_dims and input.size(3) != 0,
+            lambda: "Expected 4D (batch mode) tensor expected for input with channels_last layout"
+            f" with optional 0 dim batch size for input, but got: {input.size()}",
+        )
+    else:
+        torch._check(
+            (ndim == 3 and input.size(0) != 0 and valid_dims)
+            or (ndim == 4 and valid_dims and input.size(3) != 0),
+            lambda: f"Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got: {input.size()}",
+        )
+
+    torch._check(
+        kW // 2 >= padW and kH // 2 >= padH,
+        lambda: "pad should be smaller than or equal to half of kernel size, but got "
+        f"padW = {padW}, padH = {padH}, kW = {kW}, kH = {kH}",
+    )
+
+    torch._check(
+        outputWidth >= 1 and outputHeight >= 1,
+        lambda: f"Given input size: ({nInputPlane}x{inputHeight}x{inputWidth}). "
+        f"Calculated output size: ({nOutputPlane}x{outputHeight}x{outputWidth}). "
+        "Output size is too small",
+    )
+
+
+def pool3d_shape_check(
+    input: Tensor,
+    nslices: int,
+    kT: int,
+    kH: int,
+    kW: int,
+    dT: int,
+    dH: int,
+    dW: int,
+    pT: int,
+    pH: int,
+    pW: int,
+    dilationT: int,
+    dilationH: int,
+    dilationW: int,
+    itime: int,
+    iheight: int,
+    iwidth: int,
+    otime: int,
+    oheight: int,
+    owidth: int,
+    fn_name: str,
+    check_input_size: bool = False,
+):
+    """Validate the arguments of a 3D pooling operation.
+
+    Checks, in order: positive kernel, stride and dilation; a 4D or 5D input;
+    positive length for every dimension except the batch dimension of a 5D
+    input; optionally (``check_input_size``) that the input is at least as
+    large as the kernel; padding of at most half the kernel; and an output of
+    at least 1 along each pooled dimension. ``fn_name`` prefixes some of the
+    messages. All conditions go through ``torch._check``; returns ``None``.
+    """
+    ndim = input.ndim
+
+    torch._check(
+        kT > 0 and kW > 0 and kH > 0,
+        lambda: (
+            "kernel size should be greater than zero, but got "
+            f"kT: {kT}, kH: {kH}, kW: {kW}"
+        ),
+    )
+    torch._check(
+        dT > 0 and dW > 0 and dH > 0,
+        lambda: (
+            "stride should be greater than zero, but got "
+            f"dT: {dT}, dH: {dH}, dW: {dW}"
+        ),
+    )
+    torch._check(
+        dilationT > 0 and dilationW > 0 and dilationH > 0,
+        lambda: (
+            "dilation should be greater than zero, but got "
+            f"dilationT: {dilationT}, dilationH: {dilationH}, dilationW: {dilationW}"
+        ),
+    )
+
+    torch._check(
+        ndim in (4, 5),
+        lambda: f"{fn_name}: Expected 4D or 5D tensor for input, but got: {input.shape}",
+    )
+
+    # size of batch-dim can be 0, so a 5D input starts checking at dim 1.
+    first_checked_dim = 1 if ndim == 5 else 0
+    for i in range(first_checked_dim, ndim):
+        # NOTE(review): the message interpolates input.size(i) (the length),
+        # not the dimension index i -- kept as-is; confirm whether intended.
+        torch._check(
+            input.size(i) > 0,
+            lambda: (
+                f"{fn_name}: Expected input's non-batch dimensions to have positive length,"
+                f" but input has a shape of {input.shape}"
+                f" and non-batch dimension {input.size(i)} has length zero!"
+            ),
+        )
+
+    if check_input_size:  # AveragePool3d
+        input_covers_kernel = itime >= kT and iheight >= kH and iwidth >= kW
+        torch._check(
+            input_covers_kernel,
+            lambda: (
+                f"input image (T: {itime} H: {iheight} W: {iwidth}) smaller than "
+                f"kernel size (kT: {kT} kH: {kH} kW: {kW})"
+            ),
+        )
+
+    torch._check(
+        kT / 2 >= pT and kW / 2 >= pW and kH / 2 >= pH,
+        lambda: (
+            "pad should be smaller than or equal to half of kernel size, but got "
+            f"kT: {kT} kW: {kW} kH: {kH} padT: {pT} padW: {pW} padH: {pH}"
+        ),
+    )
+
+    torch._check(
+        otime >= 1 and owidth >= 1 and oheight >= 1,
+        lambda: (
+            f"Given input size: ({nslices}x{itime}x{iheight}x{iwidth}). "
+            f"Calculated output size: ({nslices}x{otime}x{oheight}x{owidth}). "
+            "Output size is too small"
+        ),
+    )
+
+
+def max_pool3d_backward_shape_check(
+    input,
+    grad_output,
+    indices,
+    nslices,
+    kT,
+    kH,
+    kW,
+    dT,
+    dH,
+    dW,
+    pT,
+    pH,
+    pW,
+    dilationT,
+    dilationH,
+    dilationW,
+    itime,
+    iheight,
+    iwidth,
+    otime,
+    oheight,
+    owidth,
+    fn_name,
+):
+    """Shape validation for the backward pass of 3D max pooling.
+
+    Runs ``pool3d_shape_check`` on the forward arguments, then uses
+    ``check_dim_size`` to require that the last four dimensions of
+    ``grad_output`` and of ``indices`` equal
+    ``(nslices, otime, oheight, owidth)``.
+    """
+    ndim = input.ndim
+
+    pool3d_shape_check(
+        input,
+        nslices,
+        kT,
+        kH,
+        kW,
+        dT,
+        dH,
+        dW,
+        pT,
+        pH,
+        pW,
+        dilationT,
+        dilationH,
+        dilationW,
+        itime,
+        iheight,
+        iwidth,
+        otime,
+        oheight,
+        owidth,
+        fn_name,
+    )
+
+    # (dimension index, expected size) for the trailing four dimensions.
+    expected_sizes = (
+        (ndim - 4, nslices),
+        (ndim - 3, otime),
+        (ndim - 2, oheight),
+        (ndim - 1, owidth),
+    )
+    for tensor in (grad_output, indices):
+        for dim, expected in expected_sizes:
+            check_dim_size(tensor, ndim, dim, expected)
+
+
+def avg_pool3d_backward_shape_check(
+    input: Tensor,
+    grad_output: Tensor,
+    nslices: int,
+    kT: int,
+    kH: int,
+    kW: int,
+    dT: int,
+    dH: int,
+    dW: int,
+    pT: int,
+    pH: int,
+    pW: int,
+    itime: int,
+    iheight: int,
+    iwidth: int,
+    otime: int,
+    oheight: int,
+    owidth: int,
+    fn_name: str,
+):
+    """Shape validation for the backward pass of 3D average pooling.
+
+    Runs ``pool3d_shape_check`` with all dilations fixed to 1 and the
+    input-size check enabled, then uses ``check_dim_size`` to require that the
+    last four dimensions of ``grad_output`` equal
+    ``(nslices, otime, oheight, owidth)``.
+    """
+    ndim = input.ndim
+    unit_dilation = 1
+
+    pool3d_shape_check(
+        input,
+        nslices,
+        kT,
+        kH,
+        kW,
+        dT,
+        dH,
+        dW,
+        pT,
+        pH,
+        pW,
+        unit_dilation,
+        unit_dilation,
+        unit_dilation,
+        itime,
+        iheight,
+        iwidth,
+        otime,
+        oheight,
+        owidth,
+        fn_name,
+        True,
+    )
+
+    trailing_sizes = (nslices, otime, oheight, owidth)
+    for offset, expected in zip((4, 3, 2, 1), trailing_sizes):
+        check_dim_size(grad_output, ndim, ndim - offset, expected)
+
+
+def max_pool2d_checks_and_compute_shape(
+    input, kernel_size, stride, padding, dilation, ceil_mode
+):
+    """Validate ``max_pool2d`` arguments and compute the output plane shape.
+
+    ``kernel_size``, ``padding`` and ``dilation`` must have one or two
+    entries (one entry is used for both height and width); ``stride`` may
+    also be empty, in which case the kernel size is used. The input must be
+    4D for channels_last and 3D or 4D for contiguous memory format.
+
+    Returns:
+        ``(nInputPlane, outputHeight, outputWidth)``.
+    """
+    # Reference: aten/src/ATen/native/DilatedMaxPool2d.cpp
+    def unpack(name, val):
+        torch._check(
+            len(val) in [1, 2],
+            lambda: f"max_pool2d: {name} must either be a single int, or a tuple of two ints",
+        )
+        if len(val) == 1:
+            return val[0], val[0]
+        return val[0], val[1]
+
+    kH, kW = unpack("kernel_size", kernel_size)
+
+    torch._check(
+        len(stride) in [0, 1, 2],
+        lambda: "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints",
+    )
+    # An omitted stride falls back to the kernel size.
+    dH, dW = unpack("stride", stride) if len(stride) != 0 else (kH, kW)
+
+    padH, padW = unpack("padding", padding)
+    dilationH, dilationW = unpack("dilation", dilation)
+    nInputPlane, inputHeight, inputWidth = (
+        input.size(-3),
+        input.size(-2),
+        input.size(-1),
+    )
+
+    memory_format = utils.suggest_memory_format(input)
+    if memory_format == torch.channels_last:
+        torch._check(
+            input.dim() == 4,
+            lambda: "non-empty 4D (batch mode) tensor expected for input with channels_last layout",
+        )
+    elif memory_format == torch.contiguous_format:
+        torch._check(
+            input.dim() in [3, 4],
+            lambda: "non-empty 3D or 4D (batch mode) tensor expected for input",
+        )
+    else:
+        torch._check(
+            False,
+            lambda: "Unsupport memory format. Supports only ChannelsLast, Contiguous",
+        )
+
+    outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode)
+    outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode)
+
+    pool2d_shape_check(
+        input,
+        kH,
+        kW,
+        dH,
+        dW,
+        padH,
+        padW,
+        dilationH,
+        dilationW,
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        outputHeight,
+        outputWidth,
+        memory_format,
+    )
+
+    return nInputPlane, outputHeight, outputWidth
+
+
+@register_meta(aten.max_pool2d_with_indices_backward.default)
+def meta_max_pool2d_with_indices_backward(
+    grad_output,
+    self,
+    kernel_size,
+    stride,
+    padding,
+    dilation,
+    ceil_mode,
+    indices,
+):
+    """Meta kernel for ``max_pool2d_with_indices_backward``.
+
+    Re-runs the forward argument checks on ``self``, requires ``grad_output``
+    to share ``self``'s dtype, and requires the last three dimensions of both
+    ``grad_output`` and ``indices`` to match the computed output shape.
+    Returns an empty tensor with ``self``'s shape, dtype and device in the
+    memory format suggested for ``self``.
+    """
+    nInputPlane, outputHeight, outputWidth = max_pool2d_checks_and_compute_shape(
+        self, kernel_size, stride, padding, dilation, ceil_mode
+    )
+
+    torch._check(
+        self.dtype == grad_output.dtype,
+        lambda: f"Expected dtype {self.dtype} for `gradOutput` but got dtype {grad_output.dtype}",
+    )
+
+    nOutputPlane = nInputPlane
+    ndim = self.ndim
+
+    for tensor in (grad_output, indices):
+        check_dim_size(tensor, ndim, ndim - 3, nOutputPlane)
+        check_dim_size(tensor, ndim, ndim - 2, outputHeight)
+        check_dim_size(tensor, ndim, ndim - 1, outputWidth)
+
+    return torch.empty(
+        self.shape,
+        dtype=self.dtype,
+        device=self.device,
+        memory_format=utils.suggest_memory_format(self),
+    )
+
+
+@register_meta(aten.max_pool2d_with_indices.default)
+def meta_max_pool2d_with_indices(
+    input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False
+):
+    """Meta kernel for ``max_pool2d_with_indices``.
+
+    Validates the arguments via ``max_pool2d_checks_and_compute_shape`` and
+    returns two empty tensors of the pooled shape on ``input``'s device and in
+    its suggested memory format: one with ``input``'s dtype (values) and one
+    with ``torch.int64`` (indices).
+    """
+    nInputPlane, outputHeight, outputWidth = max_pool2d_checks_and_compute_shape(
+        input, kernel_size, stride, padding, dilation, ceil_mode
+    )
+
+    memory_format = utils.suggest_memory_format(input)
+    if input.dim() == 3:
+        size = [nInputPlane, outputHeight, outputWidth]
+    else:
+        nbatch = input.size(-4) if input.dim() == 4 else 1
+        size = [nbatch, nInputPlane, outputHeight, outputWidth]
+
+    def _allocate(dtype):
+        return torch.empty(
+            size,
+            dtype=dtype,
+            device=input.device,
+            memory_format=memory_format,
+        )
+
+    return (_allocate(input.dtype), _allocate(torch.int64))
+
+
+@register_meta(aten.fractional_max_pool2d.default)
+def meta_fractional_max_pool2d(self_, kernel_size, output_size, random_samples):
+    """Meta kernel for ``fractional_max_pool2d``.
+
+    Args:
+        self_: 3D or 4D input whose last three dimensions must be non-empty.
+        kernel_size: sequence of exactly two ints (height, width).
+        output_size: sequence of exactly two ints (height, width).
+        random_samples: 3D tensor with ``self_``'s dtype whose sizes must be
+            ``(>= batch, == channels, == 2)``.
+
+    Returns:
+        Two empty tensors on ``self_``'s device with the last two input
+        dimensions replaced by ``output_size``: one with ``self_``'s dtype and
+        one with ``torch.int64``.
+
+    Every condition is enforced through ``torch._check``, whose message
+    argument must be a callable; all messages are therefore lambdas.
+    """
+    torch._check(
+        self_.ndim in (3, 4),
+        lambda: f"fractional_max_pool2d: Expected 3D or 4D tensor, but got: {self_.ndim}",
+    )
+    ndim = self_.ndim
+
+    for d in range(ndim - 3, ndim):
+        torch._check(
+            self_.size(d) > 0,
+            lambda: f"fractional_max_pool2d: Expected input to have non-zero "
+            f" size for non-batch dimenions, but got {self_.size()} with dimension {d} empty",
+        )
+
+    # the check and message are out of sync, but this matches the structured meta
+    torch._check(
+        len(kernel_size) == 2,
+        lambda: "fractional_max_pool2d: kernel_size must "
+        "either be a single int or tuple of Ints",
+    )
+    torch._check(
+        len(output_size) == 2,
+        lambda: "fractional_max_pool2d: output_size must "
+        "either be a single int or tuple of Ints",
+    )
+
+    input_channels = self_.size(-3)
+    input_height = self_.size(-2)
+    input_width = self_.size(-1)
+    if ndim == 4:
+        input_batch = self_.size(0)
+    else:
+        input_batch = 1
+
+    torch._check(
+        self_.dtype == random_samples.dtype,
+        lambda: "Expect _random_samples to have the same dtype as input",
+    )
+    torch._check(
+        random_samples.ndim == 3,
+        lambda: f"Expect _random samples to have 3 dimensions got, {random_samples.ndim}",
+    )
+
+    n = random_samples.size(0)
+    c = random_samples.size(1)
+    d = random_samples.size(2)
+    torch._check(
+        n >= input_batch,
+        lambda: "Expect _random_samples.size(0) no less then input batch size.",
+    )
+    torch._check(
+        c == input_channels,
+        lambda: "Expect _random_samples.size(1) equals to input channel size.",
+    )
+    torch._check(d == 2, lambda: f"Expect _random_samples.size(2) equals to 2 got {d}.")
+
+    torch._check(
+        output_size[0] + kernel_size[0] - 1 <= input_height,
+        lambda: f"fractional_max_pool2d: kernel height {kernel_size[0]} is too large relative to input height {input_height}",
+    )
+    torch._check(
+        output_size[1] + kernel_size[1] - 1 <= input_width,
+        lambda: f"fractional_max_pool2d: kernel width {kernel_size[1]} is too large relative to input width {input_width}",
+    )
+
+    if self_.dim() == 4:
+        size = [input_batch, input_channels, output_size[0], output_size[1]]
+    else:
+        size = [input_channels, output_size[0], output_size[1]]
+
+    return (
+        torch.empty(
+            size,
+            dtype=self_.dtype,
+            device=self_.device,
+        ),
+        torch.empty(
+            size,
+            dtype=torch.int64,
+            device=self_.device,
+        ),
+    )
+
+
+@register_meta(aten.max_unpool2d)
+@out_wrapper()
+def meta_max_unpool2d(self_, indices, output_size):
+    """Meta kernel for ``max_unpool2d``.
+
+    Requires int64 ``indices`` with the same shape as ``self_``, a two-element
+    ``output_size`` (height, width), a 3D or 4D input and non-empty sizes for
+    every dimension from index 1 onwards. Returns a new empty tensor whose
+    last two dimensions are ``output_size`` and whose leading dimensions are
+    copied from the contiguous input.
+    """
+    utils.alert_not_deterministic("max_unpooling2d_forward_out")
+
+    torch._check(
+        indices.dtype == torch.int64,
+        lambda: f"elements in indices should be type int64 but got: {indices.dtype}",
+    )
+    torch._check(
+        len(output_size) == 2,
+        lambda: (
+            "There should be exactly two elements (height, width) in output_size, "
+            f"but got {len(output_size)} elements."
+        ),
+    )
+
+    oheight, owidth = output_size
+
+    torch._check(
+        self_.ndim in (3, 4),
+        lambda: (
+            "Input to max_unpooling2d should be a 3d or 4d Tensor, "
+            f"but got a tensor with {self_.ndim} dimensions."
+        ),
+    )
+    torch._check(
+        self_.shape == indices.shape,
+        lambda: (
+            f"Expected shape of indices to be same as that of the input tensor ({self_.shape}) "
+            f"but got indices tensor with shape: {indices.shape}"
+        ),
+    )
+
+    for i in range(1, self_.ndim):
+        torch._check(
+            self_.size(i) > 0,
+            lambda: (
+                "max_unpooling2d(): "
+                "Expected input to have non-zero size for non-batch dimensions, "
+                f"but got {self_.shape} with dimension {i} being empty."
+            ),
+        )
+
+    contiguous_input = self_.contiguous()
+
+    # 3D input has only a channel dimension in front; 4D also has a batch.
+    if self_.ndim == 3:
+        leading = (contiguous_input.size(0),)
+    else:
+        leading = (contiguous_input.size(0), contiguous_input.size(1))
+
+    return contiguous_input.new_empty(leading + (oheight, owidth))
+
+
+def _max_unpooling3d_shape_check(input, indices, output_size, stride, padding, fn_name):
+    """Validate the arguments of 3D max unpooling.
+
+    Requires int64 ``indices`` with the same shape as ``input``, a 4D or 5D
+    input, three-element ``output_size``, ``stride`` and ``padding``,
+    non-empty sizes for every dimension from index 1 onwards, and strictly
+    positive strides. ``fn_name`` prefixes the empty-dimension message.
+    All conditions go through ``torch._check``; returns ``None``.
+    """
+    torch._check(
+        indices.dtype == torch.int64, lambda: "elements in indices should be type int64"
+    )
+    torch._check(
+        input.ndim in (4, 5),
+        lambda: f"Input to max_unpooling3d should be a 4d or 5d Tensor, but got a tensor with {input.ndim} dimensions.",
+    )
+    torch._check(
+        len(output_size) == 3,
+        lambda: (
+            "There should be exactly three elements (depth, height, width) in output_size, "
+            f"but got {len(output_size)} elements."
+        ),
+    )
+    # stride and padding share the same length rule and message template.
+    for label, seq in (("stride", stride), ("padding", padding)):
+        torch._check(
+            len(seq) == 3,
+            lambda: f"There should be exactly three elements (depth, height, width) in {label}, but got: {len(seq)} elements.",
+        )
+    torch._check(
+        input.shape == indices.shape,
+        lambda: (
+            f"Expected shape of indices to be same as that of the input tensor ({input.shape}) "
+            f"but got indices tensor with shape: {indices.shape}"
+        ),
+    )
+
+    for i in range(1, input.ndim):
+        torch._check(
+            input.size(i) > 0,
+            lambda: (
+                f"{fn_name}: "
+                "Expected input to have non-zero size for non-batch dimensions, "
+                f"but got {input.shape} with dimension {i} being empty."
+            ),
+        )
+
+    strides_positive = stride[0] > 0 and stride[1] > 0 and stride[2] > 0
+    torch._check(
+        strides_positive,
+        lambda: f"strides should be greater than zero, but got stride: {stride}",
+    )
+
+
+@register_meta(aten.max_unpool3d)
+@out_wrapper()
+def meta_max_unpool3d(self_, indices, output_size, stride, padding):
+    """Meta kernel for ``max_unpool3d``.
+
+    After ``_max_unpooling3d_shape_check`` passes, returns a new empty tensor
+    whose last three dimensions are ``output_size`` and whose leading
+    dimensions (channels for 4D input, batch and channels otherwise) are
+    copied from the contiguous input.
+    """
+    utils.alert_not_deterministic("max_unpooling3d_forward_out")
+
+    _max_unpooling3d_shape_check(
+        self_, indices, output_size, stride, padding, "max_unpooling3d()"
+    )
+
+    contiguous_input = self_.contiguous()
+
+    odepth, oheight, owidth = output_size
+
+    if self_.ndim == 4:
+        leading = (contiguous_input.size(0),)
+    else:
+        leading = (contiguous_input.size(0), contiguous_input.size(1))
+
+    return contiguous_input.new_empty(leading + (odepth, oheight, owidth))
+
+
+@register_meta(aten.max_pool3d_with_indices)
+@out_wrapper("out", "indices")
+def meta_max_pool3d_with_indices(
+    input,
+    kernel_size,
+    stride=(),
+    padding=(0,),
+    dilation=(1,),
+    ceil_mode=False,
+):
+    """Meta kernel for ``max_pool3d_with_indices``.
+
+    ``kernel_size``, ``padding`` and ``dilation`` must have one or three
+    entries (a single entry is used for all of T/H/W); ``stride`` may also be
+    empty, in which case the kernel size is used. The pooled sizes come from
+    ``pooling_output_shape`` and are validated by ``pool3d_shape_check``.
+
+    Returns:
+        ``(out, indices)``: empty tensors of the pooled shape, ``out`` with
+        ``input``'s dtype and ``indices`` with ``torch.int64``; both are
+        converted to ``torch.channels_last_3d`` when the input is detected as
+        channels-last.
+    """
+    torch._check(
+        len(kernel_size) in (1, 3),
+        lambda: "max_pool3d: kernel_size must either be a single int, or a tuple of three ints",
+    )
+    kT = kernel_size[0]
+    kH, kW = (kT, kT) if len(kernel_size) == 1 else (kernel_size[1], kernel_size[2])
+
+    torch._check(
+        not stride or len(stride) in (1, 3),
+        lambda: "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints",
+    )
+    if not stride:
+        # Omitted stride defaults to the kernel size.
+        dT, dH, dW = kT, kH, kW
+    elif len(stride) == 1:
+        dT = dH = dW = stride[0]
+    else:
+        dT, dH, dW = stride[0], stride[1], stride[2]
+
+    torch._check(
+        len(padding) in (1, 3),
+        lambda: "max_pool3d: padding must either be a single int, or a tuple of three ints",
+    )
+    pT = padding[0]
+    pH, pW = (pT, pT) if len(padding) == 1 else (padding[1], padding[2])
+
+    torch._check(
+        len(dilation) in (1, 3),
+        lambda: "max_pool3d: dilation must be either a single int, or a tuple of three ints",
+    )
+    dilationT = dilation[0]
+    if len(dilation) == 1:
+        dilationH = dilationW = dilationT
+    else:
+        dilationH, dilationW = dilation[1], dilation[2]
+
+    torch._check(
+        input.ndim in (4, 5),
+        lambda: "non-empty 4D or 5D (batch mode) tensor expected for input",
+    )
+
+    nbatch = input.size(-5) if input.ndim == 5 else 1
+    nslices = input.size(-4)
+    itime = input.size(-3)
+    iheight = input.size(-2)
+    iwidth = input.size(-1)
+
+    otime = pooling_output_shape(itime, kT, pT, dT, dilationT, ceil_mode)
+    oheight = pooling_output_shape(iheight, kH, pH, dH, dilationH, ceil_mode)
+    owidth = pooling_output_shape(iwidth, kW, pW, dW, dilationW, ceil_mode)
+
+    pool3d_shape_check(
+        input,
+        nslices,
+        kT,
+        kH,
+        kW,
+        dT,
+        dH,
+        dW,
+        pT,
+        pH,
+        pW,
+        dilationT,
+        dilationH,
+        dilationW,
+        itime,
+        iheight,
+        iwidth,
+        otime,
+        oheight,
+        owidth,
+        "max_pool3d_with_indices()",
+    )
+
+    if input.ndim == 4:
+        # Unbatched input: probe the layout on a copy with a leading batch dim.
+        batched_view = input.unsqueeze(0)
+        channels_last = (
+            not batched_view.is_contiguous()
+        ) and batched_view.is_contiguous(memory_format=torch.channels_last_3d)
+        out_shape = (nslices, otime, oheight, owidth)
+    else:
+        channels_last = (
+            input.ndim == 5
+            and utils.suggest_memory_format(input) == torch.channels_last_3d
+        )
+        out_shape = (nbatch, nslices, otime, oheight, owidth)  # type: ignore[assignment]
+
+    out = input.new_empty(out_shape)
+    indices = input.new_empty(out_shape, dtype=torch.int64)
+
+    if channels_last:
+        out = out.to(memory_format=torch.channels_last_3d)
+        indices = indices.to(memory_format=torch.channels_last_3d)
+
+    return out, indices
+
+
+@register_meta(aten.max_pool3d_with_indices_backward)
+@out_wrapper("grad_input")
+def meta_max_pool3d_with_indices_backward(
+    grad_output,
+    input,
+    kernel_size,
+    stride,
+    padding,
+    dilation,
+    ceil_mode,
+    indices,
+):
+    """Meta kernel for ``max_pool3d_with_indices_backward``.
+
+    Unpacks ``kernel_size``/``stride``/``padding``/``dilation`` exactly like
+    the forward kernel (one or three entries; empty stride means kernel
+    size), reads the pooled sizes from the last three dimensions of
+    ``grad_output`` and validates everything through
+    ``max_pool3d_backward_shape_check``.
+
+    Returns:
+        An empty tensor with ``input``'s shape created via ``new_empty``,
+        converted to ``torch.channels_last_3d`` when the input is detected as
+        channels-last.
+    """
+    torch._check(
+        len(kernel_size) in (1, 3),
+        lambda: "max_pool3d: kernel_size must either be a single int, or a tuple of three ints",
+    )
+    kT = kernel_size[0]
+    kH, kW = (kT, kT) if len(kernel_size) == 1 else (kernel_size[1], kernel_size[2])
+
+    torch._check(
+        not stride or len(stride) in (1, 3),
+        lambda: "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints",
+    )
+    if not stride:
+        # Omitted stride defaults to the kernel size.
+        dT, dH, dW = kT, kH, kW
+    elif len(stride) == 1:
+        dT = dH = dW = stride[0]
+    else:
+        dT, dH, dW = stride[0], stride[1], stride[2]
+
+    torch._check(
+        len(padding) in (1, 3),
+        lambda: "max_pool3d: padding must either be a single int, or a tuple of three ints",
+    )
+    pT = padding[0]
+    pH, pW = (pT, pT) if len(padding) == 1 else (padding[1], padding[2])
+
+    torch._check(
+        len(dilation) in (1, 3),
+        lambda: "max_pool3d: dilation must be either a single int, or a tuple of three ints",
+    )
+    dilationT = dilation[0]
+    if len(dilation) == 1:
+        dilationH = dilationW = dilationT
+    else:
+        dilationH, dilationW = dilation[1], dilation[2]
+
+    torch._check(
+        input.ndim in (4, 5),
+        lambda: "non-empty 4D or 5D (batch mode) tensor expected for input",
+    )
+
+    nslices = input.size(-4)
+    itime = input.size(-3)
+    iheight = input.size(-2)
+    iwidth = input.size(-1)
+
+    otime = grad_output.size(-3)
+    oheight = grad_output.size(-2)
+    owidth = grad_output.size(-1)
+
+    max_pool3d_backward_shape_check(
+        input,
+        grad_output,
+        indices,
+        nslices,
+        kT,
+        kH,
+        kW,
+        dT,
+        dH,
+        dW,
+        pT,
+        pH,
+        pW,
+        dilationT,
+        dilationH,
+        dilationW,
+        itime,
+        iheight,
+        iwidth,
+        otime,
+        oheight,
+        owidth,
+        "max_pool3d_with_indices_backward()",
+    )
+
+    if input.ndim == 4:
+        # Unbatched input: probe the layout on a copy with a leading batch dim.
+        batched_view = input.unsqueeze(0)
+        channels_last = (
+            not batched_view.is_contiguous()
+        ) and batched_view.is_contiguous(memory_format=torch.channels_last_3d)
+    else:
+        channels_last = (
+            input.ndim == 5
+            and utils.suggest_memory_format(input) == torch.channels_last_3d
+        )
+
+    grad_input = input.new_empty(input.shape)
+
+    if channels_last:
+        grad_input = grad_input.to(memory_format=torch.channels_last_3d)
+
+    return grad_input
+
+
+def check_grid_sampler_common(input: Tensor, grid: Tensor):
+    """Run the ``grid_sampler`` checks that do not depend on dimensionality.
+
+    Requires ``input`` and ``grid`` to share a device, both to use the strided
+    layout, to agree on the size of dimension 0, ``grid``'s last dimension to
+    equal ``input.ndim - 2``, and every ``input`` dimension from index 2
+    onwards to be non-empty. All conditions go through ``torch._check``.
+    """
+    torch._check(
+        input.device == grid.device,
+        lambda: (
+            "grid_sampler(): expected input and grid to be on same device, but input "
+            f"is on {input.device} and grid is on {grid.device}"
+        ),
+    )
+    both_strided = input.layout == torch.strided and grid.layout == torch.strided
+    torch._check(
+        both_strided,
+        lambda: (
+            "grid_sampler(): expected input and grid to have torch.strided layout, but "
+            f"input has {input.layout} and grid has {grid.layout}"
+        ),
+    )
+    torch._check(
+        input.shape[0] == grid.shape[0],
+        lambda: (
+            "grid_sampler(): expected grid and input to have same batch size, but got "
+            f"input with sizes {input.shape} and grid with sizes {grid.shape}"
+        ),
+    )
+    spatial_rank = input.ndim - 2
+    torch._check(
+        grid.shape[-1] == spatial_rank,
+        lambda: (
+            f"grid_sampler(): expected grid to have size {input.ndim - 2} in last "
+            f"dimension, but got grid with sizes {grid.shape}"
+        ),
+    )
+
+    for i in range(2, input.ndim):
+        torch._check(
+            input.shape[i] > 0,
+            lambda: (
+                "grid_sampler(): expected input to have non-empty spatial dimensions, "
+                f"but input has sizes {input.shape} with dimension {i} being empty"
+            ),
+        )
+
+
+class GridSamplerInterpolation(Enum):
+    """Integer codes for grid-sampler interpolation modes.
+
+    ``check_grid_sampler_3d`` compares its ``interpolation_mode`` argument
+    against ``BICUBIC.value``.
+    """
+
+    BILINEAR = 0
+    NEAREST = 1
+    BICUBIC = 2
+
+
+def check_grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: int):
+    """Run the 3D-specific ``grid_sampler`` checks.
+
+    Requires ``input`` to be 5D with ``grid`` having the same number of
+    dimensions, and rejects bicubic interpolation for 5D input.
+    """
+    torch._check(
+        input.ndim == 5 and input.ndim == grid.ndim,
+        lambda: (
+            "grid_sampler(): expected 5D input and grid with same number of "
+            f"dimensions, but got input with sizes {input.shape}"
+            f" and grid with sizes {grid.shape}"
+        ),
+    )
+    bicubic_code = GridSamplerInterpolation.BICUBIC.value
+    bicubic_allowed = not (input.ndim == 5 and interpolation_mode == bicubic_code)
+    torch._check(
+        bicubic_allowed,
+        lambda: "grid_sampler(): bicubic interpolation only supports 4D input",
+    )
+
+
+@register_meta(aten.grid_sampler_2d_backward.default)
+def grid_sampler_2d_backward_meta(
+    grad_output,
+    input,
+    grid,
+    interpolation_mode,
+    padding_mode,
+    align_corners,
+    output_mask,
+):
+    """Meta kernel for ``grid_sampler_2d_backward``.
+
+    Returns ``(grad_input, grad_grid)``. ``grad_input`` is a contiguous
+    ``zeros_like(input)`` when ``output_mask[0]`` is truthy and ``None``
+    otherwise; ``grad_grid`` is a contiguous ``empty_like(grid)``.
+    """
+    grad_input = (
+        torch.zeros_like(input, memory_format=torch.contiguous_format)
+        if output_mask[0]
+        else None
+    )
+    grad_grid = torch.empty_like(grid, memory_format=torch.contiguous_format)
+    return (grad_input, grad_grid)
+
+
+@register_meta(aten.grid_sampler_3d)
+@out_wrapper()
+def grid_sampler_3d(
+    input,
+    grid,
+    interpolation_mode,
+    padding_mode,
+    align_corners,
+):
+    """Meta kernel for ``grid_sampler_3d``.
+
+    After the common and 3D-specific checks, returns a new empty tensor of
+    shape ``(input.shape[0], input.shape[1], grid.shape[1], grid.shape[2],
+    grid.shape[3])`` created from ``input``.
+    """
+    check_grid_sampler_common(input, grid)
+    check_grid_sampler_3d(input, grid, interpolation_mode)
+    batch, channels = input.shape[0], input.shape[1]
+    out_depth, out_height, out_width = grid.shape[1], grid.shape[2], grid.shape[3]
+    return input.new_empty((batch, channels, out_depth, out_height, out_width))
+
+
+@register_meta(aten.grid_sampler_3d_backward)
+@out_wrapper("grad_input", "grad_grid")
+def grid_sampler_3d_backward(
+    grad_output,
+    input,
+    grid,
+    interpolation_mode,
+    padding_mode,
+    align_corners,
+    output_mask,
+):
+    """Meta kernel for ``grid_sampler_3d_backward``.
+
+    Runs the common and 3D-specific checks, then returns
+    ``(grad_input, grad_grid)``. ``grad_input`` is ``zeros_like(input)`` when
+    ``output_mask[0]`` is truthy and ``None`` otherwise; ``grad_grid`` is
+    ``empty_like(grid)``. Both use ``torch.legacy_contiguous_format``.
+    """
+    check_grid_sampler_common(input, grid)
+    check_grid_sampler_3d(input, grid, interpolation_mode)
+    legacy_format = torch.legacy_contiguous_format
+    grad_input = (
+        torch.zeros_like(input, memory_format=legacy_format)
+        if output_mask[0]
+        else None
+    )
+    grad_grid = torch.empty_like(grid, memory_format=legacy_format)
+    return grad_input, grad_grid
+
+
+@register_meta([aten.full.default])
+def full(size, fill_value, *args, **kwargs):
+    """Meta kernel for ``full``.
+
+    When no truthy ``dtype`` keyword is supplied, the dtype is taken from
+    ``utils.get_dtype(fill_value)``. The result is ``torch.empty(size, ...)``
+    with the remaining arguments forwarded unchanged.
+    """
+    if not kwargs.get("dtype", None):
+        kwargs["dtype"] = utils.get_dtype(fill_value)
+    return torch.empty(size, *args, **kwargs)
+
+
+# zeros_like is special cased to work for sparse
+@register_meta(aten.zeros_like.default)
+def zeros_like(
+    self,
+    dtype=None,
+    layout=None,
+    device=None,
+    pin_memory=None,
+    memory_format=None,
+):
+    """Meta kernel for ``zeros_like`` with a dedicated sparse-COO path.
+
+    For any layout other than ``torch.sparse_coo`` the result is
+    ``aten.empty_like.default(...)`` filled with zero. For
+    ``torch.sparse_coo`` no ``memory_format`` may be given; an empty sparse
+    tensor is created, resized to ``self``'s size (keeping ``self``'s
+    sparse/dense dimension split when ``self`` is sparse, otherwise treating
+    every dimension as sparse) and marked coalesced.
+    """
+    if layout != torch.sparse_coo:
+        dense = aten.empty_like.default(
+            self,
+            dtype=dtype,
+            layout=layout,
+            device=device,
+            pin_memory=pin_memory,
+            memory_format=memory_format,
+        )
+        # device can be not "meta"
+        dense.fill_(0)
+        return dense
+
+    torch._check(
+        memory_format is None,
+        lambda: "memory format option is only supported by strided tensors",
+    )
+
+    sparse = torch.empty(
+        0,
+        dtype=self.dtype if dtype is None else dtype,
+        layout=layout,
+        device=self.device if device is None else device,
+        pin_memory=pin_memory,
+    )
+
+    if self.is_sparse:
+        sparse_dims, dense_dims = self.sparse_dim(), self.dense_dim()
+    else:
+        sparse_dims, dense_dims = self.dim(), 0
+    sparse.sparse_resize_and_clear_(self.size(), sparse_dims, dense_dims)
+
+    sparse._coalesced_(True)
+    return sparse
+
+
+@register_meta(aten.select.int)
+def meta_select(self, dim, index):
+    """Meta kernel for ``select.int``.
+
+    Rejects 0-dim tensors and out-of-range indices via ``torch._check_index``.
+    Negative ``dim`` and ``index`` are wrapped. The result is an
+    ``as_strided`` view of ``self`` with dimension ``dim`` removed and the
+    storage offset advanced by ``index`` times that dimension's stride.
+    """
+    ndim = self.dim()
+    torch._check_index(
+        ndim != 0,
+        lambda: "select() cannot be applied to a 0-dim tensor.",
+    )
+
+    if dim < 0:
+        dim = dim + ndim
+    size = self.size(dim)
+
+    index_in_range = not (-index > size or index >= size)
+    torch._check_index(
+        index_in_range,
+        lambda: f"select(): index {index} out of range for tensor of size "
+        f"{self.size()} at dimension {dim}",
+    )
+
+    if index < 0:
+        index = index + size
+
+    sizes = list(self.size())
+    strides = list(self.stride())
+
+    # Compute the offset before the selected dimension's stride is dropped.
+    offset = self.storage_offset() + index * strides[dim]
+    del sizes[dim]
+    del strides[dim]
+
+    return self.as_strided(sizes, strides, offset)
+
+
+@register_meta(aten.select_scatter.default)
+def meta_select_scatter(self, src, dim, index):
+    """Meta kernel for ``select_scatter``: returns ``utils.clone_preserve_strides(self)``; the other arguments are not inspected."""
+    cloned = utils.clone_preserve_strides(self)
+    return cloned
+
+
+@register_meta(aten.slice_scatter.default)
+def meta_slice_scatter(self, src, dim=0, start=None, end=None, step=1):
+    """Meta kernel for ``slice_scatter``: returns ``utils.clone_preserve_strides(self)``; the other arguments are not inspected."""
+    cloned = utils.clone_preserve_strides(self)
+    return cloned
+
+
+# TODO: Deduplicate this with canonicalize_dim
+def maybe_wrap_dim(dim: int, dim_post_expr: int, wrap_scalar: bool = True):
+    # Convert a possibly negative `dim` into a non-negative index for a tensor
+    # with `dim_post_expr` dimensions. A non-positive `dim_post_expr` is treated
+    # as one dimension (only allowed when `wrap_scalar` is true). Out-of-range
+    # values trip an assertion.
+    if dim_post_expr <= 0:
+        assert wrap_scalar
+        dim_post_expr = 1
+    lower = -dim_post_expr
+    upper = dim_post_expr - 1
+    assert lower <= dim <= upper, f"dim {dim} out of bounds ({lower}, {upper})"
+    return dim + dim_post_expr if dim < 0 else dim
+
+
+def ensure_nonempty_size(t, dim):
+    # A 0-dim tensor is treated as having size 1 along every dimension.
+    if t.dim() == 0:
+        return 1
+    return t.shape[dim]
+
+
+# From aten/src/ATen/native/ScatterGatherChecks.h
+def gather_shape_check(self, dim, index):
+    # Validates that `index` has the same rank as `self` (0-dim counts as
+    # rank 1) and that, for every dimension other than `dim`, index is no
+    # larger than self.
+    rank_self = max(self.dim(), 1)
+    rank_index = max(index.dim(), 1)
+    torch._check(
+        rank_self == rank_index,
+        lambda: "Index tensor must have the same number of dimensions as input tensor",
+    )
+    for i in range(rank_self):
+        if i == dim:
+            # The gathered dimension is unconstrained.
+            continue
+        torch._check(
+            ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i),
+            lambda: f"Size does not match at dimension {i} expected index {index.shape}"
+            + f" to be smaller than self {self.shape} apart from dimension {dim}",
+        )
+
+
+@register_meta(aten.gather.default)
+def meta_gather(self, dim, index, sparse_grad=False):
+    # The output takes the shape of `index`. Dtype and shape validation is
+    # skipped entirely when `index` has no elements.
+    wrapped_dim = maybe_wrap_dim(dim, self.dim())
+    if index.numel() != 0:
+        torch._check(
+            index.dtype == torch.long,
+            lambda: f"gather(): Expected dtype int64 for index, but got {index.dtype}",
+        )
+        gather_shape_check(self, wrapped_dim, index)
+    result = self.new_empty(index.shape)
+    return result
+
+
+# From aten/src/ATen/native/TensorAdvancedIndexing.cpp
+def get_operator_enum(reduce_, use_new_options=False):
+    # Maps a user-facing reduce name to its internal operator name. Two
+    # vocabularies exist: the "new" one (sum/prod/mean/amax/amin) and the
+    # legacy one (add/multiply). Unknown names fail torch._check.
+    if use_new_options:
+        table = (
+            ("sum", "REDUCE_ADD"),
+            ("prod", "REDUCE_MULTIPLY"),
+            ("mean", "REDUCE_MEAN"),
+            ("amax", "REDUCE_MAXIMUM"),
+            ("amin", "REDUCE_MINIMUM"),
+        )
+        error = "reduce argument must be either sum, prod, mean, amax or amin."
+    else:
+        table = (
+            ("add", "REDUCE_ADD"),
+            ("multiply", "REDUCE_MULTIPLY"),
+        )
+        error = "reduce argument must be either add or multiply."
+
+    # Linear scan with == keeps the comparison semantics of an if/elif chain.
+    for name, operator_name in table:
+        if reduce_ == name:
+            return operator_name
+
+    torch._check(False, lambda: error)
+    return
+
+
+# From aten/src/ATen/native/ScatterGatherChecks.h
+def scatter_gather_dtype_check(method_name, self, index, src_opt=None):
+    # A non-empty index must be int64; an empty index is accepted with any
+    # dtype.
+    index_dtype_ok = index.numel() == 0 or index.dtype == torch.long
+    torch._check(
+        index_dtype_ok,
+        lambda: f"{method_name}(): Expected dtype int64 for index",
+    )
+
+    # With no source tensor there is nothing further to compare.
+    if src_opt is None:
+        return
+    torch._check(
+        self.dtype == src_opt.dtype,
+        lambda: f"{method_name}(): Expected self.dtype to be equal to src.dtype",
+    )
+
+
+def ensure_nonempty_dim(dim):
+    # Clamp a dimension count from below at 1.
+    return dim if dim > 1 else 1
+
+
+# From aten/src/ATen/native/ScatterGatherChecks.h
+def scatter_shape_check(self, dim, index, src_opt=None):
+    # Validates the shapes of a scatter call.
+    #
+    # Args:
+    #   self:    destination tensor.
+    #   dim:     already-wrapped (non-negative) scatter dimension.
+    #   index:   index tensor; an empty index skips every check.
+    #   src_opt: optional source tensor; when given, index must also fit
+    #            inside it along every dimension.
+    #
+    # Raises (via torch._check) when index and self differ in rank (0-dim
+    # counts as rank 1), when index exceeds self along any dimension other
+    # than `dim`, or when index exceeds src_opt along any dimension.
+    if index.numel() == 0:
+        return
+    torch._check(
+        ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()),
+        lambda: "Index tensor must have the same number of dimensions as self tensor",
+    )
+
+    is_wrong_shape = False
+    self_dims = ensure_nonempty_dim(self.dim())
+
+    # Check: index.size(d) <= self.size(d) for all d != dim
+    for d in range(self_dims):
+        index_d_size = ensure_nonempty_size(index, d)
+        if d == dim:
+            continue
+        if index_d_size > ensure_nonempty_size(self, d):
+            is_wrong_shape = True
+            break
+
+    # Check: index.size(d) <= src.size(d) for all d if src is Tensor
+    if not is_wrong_shape and src_opt is not None:
+        for d in range(self_dims):
+            index_d_size = ensure_nonempty_size(index, d)
+            if index_d_size > ensure_nonempty_size(src_opt, d):
+                is_wrong_shape = True
+                break
+
+    # The rank equality was already enforced above, so it is not re-checked
+    # here; only the error message depends on whether a source was supplied.
+    if src_opt is not None:
+        torch._check(
+            not is_wrong_shape,
+            lambda: f"Expected index {index.shape} to be smaller than self {self.shape}"
+            + f" apart from dimension {dim} and to be smaller than src {src_opt.shape}",
+        )
+    else:
+        torch._check(
+            not is_wrong_shape,
+            lambda: f"Expected index {index.shape} to be smaller than self {self.shape}"
+            + f" apart from dimension {dim}",
+        )
+
+
+# From aten/src/ATen/native/TensorAdvancedIndexing.cpp
+def scatter_meta_impl(self, dim, index, src=None, reduce_=None, use_new_options=False):
+    # Shared argument validation for the scatter family: wraps `dim`, checks
+    # dtypes, checks shapes, and finally validates the reduce name if one was
+    # given. Returns nothing; failures surface as exceptions.
+    canonical_dim = maybe_wrap_dim(dim, self.dim())
+    scatter_gather_dtype_check("scatter", self, index, src)
+    scatter_shape_check(self, canonical_dim, index, src)
+    if reduce_ is None:
+        return
+    # Called only for its validation side effect; the result is discarded.
+    get_operator_enum(reduce_, use_new_options)
+
+
+@register_meta(aten.scatter_add.default)
+def meta_scatter_add(self, dim, index, src):
+    # Validate the arguments, then allocate an output shaped like `self`.
+    scatter_meta_impl(self, dim, index, src=src, reduce_="add")
+    result = self.new_empty(self.shape)
+    return result
+
+
+@register_meta(aten.scatter_add_)
+def meta_scatter_add_(self, dim, index, src):
+    # In-place variant: validate the arguments and hand back `self`.
+    scatter_meta_impl(self, dim, index, src=src, reduce_="add")
+    return self
+
+
+@register_meta(
+    [
+        aten.scatter.src,
+        aten.scatter.value,
+        aten.scatter.reduce,
+        aten.scatter.value_reduce,
+    ]
+)
+@out_wrapper()
+def meta_scatter(self, dim, index, src_or_value, reduce=None):
+    # One implementation serves both the tensor-source and scalar-value
+    # overloads: a non-tensor `src_or_value` is validated as "no source".
+    src = None
+    if isinstance(src_or_value, torch.Tensor):
+        src = src_or_value
+    scatter_meta_impl(self, dim, index, src, reduce)
+    result = self.new_empty(self.shape)
+    return result
+
+
+@register_meta(
+    [
+        aten.scatter_.src,
+        aten.scatter_.value,
+        aten.scatter_.reduce,
+        aten.scatter_.value_reduce,
+    ]
+)
+def meta_scatter_(self, dim, index, src_or_value, reduce=None):
+    # In-place variant covering both tensor-source and scalar-value
+    # overloads: a non-tensor `src_or_value` is validated as "no source".
+    src = None
+    if isinstance(src_or_value, torch.Tensor):
+        src = src_or_value
+    scatter_meta_impl(self, dim, index, src, reduce)
+    return self
+
+
+# Meta kernel for the flash-attention SDPA backward: returns empty gradients
+# for query, key and value. Only query/key/value are read; the remaining
+# arguments are accepted to match the operator signature.
+@register_meta(
+    [
+        aten._scaled_dot_product_flash_attention_backward,
+    ]
+)
+def meta__scaled_dot_product_flash_backward(
+    grad_out: Tensor,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    out: Tensor,
+    logsumexp: Tensor,
+    cum_seq_q: Tensor,
+    cum_seq_k: Tensor,
+    max_q: int,
+    max_k: int,
+    dropout_p: float,
+    is_causal: bool,
+    philox_seed: Tensor,
+    philox_offset: Tensor,
+    scale: Optional[float] = None,
+):
+    # Each gradient is allocated like the input with dims 1 and 2 swapped and
+    # then swapped back, so it has the input's shape but takes its strides
+    # from the transposed allocation.
+    grad_q = torch.empty_like(query.transpose(1, 2)).transpose(1, 2)
+    grad_k = torch.empty_like(key.transpose(1, 2)).transpose(1, 2)
+    grad_v = torch.empty_like(value.transpose(1, 2)).transpose(1, 2)
+    return grad_q, grad_k, grad_v
+
+
+# Meta kernel for the CPU flash-attention SDPA forward: returns an empty
+# (attention, logsumexp) pair. Only `query` is read; its four sizes are
+# taken as (batch, heads, seq_len, head_dim).
+@register_meta(
+    [
+        aten._scaled_dot_product_flash_attention_for_cpu,
+    ]
+)
+def meta__scaled_dot_product_flash_attention_for_cpu(
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    attn_mask: Optional[Tensor] = None,
+    scale: Optional[float] = None,
+):
+    batch_size = query.size(0)
+    num_heads = query.size(1)
+    max_seqlen_batch_q = query.size(2)
+    head_dim = query.size(3)
+
+    # Allocated as (batch, seq, heads, head_dim) and transposed, so the result
+    # has shape (batch, heads, seq, head_dim) over that underlying layout.
+    attention = torch.empty(
+        (batch_size, max_seqlen_batch_q, num_heads, head_dim),
+        dtype=query.dtype,
+        device=query.device,
+    ).transpose(1, 2)
+    # Always float32; shape (batch, heads, seq) after the transpose.
+    logsumexp = torch.empty(
+        (
+            batch_size,
+            max_seqlen_batch_q,
+            num_heads,
+        ),
+        dtype=torch.float,
+        device=query.device,
+    ).transpose(1, 2)
+    return (
+        attention,
+        logsumexp,
+    )
+
+
+# Meta kernel for the CPU flash-attention SDPA backward: returns empty
+# gradients for query, key and value. Shapes come from query (batch, heads,
+# head_dim, query length) and key (key length); grad_v reuses the key length
+# and query's head_dim.
+@register_meta(
+    [
+        aten._scaled_dot_product_flash_attention_for_cpu_backward,
+    ]
+)
+def meta__scaled_dot_product_flash_attention_for_cpu_backward(
+    grad_out: Tensor,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    out: Tensor,
+    logsumexp: Tensor,
+    dropout_p: float,
+    is_causal: bool,
+    attn_mask: Optional[Tensor] = None,
+    scale: Optional[float] = None,
+):
+    # The CPU gradient layout differs from CUDA's,
+    # i.e. (batch_size, seq_len, num_heads, head_dim).
+    batch_size = query.size(0)
+    num_heads = query.size(1)
+    head_dim = query.size(3)
+    len_q = query.size(2)
+    len_k = key.size(2)
+
+    # Logical shape (batch, heads, len, head_dim) with physical dimension
+    # order (0, 2, 1, 3).
+    grad_q = torch.empty_permuted(
+        (batch_size, num_heads, len_q, head_dim),
+        (0, 2, 1, 3),
+        dtype=query.dtype,
+        device=query.device,
+    )
+    grad_k = torch.empty_permuted(
+        (batch_size, num_heads, len_k, head_dim),
+        (0, 2, 1, 3),
+        dtype=key.dtype,
+        device=key.device,
+    )
+    grad_v = torch.empty_permuted(
+        (batch_size, num_heads, len_k, head_dim),
+        (0, 2, 1, 3),
+        dtype=value.dtype,
+        device=value.device,
+    )
+
+    return grad_q, grad_k, grad_v
+
+
+# Meta kernel for the memory-efficient SDPA backward: returns empty gradients
+# for query, key, value and (optionally) the attention bias. grad_bias is
+# None unless a bias was given and grad_input_mask[3] is true.
+@register_meta(
+    [
+        aten._scaled_dot_product_efficient_attention_backward,
+    ]
+)
+def meta__scaled_dot_product_efficient_backward(
+    grad_out: Tensor,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    attn_bias: Optional[Tensor],
+    out: Tensor,
+    logsumexp: Tensor,
+    philox_seed: Tensor,
+    philox_offset: Tensor,
+    dropout_p: float,
+    grad_input_mask: List[bool],
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+):
+    batch_size = query.size(0)
+    num_heads = query.size(1)
+    max_q = query.size(2)
+    head_dim = query.size(3)
+    # value may have a different last dimension than query/key.
+    head_dim_v = value.size(3)
+
+    max_k = key.size(2)
+
+    # Logical shape (batch, heads, len, head_dim) with physical dimension
+    # order (0, 2, 1, 3).
+    grad_q = torch.empty_permuted(
+        (batch_size, num_heads, max_q, head_dim),
+        (0, 2, 1, 3),
+        dtype=query.dtype,
+        device=query.device,
+    )
+    grad_k = torch.empty_permuted(
+        (batch_size, num_heads, max_k, head_dim),
+        (0, 2, 1, 3),
+        dtype=key.dtype,
+        device=key.device,
+    )
+    grad_v = torch.empty_permuted(
+        (batch_size, num_heads, max_k, head_dim_v),
+        (0, 2, 1, 3),
+        dtype=value.dtype,
+        device=value.device,
+    )
+    grad_bias = None
+    if attn_bias is not None and grad_input_mask[3]:
+        # Allocate with the last dimension rounded up to a multiple of 16,
+        # then slice back to the original length so the result keeps the
+        # padded allocation's strides.
+        lastDim = attn_bias.size(-1)
+        lastDimAligned = lastDim if lastDim % 16 == 0 else lastDim + 16 - lastDim % 16
+        new_sizes = list(attn_bias.size())
+        new_sizes[-1] = lastDimAligned
+        grad_bias = torch.empty(
+            new_sizes, dtype=attn_bias.dtype, device=attn_bias.device
+        )
+        grad_bias = grad_bias[..., :lastDim]
+
+    return grad_q, grad_k, grad_v, grad_bias
+
+
+@register_meta(
+    [
+        aten._flash_attention_backward,
+    ]
+)
+def meta__flash_attention_backward(
+    grad_out: Tensor,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    out: Tensor,
+    logsumexp: Tensor,
+    cum_seq_q: Tensor,
+    cum_seq_k: Tensor,
+    max_q: int,
+    max_k: int,
+    dropout_p: float,
+    is_causal: bool,
+    philox_seed: Tensor,
+    philox_offset: Tensor,
+    scale: Optional[float] = None,
+):
+    # Each gradient mirrors the metadata of its corresponding input; the
+    # other arguments are part of the operator signature but unused here.
+    return (
+        torch.empty_like(query),
+        torch.empty_like(key),
+        torch.empty_like(value),
+    )
+
+
+# Meta kernel for aten._efficient_attention_backward: returns empty gradients
+# for query, key, value and bias. Unlike the SDPA wrapper, grad_bias is never
+# None: without a bias it is a 0-dim placeholder tensor.
+@register_meta(
+    [
+        aten._efficient_attention_backward,
+    ]
+)
+def meta__efficient_attention_backward(
+    grad_out: Tensor,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    bias: Optional[Tensor],
+    cu_seqlens_q: Optional[Tensor],
+    cu_seqlens_k: Optional[Tensor],
+    max_seqlen_q: int,
+    max_seqlen_k: int,
+    logsumexp: Tensor,
+    dropout_p: float,
+    philox_seed: Tensor,
+    philox_offset: Tensor,
+    custom_mask_type: int,
+    bias_requires_grad: bool,
+    scale: Optional[float] = None,
+    num_splits_key: Optional[int] = None,
+):
+    grad_query = torch.empty_like(query)
+    grad_key = torch.empty_like(key)
+    grad_value = torch.empty_like(value)
+
+    if bias is not None:
+        # Allocate with the last dimension rounded up to a multiple of 16,
+        # then slice back to the original length so the result keeps the
+        # padded allocation's strides.
+        # NOTE(review): `bias_requires_grad` is not consulted here — confirm
+        # that is intended.
+        lastDim = bias.size(-1)
+        lastDimAligned = lastDim if lastDim % 16 == 0 else lastDim + 16 - lastDim % 16
+        new_sizes = list(bias.size())
+        new_sizes[-1] = lastDimAligned
+        grad_bias = torch.empty(new_sizes, dtype=bias.dtype, device=bias.device)
+        grad_bias = grad_bias[..., :lastDim]
+    else:
+        # 0-dim placeholder with the default dtype on query's device.
+        grad_bias = torch.empty((), device=query.device)
+
+    return grad_query, grad_key, grad_value, grad_bias
+
+
+# Meta kernel for aten._scaled_mm.
+#
+# Validates the operands and returns a pair of empty tensors:
+#   * the product, shape (self.size(0), mat2.size(1)), dtype `out_dtype`
+#     (falling back to self.dtype), on self's device;
+#   * a 0-dim float32 tensor on self's device.
+#
+# Raises (via torch._check) when either operand is not 2-D, `self` is not
+# row-major, `mat2` is not column-major, self.size(1) or either dimension of
+# mat2 is not a multiple of 16, or either operand is not an fp8 dtype.
+# `bias`, the scale tensors and `use_fast_accum` are not inspected here.
+@register_meta([aten._scaled_mm.default])
+def meta_scaled_mm(
+    self: torch.Tensor,
+    mat2: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    out_dtype: Optional[torch.dtype] = None,
+    scale_a: Optional[torch.Tensor] = None,
+    scale_b: Optional[torch.Tensor] = None,
+    scale_result: Optional[torch.Tensor] = None,
+    use_fast_accum: bool = False,
+):
+    def is_row_major(stride):
+        return stride[0] > stride[1] and stride[1] == 1
+
+    def is_col_major(shape, stride):
+        return stride[0] == 1 and stride[1] == shape[0]
+
+    def is_fp8_type(dtype):
+        return dtype in (
+            torch.float8_e4m3fn,
+            torch.float8_e5m2,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2fnuz,
+        )
+
+    # The rank check must come first: the stride helpers index two entries.
+    torch._check(
+        self.dim() == 2 and mat2.dim() == 2,
+        lambda: f"Inputs must be 2D but got self.dim()={self.dim()} and mat2.dim()={mat2.dim()}",
+    )
+    torch._check(
+        is_row_major(self.stride()),
+        lambda: "self must be row_major",
+    )
+    torch._check(
+        is_col_major(mat2.shape, mat2.stride()),
+        lambda: "mat2 must be col_major",
+    )
+    # The message names size(1), which is the dimension actually checked.
+    torch._check(
+        self.size(1) % 16 == 0,
+        lambda: f"Expected self.size(1) to be divisible by 16, but got self.size(1)={self.size(1)}",
+    )
+    torch._check(
+        mat2.size(0) % 16 == 0 and mat2.size(1) % 16 == 0,
+        lambda: f"Expected both dimensions of mat2 to be divisible by 16 but got {mat2.shape}",
+    )
+    torch._check(
+        is_fp8_type(self.dtype) and is_fp8_type(mat2.dtype),
+        lambda: f"Expected both inputs to be fp8 types but got self.dtype={self.dtype} and mat2.dtype={mat2.dtype}",
+    )
+    _out_dtype = out_dtype if out_dtype is not None else self.dtype
+    return torch.empty(
+        self.size(0), mat2.size(1), dtype=_out_dtype, device=self.device
+    ), torch.empty((), dtype=torch.float32, device=self.device)
+
+
+@register_meta([aten.scatter_reduce.two, aten.scatter_reduce.two_out])
+@out_wrapper()
+def meta_scatter_reduce_two(self, dim, index, src, reduce, include_self=True):
+    # Uses the newer reduce vocabulary (sum/prod/mean/amax/amin);
+    # `include_self` does not influence validation or the output shape.
+    scatter_meta_impl(self, dim, index, src=src, reduce_=reduce, use_new_options=True)
+    result = self.new_empty(self.shape)
+    return result
+
+
+@register_meta(aten.scatter_reduce_.two)
+def meta_scatter_reduce__two(self, dim, index, src, reduce, include_self=True):
+    # In-place variant using the newer reduce vocabulary; validates and
+    # returns `self` unchanged.
+    scatter_meta_impl(self, dim, index, src=src, reduce_=reduce, use_new_options=True)
+    return self
+
+
+@register_meta([aten.multinomial.default, aten.multinomial.out])
+@out_wrapper()
+def meta_multinomial(input, num_samples, replacement=False, *, generator=None):
+    # Output is int64 on input's device: (num_samples,) for a 1-D input and
+    # (input.size(0), num_samples) for a 2-D input. Other ranks are rejected.
+    ndim = input.dim()
+    torch._check(
+        0 < ndim <= 2,
+        lambda: f"The probabilty distributions dimensions must be 1 or 2, but got {input.dim()}",
+    )
+    if ndim == 1:
+        out_shape = (num_samples,)
+    else:
+        out_shape = (input.size(0), num_samples)
+    return torch.empty(out_shape, dtype=torch.long, device=input.device)
+
+
+def multiply_integers(vs):
+    # Product of every element of `vs`; an empty iterable yields 1.
+    product = 1
+    for factor in vs:
+        product = product * factor
+    return product
+
+
+def upsample_common_check(input_size, output_size, num_spatial_dims):
+    # Validates upsample sizes and returns the full output shape
+    # (N, C, *output_size). `input_size` must have num_spatial_dims + 2
+    # entries, `output_size` must have num_spatial_dims entries, and all
+    # spatial extents on both sides must be positive.
+    torch._check(
+        len(output_size) == num_spatial_dims,
+        lambda: f"It is expected output_size equals to {num_spatial_dims}, but got size {len(output_size)}",
+    )
+    expected_input_dims = num_spatial_dims + 2  # N, C, ...
+    torch._check(
+        len(input_size) == expected_input_dims,
+        lambda: f"It is expected input_size equals to {expected_input_dims}, but got size {len(input_size)}",
+    )
+
+    spatial_in = input_size[2:]
+    torch._check(
+        all(s > 0 for s in spatial_in) and all(s > 0 for s in output_size),
+        lambda: f"Input and output sizes should be greater than 0, but got "
+        f"input size {input_size} and output size {output_size}",
+    )
+
+    batch, chans = input_size[0], input_size[1]
+    return (batch, chans, *output_size)
+
+
+# Meta kernel for 1-D nearest-neighbour upsampling (regular and "exact").
+# Returns an empty tensor of shape (N, C, *output_size) in the memory format
+# suggested for `input`. Raises (via torch._check) when the input is empty in
+# any non-batch dimension or the sizes fail upsample_common_check.
+# `scales` is not used for shape inference.
+@register_meta(
+    [aten.upsample_nearest1d.default, aten._upsample_nearest_exact1d.default]
+)
+def upsample_nearest1d(input, output_size, scales=None):
+    # Compare the product against 0 so the condition is always a boolean;
+    # a bare integer product is not a valid torch._check condition.
+    torch._check(
+        input.numel() != 0 or multiply_integers(input.size()[1:]) != 0,
+        lambda: f"Non-empty 3D data tensor expected but got a tensor with sizes {input.size()}",
+    )
+    full_output_size = upsample_common_check(
+        input.size(), output_size, num_spatial_dims=1
+    )
+    return input.new_empty(full_output_size).to(
+        memory_format=utils.suggest_memory_format(input)
+    )
+
+
+# Meta kernel for 2-D nearest-neighbour upsampling (regular and "exact").
+# Returns an empty tensor of shape (N, C, *output_size). The memory format is
+# the one suggested for `input`, except that CUDA inputs with fewer than 4
+# channels always produce a contiguous output. Raises (via torch._check) when
+# the input is empty in any non-batch dimension or the sizes fail
+# upsample_common_check. `scales_h` / `scales_w` are not used for shape
+# inference.
+@register_meta(
+    [aten.upsample_nearest2d.default, aten._upsample_nearest_exact2d.default]
+)
+def upsample_nearest2d(input, output_size, scales_h=None, scales_w=None):
+    # Compare the product against 0 so the condition is always a boolean;
+    # a bare integer product is not a valid torch._check condition.
+    torch._check(
+        input.numel() != 0 or multiply_integers(input.size()[1:]) != 0,
+        lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}",
+    )
+    full_output_size = upsample_common_check(
+        input.size(), output_size, num_spatial_dims=2
+    )
+    output = input.new_empty(full_output_size)
+
+    # convert output to correct memory format, if necessary
+    memory_format = utils.suggest_memory_format(input)
+
+    # following "heuristic: only use channels_last path when it's faster than the contiguous path"
+    _, n_channels, _, _ = input.shape
+    if input.device.type == "cuda" and n_channels < 4:
+        memory_format = torch.contiguous_format
+
+    output = output.contiguous(memory_format=memory_format)
+
+    return output
+
+
+# Meta kernel for the backward of 2-D nearest-neighbour upsampling (regular
+# and "exact"). Checks that grad_output is 4-D and matches the forward output
+# shape derived from input_size/output_size, then returns an empty gradient
+# of shape input_size in the memory format suggested for grad_output.
+@register_meta(
+    [
+        aten.upsample_nearest2d_backward.default,
+        aten._upsample_nearest_exact2d_backward.default,
+    ]
+)
+def upsample_nearest2d_backward(
+    grad_output: Tensor,
+    output_size: Sequence[Union[int, torch.SymInt]],
+    input_size: Sequence[Union[int, torch.SymInt]],
+    scales_h: Optional[float] = None,
+    scales_w: Optional[float] = None,
+):
+    # Shape the forward pass would have produced: (N, C, *output_size).
+    full_output_size = upsample_common_check(
+        input_size, output_size, num_spatial_dims=2
+    )
+    torch._check(
+        grad_output.ndim == 4,
+        lambda: f"Expected grad_output to be a tensor of dimension 4 but got: dimension {grad_output.ndim}",
+    )
+    # Compare dimension by dimension so the message can name the mismatch.
+    for i in range(4):
+        torch._check(
+            grad_output.size(i) == full_output_size[i],
+            lambda: (
+                f"Expected grad_output to have the same shape as output;"
+                f" output.size({i}) = {full_output_size[i]}"
+                f" but got grad_output.size({i}) = {grad_output.size(i)}"
+            ),
+        )
+
+    return grad_output.new_empty(input_size).to(
+        memory_format=utils.suggest_memory_format(grad_output)
+    )  # type: ignore[call-overload]
+
+
+# Meta kernel for 3-D nearest-neighbour upsampling (regular and "exact").
+# Returns an empty tensor of shape (N, C, *output_size) in the memory format
+# suggested for `input`. Raises (via torch._check) when the input is empty in
+# any non-batch dimension or the sizes fail upsample_common_check. The
+# `scales_*` arguments are not used for shape inference.
+@register_meta(
+    [aten.upsample_nearest3d.default, aten._upsample_nearest_exact3d.default]
+)
+def upsample_nearest3d(input, output_size, scales_d=None, scales_h=None, scales_w=None):
+    # Compare the product against 0 so the condition is always a boolean;
+    # a bare integer product is not a valid torch._check condition.
+    torch._check(
+        input.numel() != 0 or multiply_integers(input.size()[1:]) != 0,
+        lambda: f"Non-empty 5D data tensor expected but got a tensor with sizes {input.size()}",
+    )
+    full_output_size = upsample_common_check(
+        input.size(), output_size, num_spatial_dims=3
+    )
+    return input.new_empty(full_output_size).to(
+        memory_format=utils.suggest_memory_format(input)
+    )
+
+
+# Meta kernel for the aten.sort overloads. Returns (values, indices): values
+# mirrors `self`, indices mirrors `self` with dtype int64. When both out
+# tensors are supplied they are resized/restrided to match and returned
+# instead. `stable`, `dim` and `descending` do not affect the result metadata.
+@register_meta(
+    [
+        aten.sort.default,
+        aten.sort.stable,
+        aten.sort.values,
+        aten.sort.values_stable,
+    ]
+)
+def meta_sort(self, stable=None, dim=-1, descending=False, values=None, indices=None):
+    v, i = torch.empty_like(self), torch.empty_like(self, dtype=torch.int64)
+    # The out-variant path is taken only when BOTH out tensors are provided.
+    if values is not None and indices is not None:
+        assert isinstance(values, TensorLike)
+        assert isinstance(indices, TensorLike)
+        # Makes sure values and indices have the same strides. For cases where
+        # these have different shapes, like (5, 10, 5) and (0) in msort.
+        out_shape = v.shape
+        out_stride = v.stride()
+        values = _maybe_resize_out(values, out_shape)
+        indices = _maybe_resize_out(indices, out_shape)
+        # Restride in place after resizing so both outputs share v's layout.
+        values.as_strided_(out_shape, out_stride)
+        indices.as_strided_(out_shape, out_stride)
+        _safe_copy_out(copy_from=v, copy_to=values)  # type: ignore[arg-type]
+        _safe_copy_out(copy_from=i, copy_to=indices)  # type: ignore[arg-type]
+        return values, indices
+    return v, i
+
+
+@register_meta(aten.argsort.stable)
+def meta_argsort(self, *, stable, dim=-1, descending=False):
+    # argsort is the index half of sort; the values half is discarded.
+    _values, sorted_indices = meta_sort(
+        self, stable=stable, dim=dim, descending=descending
+    )
+    return sorted_indices
+
+
+def rnn_cell_checkSizes(
+    input_gates, hidden_gates, input_bias, hidden_bias, factor, prev_hidden
+):
+    # Validates the operands of a fused RNN cell.
+    #
+    # Args:
+    #   input_gates, hidden_gates: 2-D tensors of identical shape.
+    #   input_bias, hidden_bias:   optional 1-D biases; when input_bias is
+    #                              given it must have gates_size elements and
+    #                              the same shape as hidden_bias.
+    #   factor:                    divisor applied to the gate element count
+    #                              to obtain the expected hidden element count.
+    #   prev_hidden:               2-D tensor with
+    #                              input_gates.size(0) * gates_size // factor
+    #                              elements.
+    #
+    # Raises (via torch._check) on any violated constraint, including tensors
+    # living on different devices. Biases that are None are skipped by the
+    # device check instead of failing with AttributeError.
+    torch._check(input_gates.ndim == 2, lambda: f"{input_gates.ndim} != 2")
+    torch._check(
+        input_gates.shape == hidden_gates.shape,
+        lambda: f"{input_gates.shape} != {hidden_gates.shape}",
+    )
+    gates_size = input_gates.size(1)
+    if input_bias is not None:
+        torch._check(input_bias.ndim == 1, lambda: f"{input_bias.ndim} != 1")
+        torch._check(
+            input_bias.numel() == gates_size,
+            lambda: f"{input_bias.numel()} != {gates_size}",
+        )
+        torch._check(
+            input_bias.shape == hidden_bias.shape,
+            lambda: f"{input_bias.shape} != {hidden_bias.shape}",
+        )
+    torch._check(prev_hidden.ndim == 2, lambda: f"{prev_hidden.ndim} != 2")
+    expected_prev_hidden_numel = input_gates.size(0) * gates_size // factor
+    torch._check(
+        prev_hidden.numel() == expected_prev_hidden_numel,
+        lambda: f"{prev_hidden.numel()} != {input_gates.size(0)} * {gates_size} // {factor} (aka {expected_prev_hidden_numel})",
+    )
+    # Only tensors that were actually supplied take part in the device check.
+    torch._check(
+        all(
+            x.device == input_gates.device
+            for x in [hidden_gates, input_bias, hidden_bias, prev_hidden]
+            if x is not None
+        ),
+        lambda: "expected all inputs to be same device",
+    )
+
+
+@register_meta(aten._thnn_fused_lstm_cell.default)
+def _thnn_fused_lstm_cell_meta(
+    input_gates, hidden_gates, cx, input_bias=None, hidden_bias=None
+):
+    # Validate with a gate factor of 4, then allocate contiguous outputs:
+    # a workspace shaped like the gates and hy/cy shaped like cx.
+    rnn_cell_checkSizes(input_gates, hidden_gates, input_bias, hidden_bias, 4, cx)
+    contiguous = torch.contiguous_format
+    workspace = torch.empty_like(input_gates, memory_format=contiguous)
+    hy = torch.empty_like(cx, memory_format=contiguous)
+    cy = torch.empty_like(cx, memory_format=contiguous)
+    return (hy, cy, workspace)
+
+
+# Meta kernel for aten._cudnn_rnn. Computes the shapes of
+# (output, hy, cy, reserve, weight_buf) from the input layout and the RNN
+# hyper-parameters; weight_buf is passed through unchanged.
+@register_meta(aten._cudnn_rnn.default)
+def _cudnn_rnn(
+    input,
+    weight,
+    weight_stride0,
+    weight_buf,
+    hx,
+    cx,
+    mode,
+    hidden_size,
+    proj_size,
+    num_layers,
+    batch_first,
+    dropout,
+    train,
+    bidirectional,
+    batch_sizes,
+    dropout_state,
+):
+    # A non-empty batch_sizes list marks the input as packed.
+    is_input_packed = len(batch_sizes) != 0
+    if is_input_packed:
+        seq_length = len(batch_sizes)
+        mini_batch = batch_sizes[0]
+        batch_sizes_sum = input.shape[0]
+    else:
+        seq_length = input.shape[1] if batch_first else input.shape[0]
+        mini_batch = input.shape[0] if batch_first else input.shape[1]
+        # Sentinel; only read on the packed path.
+        batch_sizes_sum = -1
+
+    num_directions = 2 if bidirectional else 1
+    # A non-zero projection size replaces hidden_size as the output width.
+    out_size = proj_size if proj_size != 0 else hidden_size
+    if is_input_packed:
+        out_shape = [batch_sizes_sum, out_size * num_directions]
+    else:
+        out_shape = (
+            [mini_batch, seq_length, out_size * num_directions]
+            if batch_first
+            else [seq_length, mini_batch, out_size * num_directions]
+        )
+    output = input.new_empty(out_shape)
+
+    # The cell state keeps hidden_size even when a projection is used.
+    cell_shape = [num_layers * num_directions, mini_batch, hidden_size]
+    if cx is None:
+        # No cell state given: return an empty placeholder tensor.
+        cy = torch.empty(0, device=input.device)
+    else:
+        cy = cx.new_empty(cell_shape)
+
+    hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size])
+
+    # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python)
+    # Both branches are 0 for now, so the reserve buffer is always empty.
+    reserve_shape = 0 if train else 0
+    reserve = input.new_empty(reserve_shape, dtype=torch.uint8)
+
+    return output, hy, cy, reserve, weight_buf
+
+
+@register_meta(aten.mkldnn_rnn_layer.default)
+def mkldnn_rnn_layer(
+    input,
+    w0,
+    w1,
+    w2,
+    w3,
+    hx_,
+    cx_,
+    reverse,
+    batch_sizes,
+    mode,
+    hidden_size,
+    num_layers,
+    has_biases,
+    bidirectional,
+    batch_first,
+    train,
+):
+    # Returns (output, hy, cy, workspace). The output keeps input's first two
+    # extents and uses hidden_size as its last; hy/cy mirror the provided
+    # states or are empty placeholders; the workspace is an empty uint8 tensor.
+    output_channels = hidden_size
+    if batch_first:
+        mini_batch, seq_length = input.shape[0], input.shape[1]
+        out_shape = [mini_batch, seq_length, output_channels]
+    else:
+        seq_length, mini_batch = input.shape[0], input.shape[1]
+        out_shape = [seq_length, mini_batch, output_channels]
+    output = input.new_empty(out_shape)
+
+    hy = (
+        torch.empty(0, device=input.device)
+        if hx_ is None
+        else hx_.new_empty(hx_.shape)
+    )
+    cy = (
+        torch.empty(0, device=input.device)
+        if cx_ is None
+        else cx_.new_empty(cx_.shape)
+    )
+    workspace = torch.empty(0, device=input.device, dtype=torch.uint8)
+    return output, hy, cy, workspace
+
+
+def zero_numel_check_dims(self, dim, fn_name):
+    # For tensors with at least one dimension, the reduced dimension must be
+    # non-empty. For 0-dim tensors, only dim 0 or -1 is accepted.
+    if self.ndim != 0:
+        torch._check_index(
+            self.size(dim) != 0,
+            lambda: f"{fn_name}: Expected reduction dim {dim} to have non-zero size.",
+        )
+        return
+    torch._check_index(
+        dim == 0 or dim == -1,
+        lambda: f"{fn_name}: Expected reduction dim -1 or 0 for scalar but got {dim}",
+    )
+
+
+# From aten/src/ATen/native/ReduceOps.cpp
+def check_argmax_argmin(name, self, dim):
+    # Without a dim the whole tensor is reduced, so it must be non-empty.
+    # With a dim, the wrapped dimension must itself be non-empty.
+    if dim is None:
+        torch._check(
+            self.numel() != 0,
+            lambda: f"{name}: Expected reduction dim to be specified for input.numel() == 0.",
+        )
+        return
+    wrapped = maybe_wrap_dim(dim, self.dim())
+    zero_numel_check_dims(self, wrapped, name)
+
+
+# Meta kernel shared by aten.argmax and aten.argmin: validates the reduction
+# and returns an empty int64 tensor with the reduced shape.
+@register_meta([aten.argmax.default, aten.argmin.default])
+def argmax_argmin_meta(self, dim=None, keepdim=False):
+    # NOTE(review): the name "argmax" is hard-coded, so validation errors for
+    # argmin are also reported as "argmax: ...".
+    check_argmax_argmin("argmax", self, dim)
+    # dim=None is forwarded as None rather than as a tuple.
+    dims = utils.reduction_dims(self.shape, (dim,) if dim is not None else None)
+    shape = _compute_reduction_shape(self, dims, keepdim)
+    return self.new_empty(shape, dtype=torch.int64)
+
+
+@register_meta(aten.scalar_tensor.default)
+def scalar_tensor(s, dtype=None, layout=None, device=None, pin_memory=None):
+    # The scalar value `s` is irrelevant for metadata; only the tensor
+    # options shape the 0-dim result.
+    options = dict(dtype=dtype, layout=layout, device=device, pin_memory=pin_memory)
+    return torch.empty((), **options)
+
+
+@register_meta(aten.topk.default)
+def topk_meta(self, k, dim=-1, largest=True, sorted=True):
+    # From aten/src/ATen/native/Sorting.cpp
+    # Returns empty (values, indices): both shaped like `self` with the
+    # extent along `dim` replaced by k; indices are int64. A 0-dim input is
+    # treated as a slice of length 1 and keeps its empty shape.
+    dim = maybe_wrap_dim(dim, self.dim(), wrap_scalar=True)
+    sliceSize = self.size(dim) if self.dim() > 0 else 1
+    k_in_range = k >= 0 and k <= sliceSize
+    torch._check(k_in_range, lambda: "selected index k out of range")
+    torch._check(k_in_range, lambda: "k not in range for dimension")
+
+    topKSize = list(self.shape)
+    if topKSize:
+        topKSize[dim] = k
+    values = self.new_empty(topKSize)
+    indices = self.new_empty(topKSize, dtype=torch.int64)
+    return values, indices
+
+
+# Alias for torch.contiguous_format, used when allocating the outputs of the
+# fused LSTM cell backward meta kernel.
+legacy_contiguous_memory_format = torch.contiguous_format
+
+
+# From aten/src/ATen/native/cuda/RNN.cu
+def checkLSTMBackwardSizes(grad_hy, grad_cy, cx, cy, workspace):
+    # The first available gradient (grad_hy, else grad_cy) defines the
+    # expected 2-D size. Every provided gradient and both cell states must
+    # match it, and the 2-D workspace must hold four times as many elements.
+    # All failures are raised with an empty message.
+    defined_grad = grad_cy if grad_hy is None else grad_hy
+    torch._check(defined_grad.dim() == 2, lambda: "")
+    exp_size = defined_grad.size()
+    for grad in (grad_hy, grad_cy):
+        if grad is not None:
+            torch._check(grad.size() == exp_size, lambda: "")
+    for state in (cx, cy):
+        torch._check(state.size() == exp_size, lambda: "")
+    torch._check(workspace.dim() == 2, lambda: "")
+    torch._check(workspace.numel() == exp_size[0] * exp_size[1] * 4, lambda: "")
+
+
+# From aten/src/ATen/native/cuda/RNN.cu
+@register_meta(aten._thnn_fused_lstm_cell_backward_impl.default)
+def _thnn_fused_lstm_cell_backward_impl(grad_hy, grad_cy, cx, cy, workspace, has_bias):
+    # With no incoming gradient at all there is nothing to compute.
+    if all(grad is None for grad in (grad_hy, grad_cy)):
+        return None, None, None
+    checkLSTMBackwardSizes(grad_hy, grad_cy, cx, cy, workspace)
+    fmt = legacy_contiguous_memory_format
+    grad_gates = torch.empty_like(workspace, memory_format=fmt)
+    grad_cx = torch.empty_like(cx, memory_format=fmt)
+    # The bias gradient is the gate gradient reduced over dim 0.
+    grad_bias = None
+    if has_bias:
+        grad_bias = grad_gates.sum(0, keepdim=False)
+    return grad_gates, grad_cx, grad_bias
+
+
+# From aten/src/ATen/native/mps/operations/Linear.mm
+@register_meta(aten.linear_backward.default)
+def linear_backward(input_, grad_output_, weight_, output_mask):
+    # output_mask selects which gradients to produce. The weight and bias
+    # gradients are always produced together if either one is requested.
+    grad_input = grad_output_.new_empty(input_.size()) if output_mask[0] else None
+    grad_weight = None
+    grad_bias = None
+    if output_mask[1] or output_mask[2]:
+        grad_weight = grad_output_.new_empty((grad_output_.size(-1), input_.size(-1)))
+        grad_bias = grad_output_.new_empty(grad_output_.size(-1))
+    return (grad_input, grad_weight, grad_bias)
+
+
+# Meta kernel for aten.pixel_shuffle: the channel dimension (third from last)
+# shrinks by upscale_factor**2 while the last two dimensions each grow by
+# upscale_factor. Asserts that the input has more than two dimensions and a
+# channel count divisible by upscale_factor**2.
+@register_meta(aten.pixel_shuffle.default)
+def meta_pixel_shuffle(self, upscale_factor):
+    assert (
+        len(self.shape) > 2 and self.shape[-3] % (upscale_factor * upscale_factor) == 0
+    ), f"Invalid input shape for pixel_shuffle: {self.shape} with upscale_factor = {upscale_factor}"
+
+    def is_channels_last(ten):
+        return torch._prims_common.suggest_memory_format(ten) == torch.channels_last
+
+    def pick_memory_format():
+        # channels_last inputs stay channels_last except on CUDA, where the
+        # output is contiguous.
+        # NOTE(review): if none of the branches match, this returns None
+        # implicitly — confirm callers can never reach that case.
+        if is_channels_last(self):
+            if device_hint(self) == "cuda":
+                return torch.contiguous_format
+            else:
+                return torch.channels_last
+        elif self.is_contiguous(memory_format=torch.contiguous_format):
+            return torch.contiguous_format
+        elif self.is_contiguous(memory_format=torch.preserve_format):
+            return torch.preserve_format
+
+    C = self.shape[-3] // (upscale_factor * upscale_factor)
+    Hr = self.shape[-2] * upscale_factor
+    Wr = self.shape[-1] * upscale_factor
+    # Leading (batch-like) dimensions are carried over unchanged.
+    out_shape = (*self.shape[:-3], C, Hr, Wr)
+
+    out = self.new_empty(out_shape)
+    out = out.to(memory_format=pick_memory_format())  # type: ignore[call-overload]
+    return out
+
+
+# Meta kernel for aten.mkldnn_rnn_layer_backward. Returns seven empty
+# gradients shaped like input, weight0, weight1, weight2 (twice), hx_ and
+# cx_tmp respectively; the remaining arguments are not inspected.
+@register_meta(aten.mkldnn_rnn_layer_backward.default)
+def mkldnn_rnn_layer_backward(
+    input,
+    weight0,
+    weight1,
+    weight2,
+    weight3,
+    hx_,
+    cx_tmp,
+    output,
+    hy_,
+    cy_,
+    grad_output_r_opt,
+    grad_hy_r_opt,
+    grad_cy_r_opt,
+    reverse,
+    mode,
+    hidden_size,
+    num_layers,
+    has_biases,
+    train,
+    bidirectional,
+    batch_sizes,
+    batch_first,
+    workspace,
+):
+    diff_x = input.new_empty(input.shape)
+    diff_hx = hx_.new_empty(hx_.shape)
+    diff_cx = cx_tmp.new_empty(cx_tmp.shape)
+    diff_w1 = weight0.new_empty(weight0.shape)
+    diff_w2 = weight1.new_empty(weight1.shape)
+    diff_b = weight2.new_empty(weight2.shape)
+    # NOTE(review): the same diff_b object fills both bias-gradient slots and
+    # weight3 is never read — presumably both biases share weight2's shape;
+    # confirm against the eager kernel.
+    return diff_x, diff_w1, diff_w2, diff_b, diff_b, diff_hx, diff_cx
+
+
+@register_meta([aten.bucketize.Tensor, aten.bucketize.Tensor_out])
+@out_wrapper()
+def meta_bucketize(self, boundaries, *, out_int32=False, right=False):
+    # Result mirrors `self` (made contiguous) with an int32 or int64 index dtype.
+    index_dtype = torch.int32 if out_int32 else torch.int64
+    return torch.empty_like(self, dtype=index_dtype).contiguous()
+
+
+@register_meta(
+    [aten._upsample_bilinear2d_aa.default, aten._upsample_bicubic2d_aa.default]
+)
+def meta_upsample_bimode2d_aa(
+    input, output_size, align_corners, scales_h=None, scales_w=None
+):
+    # Shared meta kernel for the anti-aliased bilinear and bicubic 2D upsample ops.
+    in_sizes = input.size()
+    full_output_size = upsample_common_check(in_sizes, output_size, num_spatial_dims=2)
+    torch._check(
+        input.numel() != 0 or all(size > 0 for size in in_sizes[1:]),
+        lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}",
+    )
+    # The empty output is converted to the memory format suggested for the input.
+    out = input.new_empty(full_output_size)
+    return out.to(memory_format=utils.suggest_memory_format(input))
+
+
+# From aten/src/ATen/native/cuda/AmpKernels.cu
+@register_meta(aten._amp_foreach_non_finite_check_and_unscale_.default)
+def _amp_foreach_non_finite_check_and_unscale_(self, found_inf, inv_scale):
+    # Validation-only meta kernel: nothing is allocated and the result is None.
+    # Conditions are wrapped in lambdas so each is evaluated only after the
+    # previous check passed, in the order listed.
+    checks = (
+        (lambda: found_inf.numel() == 1, "found_inf must be a 1-element tensor."),
+        (lambda: inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."),
+        (
+            lambda: found_inf.dtype.is_floating_point,
+            "found_inf must be a float tensor.",
+        ),
+        (lambda: inv_scale.dtype.is_floating_point, "inv_scale must be a float tensor."),
+    )
+    for condition, message in checks:
+        torch._check(condition(), lambda message=message: message)
+
+
+# From aten/src/ATen/native/UnaryOps.cpp
+@register_meta([aten.nan_to_num.default, aten.nan_to_num.out])
+@out_wrapper()
+def nan_to_num(self, nan=None, posinf=None, neginf=None):
+    # Shape-only meta kernel: nan/posinf/neginf are accepted but never read.
+    return self.new_empty(list(self.size()))
+
+
+@register_meta(torch.ops.aten.transpose_)
+def transpose_(self, dim0, dim1):
+    # In-place transpose for meta tensors: swaps two entries of size and stride.
+    unsupported_layouts = {
+        torch.sparse_csr,
+        torch.sparse_csc,
+        torch.sparse_bsr,
+        torch.sparse_bsc,
+    }
+    assert (
+        self.layout not in unsupported_layouts
+    ), f"torch.transpose_: in-place transposition is not supported for {self.layout} layout"
+
+    rank = self.ndim
+    first = maybe_wrap_dim(dim0, rank)
+    second = maybe_wrap_dim(dim1, rank)
+
+    # Same dimension twice: nothing to swap, so the metadata is left untouched.
+    if first != second:
+        sizes = list(self.size())
+        strides = list(self.stride())
+        sizes[first], sizes[second] = sizes[second], sizes[first]
+        strides[first], strides[second] = strides[second], strides[first]
+        self.as_strided_(sizes, strides)
+    return self
+
+
+@register_meta(torch.ops.aten.t_)
+def t_(self):
+    # In-place 2-D transpose; tensors with fewer than 2 dims go through a 0<->0 swap.
+    ndims = self.ndim
+    if not self.is_sparse:
+        assert (
+            self.dim() <= 2
+        ), f"t_ expects a tensor with <= 2 dimensions, but self is {ndims}D"
+    else:
+        sparse_dim, dense_dim = self.sparse_dim(), self.dense_dim()
+        assert (
+            sparse_dim <= 2 and dense_dim == 0
+        ), f"t_ expects a tensor with <= 2 sparse and 0 dense dimensions, but got {sparse_dim} sparse and {dense_dim} dense dimensions"  # noqa: B950
+
+    other_dim = 1 if ndims >= 2 else 0
+    return transpose_(self, 0, other_dim)
+
+
+@register_meta(aten.searchsorted)
+@out_wrapper()
+def meta_searchsorted(
+    sorted_sequence, self, *, out_int32=False, right=False, side=None, sorter=None
+):
+    # Tensor input -> contiguous index tensor shaped like it; anything else -> 0-dim result.
+    index_dtype = torch.int64 if not out_int32 else torch.int32
+    if not isinstance(self, torch.Tensor):  # Scalar
+        return torch.empty((), dtype=index_dtype, device=sorted_sequence.device)
+    return torch.empty_like(self, dtype=index_dtype).contiguous()
+
+
+def _check_for_unsupported_isin_dtype(dtype):
+    # Rejects, via torch._check, any dtype found in the list below.
+    unsupported = [torch.bool, torch.bfloat16, torch.complex128, torch.complex64]
+    is_supported = dtype not in unsupported
+    torch._check(is_supported, lambda: f"Unsupported input type encountered for isin(): {dtype}")
+
+
+@register_meta(aten.isin)
+@out_wrapper()
+def meta_isin(elements, test_elements, *, assume_unique=False, invert=False):
+    # Meta kernel for isin: the result is a bool tensor shaped like `elements`.
+    torch._check(
+        isinstance(test_elements, Tensor) or isinstance(elements, Tensor),
+        lambda: "At least one of elements and test_elements must be a Tensor.",
+    )
+    # A non-tensor operand is wrapped as a tensor on the other operand's device.
+    if not isinstance(elements, Tensor):
+        elements = torch.tensor(elements, device=test_elements.device)
+    if not isinstance(test_elements, Tensor):
+        test_elements = torch.tensor(test_elements, device=elements.device)
+    for operand in (elements, test_elements):
+        _check_for_unsupported_isin_dtype(operand.dtype)
+    return torch.empty_like(elements, dtype=torch.bool)
+
+
+@register_meta(aten.polygamma)
+@out_wrapper()
+def meta_polygamma(n: int, self: Tensor) -> Tensor:
+    # Negative orders are rejected; the output dtype is the second value returned
+    # by elementwise_dtypes under INT_TO_FLOAT promotion.
+    torch._check(n >= 0, lambda: "polygamma(n, x) does not support negative n.")
+    promotion = ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
+    _unused, out_dtype = elementwise_dtypes(self, type_promotion_kind=promotion)
+    return torch.empty_like(self, dtype=out_dtype)
+
+
+def _create_unary_float_meta_func(func):
+    # Builds, registers and returns a one-input meta kernel for `func`.
+    def _f(x):
+        promotion = ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
+        return elementwise_meta(x, type_promotion=promotion)
+
+    # Same wrapping order as stacked decorators: out_wrapper first, then register_meta.
+    _f = register_meta(func)(out_wrapper()(_f))
+    return _f
+
+
+def _create_binary_float_meta_func(func):
+    # Builds, registers and returns a two-input meta kernel for `func`.
+    def _f(x, y):
+        promotion = ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
+        return elementwise_meta(x, y, type_promotion=promotion)
+
+    # Same wrapping order as stacked decorators: out_wrapper first, then register_meta.
+    _f = register_meta(func)(out_wrapper()(_f))
+    return _f
+
+
+_create_unary_float_meta_func(aten.special_airy_ai)  # one-input special ops start here
+_create_unary_float_meta_func(aten.special_bessel_y0)
+_create_unary_float_meta_func(aten.special_bessel_y1)
+_create_unary_float_meta_func(aten.special_modified_bessel_i0)
+_create_unary_float_meta_func(aten.special_modified_bessel_i1)
+_create_unary_float_meta_func(aten.special_modified_bessel_k0)
+_create_unary_float_meta_func(aten.special_modified_bessel_k1)
+_create_unary_float_meta_func(aten.special_scaled_modified_bessel_k0)
+_create_unary_float_meta_func(aten.special_scaled_modified_bessel_k1)
+
+# Two-input special ops; each call registers a meta kernel built by the factory.
+_create_binary_float_meta_func(aten.special_chebyshev_polynomial_t)
+_create_binary_float_meta_func(aten.special_chebyshev_polynomial_u)
+_create_binary_float_meta_func(aten.special_chebyshev_polynomial_v)
+_create_binary_float_meta_func(aten.special_chebyshev_polynomial_w)
+_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_t)
+_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_u)
+_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_v)
+_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_w)
+_create_binary_float_meta_func(aten.special_hermite_polynomial_h)
+_create_binary_float_meta_func(aten.special_hermite_polynomial_he)
+_create_binary_float_meta_func(aten.special_laguerre_polynomial_l)
+_create_binary_float_meta_func(aten.special_legendre_polynomial_p)
+
+
+# We must also trigger meta registrations from PrimTorch ref
+# decompositions
+import torch._refs
+import torch._refs.nn.functional
+import torch._refs.special
+
+
+def activate_meta():
+    # Installs one Meta implementation per operator found in the decomposition tables.
+    chosen = {}
+    # For a given op, we pick the most specific decomp function from
+    # global_decomp_table in the precedence order of meta > post_autograd > pre_autograd
+    for table_kind in ["meta", "post_autograd", "pre_autograd"]:
+        registry = global_decomposition_table[table_kind]
+        for candidate in registry:
+            chosen.setdefault(candidate, registry[candidate])
+
+    # These get the py_impl registration below but no library-level registration.
+    library_skip_list = {
+        "aten::empty_strided",  # causing infinite recursion, test_meta.py
+        "aten::clone",  # causing infinite recursion
+        "aten::_to_copy",  # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite  # noqa: B950
+        "aten::copy_",  # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
+        "aten::constant_pad_nd",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32  # noqa: B950
+        "aten::rot90",  # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32  # noqa: B950
+        "aten::as_strided_scatter",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_no_amp_as_strided_scatter_cuda_float32  # noqa: B950
+    }
+
+    for op_overload, fn in chosen.items():
+        # Don't register meta for HigherOrderOp's decomp.
+        # We can reconsider this in the future, but in general,
+        # the way you do a meta for a HigherOrderOp is different from
+        # OpOverload.
+        if isinstance(op_overload, torch._ops.HigherOrderOperator):
+            continue
+        assert isinstance(op_overload, OpOverload)
+
+        op_overload.py_impl(torch._C.DispatchKey.Meta)(fn)
+        op_name = op_overload.name()
+        if torch._C._dispatch_has_kernel_for_dispatch_key(
+            op_name, "CompositeImplicitAutograd"
+        ):
+            # Internally, we shouldn't be registering meta kernels for any operators that
+            # have CompositeImplicitAutograd kernels.
+            # Instead, we should be letting those decompositions run, and writing meta kernels
+            # only for the base operators.
+            if op_overload in global_decomposition_table["meta"]:
+                raise RuntimeError(
+                    f"{op_overload} is a CompositeImplicitAutograd op, we shouldn't "
+                    "register meta function for it. Instead, we should let the decomposition run and write "
+                    "meta kernels for the base operators."
+                )
+            continue
+        if op_overload.is_view:
+            # Attempting to register a python meta kernel for a view operator.
+            # We shouldn't do this, because the output will report as not having aliased storages.
+            # All view ops have meta kernels in C++ today, so we should use those instead.
+            continue
+        if op_name in library_skip_list:
+            continue
+
+        if "mkldnn::" in op_name:
+            meta_lib = _meta_lib_dont_use_me_use_register_meta_for_mkldnn
+        elif "mkl::" in op_name:
+            meta_lib = _meta_lib_dont_use_me_use_register_meta_for_mkl
+        elif "onednn::" in op_name:
+            meta_lib = _meta_lib_dont_use_me_use_register_meta_for_onednn
+        elif "quantized::" in op_name:
+            meta_lib = _meta_lib_dont_use_me_use_register_meta_for_quantized
+        else:
+            meta_lib = _meta_lib_dont_use_me_use_register_meta
+        meta_lib.impl(op_overload, fn)
+
+
+activate_meta()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_namedtensor_internals.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_namedtensor_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbc9de2de091d03295a0aca0011036e0a67a97eb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_namedtensor_internals.py
@@ -0,0 +1,157 @@
+from collections import OrderedDict
+
+"""
+This file contains helper functions that implement experimental functionality
+for named tensors in python. All of these are experimental, unstable, and
+subject to change or deletion.
+"""
+
+
+def check_serializing_named_tensor(tensor):
+    # Serialization guard: a tensor that reports names is rejected with RuntimeError.
+    if not tensor.has_names():
+        return
+    raise RuntimeError("NYI: Named tensors don't support serialization. Please drop "
+                       "names via `tensor = tensor.rename(None)` before serialization.")
+
+
+def build_dim_map(tensor):
+    """Returns an OrderedDict with one entry per dim of `tensor`, in dim order:
+    a named dim maps its name to that same name, an unnamed dim maps its index
+    to None."""
+    entries = ((name if name is not None else idx, name) for idx, name in enumerate(tensor.names))
+    return OrderedDict(entries)
+
+
+def unzip_namedshape(namedshape):
+    # Splits (name, size) pairs into a names group and a sizes group via zip(*pairs).
+    if isinstance(namedshape, OrderedDict):
+        namedshape = namedshape.items()
+    if not hasattr(namedshape, "__iter__"):
+        raise RuntimeError(f"Expected namedshape to be OrderedDict or iterable of tuples, got: {type(namedshape)}")
+    pairs = list(namedshape)  # materialize: one-shot iterables (generators) have no len()
+    if not pairs:
+        raise RuntimeError("Expected namedshape to be non-empty.")
+    return zip(*pairs)
+
+
+def namer_api_name(inplace):
+    # Name of the rename API variant matching the in-place flag; the callers in
+    # this module use it as the prefix of their error messages.
+    # Truthiness of `inplace` decides, exactly like an if/else statement would.
+    return "rename_" if inplace else "rename"
+
+
+def is_ellipsis(item):  # matches the Ellipsis object as well as the string "..."
+    return item == ... or item == "..."
+
+
+def single_ellipsis_index(names, fn_name):
+    # Index of the only ellipsis entry in `names`, or None when there is none.
+    # Two or more ellipsis entries raise a RuntimeError prefixed with `fn_name`.
+    hits = [pos for pos, entry in enumerate(names) if is_ellipsis(entry)]
+    if len(hits) <= 1:
+        return hits[0] if hits else None
+    raise RuntimeError(
+        f"{fn_name}: More than one Ellipsis ('...') found in names ("
+        f"{names}). This function supports up to one Ellipsis."
+    )
+
+
+def expand_single_ellipsis(numel_pre_glob, numel_post_glob, names):  # middle run of `names`
+    return names[slice(numel_pre_glob, len(names) - numel_post_glob)]
+
+
+def replace_ellipsis_by_position(ellipsis_idx, names, tensor_names):
+    # The ellipsis stands for the tensor names not claimed by the explicit names around it.
+    glob_stop = len(tensor_names) - (len(names) - ellipsis_idx - 1)
+    head, tail = names[:ellipsis_idx], names[ellipsis_idx + 1 :]
+    return head + tensor_names[ellipsis_idx:glob_stop] + tail
+
+
+def resolve_ellipsis(names, tensor_names, fn_name):
+    """
+    Returns `names` with its single ... entry (if any) replaced by the matching
+    run of `tensor_names`; without an ellipsis, `names` is returned unchanged.
+    """
+    ellipsis_idx = single_ellipsis_index(names, fn_name)
+    has_glob = ellipsis_idx is not None
+    return replace_ellipsis_by_position(ellipsis_idx, names, tensor_names) if has_glob else names
+
+
+def update_names_with_list(tensor, names, inplace):
+    # Positional form of rename: a lone None is forwarded as None, anything else
+    # is ellipsis-expanded against the tensor's current names first.
+    clears_names = len(names) == 1 and names[0] is None  # tensor.rename(None)
+    if clears_names:
+        return tensor._update_names(None, inplace)
+    resolved = resolve_ellipsis(names, tensor.names, namer_api_name(inplace))
+    return tensor._update_names(resolved, inplace)
+
+
+def update_names_with_mapping(tensor, rename_map, inplace):
+    # Keyword form of rename: every key must be an existing dim; its name is replaced.
+    dim_map = build_dim_map(tensor)
+    for old_dim, new_dim in rename_map.items():
+        if old_dim not in dim_map:
+            raise RuntimeError(
+                f"{namer_api_name(inplace)}: Tried to rename dim '{old_dim}' to dim "
+                f"{new_dim} in Tensor[{tensor.names}] but dim '{old_dim}' does not exist"
+            )
+        dim_map[old_dim] = new_dim
+    # The values are passed on in the map's iteration order.
+    return tensor._update_names(tuple(dim_map.values()), inplace)
+
+
+def update_names(tensor, names, rename_map, inplace):
+    """Shared implementation of tensor.rename and tensor.rename_ (in-place).
+
+    Positional usage: tensor.rename(*names) returns a view on tensor whose dims
+    are named `names`. `names` must be of length `tensor.dim()`; otherwise, if
+    '...' is in `names`, it is expanded greedily to the corresponding names from
+    `tensor.names`.
+
+    For example,
+    ```
+    >>> # xdoctest: +SKIP
+    >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
+    >>> x.rename('...', 'height', 'width').names
+    ('N', 'C', 'height', 'width')
+
+    >>> # xdoctest: +SKIP
+    >>> x.rename('batch', '...', 'width').names
+    ('batch', 'C', 'H', 'width')
+
+    ```
+
+    Keyword usage: tensor.rename(**rename_map) returns a view on tensor whose
+        dims are renamed as specified in the mapping `rename_map`.
+
+    For example,
+    ```
+    >>> # xdoctest: +SKIP
+    >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
+    >>> x.rename(W='width', H='height').names
+    ('N', 'C', 'height', 'width')
+
+    ```
+
+    Mixing positional and keyword arguments raises a RuntimeError.
+    """
+    has_names = len(names) > 0
+    has_rename_pairs = bool(rename_map)
+    if has_rename_pairs and has_names:
+        raise RuntimeError(
+            f"{namer_api_name(inplace)}: This function takes either positional "
+            f"args or keyword args, but not both. Use tensor.{namer_api_name(inplace)}(*names) "
+            f"to name dims and tensor.{namer_api_name(inplace)}(**rename_map) to rename "
+            "dims."
+        )
+
+    # The mapping path runs only when keyword pairs were given (after the check
+    # above that also means no positional names). Every other call takes the
+    # list path, including tensor.rename(*[]), which is valid for a 0 dim
+    # tensor.
+    if has_rename_pairs:
+        return update_names_with_mapping(tensor, rename_map, inplace)
+    return update_names_with_list(tensor, names, inplace)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1e5fffb11e2a10f0ca957db2c5e21c78ffd3eb8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor.py
@@ -0,0 +1,1543 @@
+import copyreg
+import enum
+import functools
+import warnings
+from collections import OrderedDict
+from copy import deepcopy
+from numbers import Number
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch._C as _C
+import torch.utils.hooks as hooks
+from torch._namedtensor_internals import (
+    check_serializing_named_tensor,
+    is_ellipsis,
+    resolve_ellipsis,
+    single_ellipsis_index,
+    unzip_namedshape,
+    update_names,
+)
+from torch.overrides import (
+    get_default_nowrap_functions,
+    handle_torch_function,
+    has_torch_function,
+    has_torch_function_unary,
+    has_torch_function_variadic,
+)
+from torch.utils.dlpack import DLDeviceType
+
+
+def _handle_torch_function_and_wrap_type_error_to_not_implemented(f):
+    # Decorator: routes the call through handle_torch_function when has_torch_function
+    # reports an override, and turns a TypeError from either path into NotImplemented.
+    @functools.wraps(f, assigned=functools.WRAPPER_ASSIGNMENTS)
+    def wrapped(*args, **kwargs):
+        try:
+            # See https://github.com/pytorch/pytorch/issues/75462
+            if not has_torch_function(args):
+                return f(*args, **kwargs)
+            return handle_torch_function(wrapped, args, *args, **kwargs)
+        except TypeError:
+            return NotImplemented
+
+    return wrapped
+
+
+# Should not be used, this is kept only for BC of loading old serialized Tensor subclasses
+def _rebuild_from_type(func, type, args, dict):
+    # `type` and `dict` shadow the builtins here; the parameter names are part of the signature.
+    rebuilt = func(*args)
+    if type is not Tensor:
+        rebuilt = rebuilt.as_subclass(type)
+        rebuilt.__dict__ = dict
+    return rebuilt
+
+
+def _rebuild_from_type_v2(func, new_type, args, state):
+    # Rebuilds an object via func(*args), casts it to new_type when the types
+    # differ, then restores `state` on it.
+    ret = func(*args)
+    if type(ret) is not new_type:
+        ret = ret.as_subclass(new_type)
+    # Tensor does define __setstate__ even though it doesn't define
+    # __getstate__. So only use __setstate__ if it is NOT the one defined
+    # on Tensor
+    default_setstate = Tensor.__setstate__
+    setstate = getattr(ret.__class__, "__setstate__", default_setstate)
+    if setstate is default_setstate:
+        return torch._utils._set_obj_state(ret, state)
+    ret.__setstate__(state)
+    return ret
+
+
+# NB: If you subclass Tensor, and want to share the subclassed class
+# across processes, you must also update torch/multiprocessing/reductions.py
+# to define a ForkingPickler serialization mode for the class.
+#
+# NB: If you add a new method to Tensor, you must update
+# torch/_C/__init__.pyi.in to add a type annotation for your method;
+# otherwise, it will not show up in autocomplete.
+class Tensor(torch._C.TensorBase):
+    def __deepcopy__(self, memo):  # returns a deep copy of self; `memo` maps id(obj) -> copy
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__deepcopy__, (self,), self, memo)
+        if not self.is_leaf:
+            raise RuntimeError(
+                "Only Tensors created explicitly by the user "
+                "(graph leaves) support the deepcopy protocol at the moment.  "
+                "If you were attempting to deepcopy a module, this may be because "
+                "of a torch.nn.utils.weight_norm usage, "
+                "see https://github.com/pytorch/pytorch/pull/103001"
+            )
+        if id(self) in memo:  # reuse the copy already recorded for this tensor
+            return memo[id(self)]
+        with torch.no_grad():
+            # TODO: skipping storage copy is wrong for meta, as meta
+            # does accurate alias tracking; however, the code below
+            # doesn't work because of
+            # https://github.com/pytorch/pytorch/issues/47442
+            # Update the test in test_serialization if you remove 'meta' from here
+            if (
+                self.is_sparse
+                or self.device.type
+                in ["lazy", "xla", "mtia", "mps", "ort", "meta", "ipu"]
+                or (
+                    not torch._C._has_storage(self)
+                    and self.device.type == torch._C._get_privateuse1_backend_name()
+                )
+                or (type(self) is not Tensor and self.data_ptr() == 0)
+            ):
+                new_tensor = self.clone()  # clone path: no storage-level copy is made here
+                if type(new_tensor) is not type(self):
+                    raise RuntimeError(
+                        "The default implementation of __deepcopy__() for wrapper subclasses "
+                        "only works for subclass types that implement clone() and for which "
+                        "cloning returns another instance of the same subclass. You should either "
+                        "properly implement clone() for your subclass or override __deepcopy__() "
+                        "if it is intended behavior for clone() to return an instance of a "
+                        "different type."
+                    )
+            else:
+                new_storage = self._typed_storage()._deepcopy(memo)  # storage copy, memo passed along
+                if self.is_quantized:
+                    # quantizer_params can be different type based on torch attribute
+                    quantizer_params: Union[
+                        Tuple[torch.qscheme, float, int],
+                        Tuple[torch.qscheme, Tensor, Tensor, int],
+                    ]
+                    if self.qscheme() == torch.per_tensor_affine:
+                        quantizer_params = (
+                            self.qscheme(),
+                            self.q_scale(),
+                            self.q_zero_point(),
+                        )
+                    elif self.qscheme() in (
+                        torch.per_channel_affine,
+                        torch.per_channel_affine_float_qparams,
+                    ):
+                        quantizer_params = (
+                            self.qscheme(),
+                            self.q_per_channel_scales(),
+                            self.q_per_channel_zero_points(),
+                            self.q_per_channel_axis(),
+                        )
+                    else:
+                        raise RuntimeError(
+                            f"Unsupported qscheme {self.qscheme()} in deepcopy"
+                        )
+                    # TODO: Once we decide to break serialization FC, no longer
+                    # need to wrap with TypedStorage
+                    new_tensor = torch._utils._rebuild_qtensor(
+                        torch.storage.TypedStorage(
+                            wrap_storage=new_storage._untyped_storage,
+                            dtype=self.dtype,
+                            _internal=True,
+                        ),
+                        self.storage_offset(),
+                        self.size(),
+                        self.stride(),
+                        quantizer_params,
+                        self.requires_grad,
+                        self._backward_hooks,
+                    )
+                    if type(new_tensor) is not type(self):
+                        raise RuntimeError(
+                            "The default implementation of __deepcopy__() for quantized tensors "
+                            "expects the tensor returned by torch._utils._rebuild_qtensor() to "
+                            "match the type of the instance being copied. If you encounter this, "
+                            "please open an issue on PyTorch's GitHub."
+                        )
+                else:
+                    new_tensor = self.new_empty([])  # placeholder; set_ below points it at new_storage
+                    if type(new_tensor) is not type(self):
+                        raise RuntimeError(
+                            "The default implementation of __deepcopy__() for non-wrapper subclasses "
+                            "only works for subclass types that implement new_empty() and for which "
+                            "that function returns another instance of the same subclass. You should "
+                            "either properly implement new_empty() for your subclass or override "
+                            "__deepcopy__() if it is intended behavior for new_empty() to return "
+                            "an instance of a different type."
+                        )
+                    new_tensor.set_(
+                        new_storage, self.storage_offset(), self.size(), self.stride()
+                    )
+                    if self.is_conj():
+                        new_tensor = new_tensor.conj_physical()
+                    if self.is_neg():
+                        new_tensor = new_tensor.neg()
+            if self.requires_grad:
+                new_tensor.requires_grad_()
+            if self.grad is not None:
+                new_tensor.grad = self.grad.__deepcopy__(memo)  # the gradient is deep-copied too
+
+            if type(self) is not Tensor:
+                if type(new_tensor) is not type(self):
+                    raise RuntimeError(
+                        "Type of deepcopy result does not match the type of the source tensor. "
+                        "If you encounter this, please open an issue on PyTorch's GitHub."
+                    )
+
+                # Plain Tensors don't have slots
+                slots_to_save = copyreg._slotnames(self.__class__)  # type: ignore[attr-defined]
+                for slot in slots_to_save:
+                    if hasattr(self, slot):
+                        setattr(new_tensor, slot, deepcopy(getattr(self, slot), memo))
+
+            new_tensor.__dict__ = deepcopy(self.__dict__, memo)
+
+            memo[id(self)] = new_tensor  # record the result under the source tensor's id
+            return new_tensor
+
+    def __reduce_ex__(self, proto):
+        # Pickle hook; a plain Tensor without Python state takes the internal fast path.
+        state = torch._utils._get_obj_state(self)
+        if type(self) is not Tensor or state:
+            if has_torch_function_unary(self):
+                return handle_torch_function(Tensor.__reduce_ex__, (self,), self, proto)
+            func, args = self._reduce_ex_internal(proto)
+            return (_rebuild_from_type_v2, (func, type(self), args, state))
+        return self._reduce_ex_internal(proto)
+
+    def storage(self):
+        r"""
+        storage() -> torch.TypedStorage
+
+        Returns this tensor's underlying storage as a :class:`TypedStorage`.
+
+        .. warning::
+
+            :class:`TypedStorage` is deprecated and will be removed in the future,
+            leaving :class:`UntypedStorage` as the only storage class. To access the
+            :class:`UntypedStorage` directly, use :attr:`Tensor.untyped_storage()`.
+        """
+        if not has_torch_function_unary(self):
+            torch.storage._warn_typed_storage_removal(stacklevel=2)
+            return self._typed_storage()
+        # An override was detected on self: hand the call to handle_torch_function.
+        return handle_torch_function(Tensor.storage, (self,), self)
+
+    # For internal use only, to avoid raising deprecation warning
+    def _typed_storage(self):
+        # Same wrapper that storage() returns, minus the warning call made there.
+        return torch.TypedStorage(
+            wrap_storage=self.untyped_storage(), dtype=self.dtype, _internal=True
+        )
+
+    def _reduce_ex_internal(self, proto):
+        check_serializing_named_tensor(self)
+        # See Note [Don't serialize hooks]
+        torch.utils.hooks.warn_if_has_hooks(self)
+        backward_hooks: Dict[Any, Any] = OrderedDict()
+        # Note: Numpy array is chosen to be the rebuild component for XLA, MTIA, ORT Tensors.
+        # We considered a few options:
+        # 1. CPU tensor can't be used here.
+        #    Otherwise in torch.load CPU storage is reconstructed with randomly
+        #    initialized data, moved onto backend device, and then storage is updated
+        #    to the serialized content. This works perfectly for CPU/CUDA but not these backends;
+        #    their tensors are disconnected with storage so they don't get the update.
+        # 2. Python list is not a good fit due to performance reason.
+        #    `tolist()` converts every single element in the tensor into python objects
+        #    and serialize them one by one.
+        if self.device.type in ["xla", "mtia", "ort"] or (
+            not torch._C._has_storage(self)
+            and self.device.type == torch._C._get_privateuse1_backend_name()
+        ):
+            # Convert BFloat16 tensors to Float32 before conversion to numpy, as numpy doesn't
+            # support BFloat16. The rebuild tensor from numpy takes in the original self.dtype,
+            # this would reconstruct the BFloat16 tensor from numpy.
+            numpy_tensor = (
+                self.cpu().numpy()
+                if self.dtype != torch.bfloat16
+                else self.cpu().to(torch.float32).numpy()
+            )
+            return (
+                torch._utils._rebuild_device_tensor_from_numpy,
+                (numpy_tensor, self.dtype, str(self.device), self.requires_grad),
+            )
+        if self.device.type == "meta":
+            # NB: This implementation BREAKS storage sharing.  Current
+            # hypothesis is that no one cares for meta tensors.
+            arg_meta = (
+                self.dtype,
+                tuple(self.size()),
+                self.stride(),
+                self.requires_grad,
+            )
+            return (torch._utils._rebuild_meta_tensor_no_storage, arg_meta)
+        if self.is_quantized:
+            # quantizer_params can be different type based on torch attribute
+            quantizer_params: Union[
+                Tuple[torch.qscheme, float, int], Tuple[Any, Tensor, Tensor, int]
+            ]
+            if self.qscheme() == torch.per_tensor_affine:
+                quantizer_params = (
+                    torch.per_tensor_affine,
+                    self.q_scale(),
+                    self.q_zero_point(),
+                )
+            elif self.qscheme() in (
+                torch.per_channel_affine,
+                torch.per_channel_affine_float_qparams,
+            ):
+                # convert scales and zero points to tuple to avoid recursive calls
+                # when/if we get multi-axis quantized tensors in the future, the shape
+                # is recoverable from the main tensor shape
+                quantizer_params = (
+                    torch.per_channel_affine,
+                    self.q_per_channel_scales(),
+                    self.q_per_channel_zero_points(),
+                    self.q_per_channel_axis(),
+                )
+            else:
+                raise RuntimeError(
+                    f"Serialization is not supported for tensors of type {self.qscheme()}"
+                )
+            # TODO: Once we decide to break serialization FC, no longer
+            # need to wrap with TypedStorage
+            args_qtensor = (
+                torch.storage.TypedStorage(
+                    wrap_storage=self._typed_storage()._untyped_storage,
+                    dtype=self.dtype,
+                    _internal=True,
+                ),
+                self.storage_offset(),
+                tuple(self.size()),
+                self.stride(),
+                quantizer_params,
+                self.requires_grad,
+                backward_hooks,
+            )
+            return (torch._utils._rebuild_qtensor, args_qtensor)
+        elif self.is_sparse:
+            if self.layout == torch.sparse_coo:
+                args_sparse = (
+                    self.layout,
+                    (self._indices(), self._values(), self.size(), self.is_coalesced()),
+                )
+            else:
+                raise NotImplementedError(
+                    f"sparse tensor __reduce_ex__ for layout `{self.layout}`"
+                )
+            return (torch._utils._rebuild_sparse_tensor, args_sparse)
+        elif self.layout in {
+            torch.sparse_csr,
+            torch.sparse_csc,
+            torch.sparse_bsr,
+            torch.sparse_bsc,
+        }:
+            if self.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                compressed_indices, plain_indices = (
+                    self.crow_indices(),
+                    self.col_indices(),
+                )
+            else:
+                compressed_indices, plain_indices = (
+                    self.ccol_indices(),
+                    self.row_indices(),
+                )
+            args_sparse_compressed = (
+                self.layout,
+                (
+                    compressed_indices,
+                    plain_indices,
+                    self.values(),
+                    self.size(),
+                ),
+            )
+            return (torch._utils._rebuild_sparse_tensor, args_sparse_compressed)
+        elif self.is_nested:
+            args_nested = (
+                # NB: values() currently returns the storage as a buffer in an unsafe way.
+                # Ideally, we'd use a private API for this instead. TODO: Switch to this if
+                # we ever get around to adding it.
+                self.values(),
+                self._nested_tensor_size(),
+                self._nested_tensor_strides(),
+                self._nested_tensor_storage_offsets(),
+            )
+            return (torch._utils._rebuild_nested_tensor, args_nested)
+        elif (
+            self.data_ptr() == 0
+            and type(self) is not torch.Tensor
+            and type(self).__torch_dispatch__ is not torch.Tensor.__torch_dispatch__
+        ):
+            arg_wrapper_subclass = (
+                type(self),
+                self.dtype,
+                tuple(self.size()),
+                self.stride(),
+                self.storage_offset(),
+                self.layout,
+                self.device,
+                self.requires_grad,
+            )
+            return (torch._utils._rebuild_wrapper_subclass, arg_wrapper_subclass)
+        else:
+            v3_dtypes = [
+                torch.float8_e5m2,
+                torch.float8_e4m3fn,
+                torch.float8_e5m2fnuz,
+                torch.float8_e4m3fnuz,
+                torch.bits8,
+                torch.bits16,
+                torch.bits1x8,
+                torch.bits2x4,
+                torch.bits4x2,
+                torch.complex32,
+            ]
+            if self.dtype in v3_dtypes:
+                rebuild_func = torch._utils._rebuild_tensor_v3
+                storage = self.untyped_storage()
+            else:
+                # TODO: Once we decide to break serialization FC, no longer
+                # need to wrap with TypedStorage
+                rebuild_func = torch._utils._rebuild_tensor_v2  # type: ignore[assignment]
+                storage = torch.storage.TypedStorage(
+                    wrap_storage=self._typed_storage()._untyped_storage,
+                    dtype=self.dtype,
+                    _internal=True,
+                )  # type: ignore[assignment]
+            args = (
+                storage,
+                self.storage_offset(),
+                tuple(self.size()),
+                self.stride(),
+                self.requires_grad,
+                backward_hooks,
+            )  # previously was self._backward_hooks
+
+            if isinstance(storage, torch.storage.UntypedStorage):
+                args = args + (self.dtype,)  # type: ignore[assignment]
+
+            metadata = torch._utils.get_tensor_metadata(self)
+            if metadata:
+                args = args + (metadata,)  # type: ignore[assignment]
+
+            return (rebuild_func, args)
+
+    def __setstate__(self, state):
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__setstate__, (self,), self, state)
+        # Warning: this method is NOT called when you torch.load() a tensor;
+        # that is managed by _rebuild_tensor_v2
+        if not self.is_leaf:
+            raise RuntimeError("__setstate__ can be only called on leaf Tensors")
+        if len(state) == 4:
+            # legacy serialization of Tensor
+            self.set_(*state)
+            return
+        elif len(state) == 5:
+            # legacy serialization of Variable
+            self.data = state[0]
+            state = (state[3], state[4], state[2])
+        # The setting of _backward_hooks is expected to be a no-op.
+        # See Note [Don't serialize hooks]
+        self.requires_grad, _, self._backward_hooks = state
+
+    def __repr__(self, *, tensor_contents=None):
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.__repr__, (self,), self, tensor_contents=tensor_contents
+            )
+        # All strings are unicode in Python 3.
+        return torch._tensor_str._str(self, tensor_contents=tensor_contents)
+
+    def backward(
+        self, gradient=None, retain_graph=None, create_graph=False, inputs=None
+    ):
+        r"""Computes the gradient of current tensor wrt graph leaves.
+
+        The graph is differentiated using the chain rule. If the tensor is
+        non-scalar (i.e. its data has more than one element) and requires
+        gradient, the function additionally requires specifying ``gradient``.
+        It should be a tensor of matching type and location, that contains
+        the gradient of the differentiated function w.r.t. ``self``.
+
+        This function accumulates gradients in the leaves - you might need to zero
+        ``.grad`` attributes or set them to ``None`` before calling it.
+        See :ref:`Default gradient layouts<default-grad-layouts>`
+        for details on the memory layout of accumulated gradients.
+
+        .. note::
+
+            If you run any forward ops, create ``gradient``, and/or call ``backward``
+            in a user-specified CUDA stream context, see
+            :ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.
+
+        .. note::
+
+            When ``inputs`` are provided and a given input is not a leaf,
+            the current implementation will call its grad_fn (though it is not strictly needed to get these gradients).
+            It is an implementation detail on which the user should not rely.
+            See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
+
+        Args:
+            gradient (Tensor or None): Gradient w.r.t. the
+                tensor. If it is a tensor, it will be automatically converted
+                to a Tensor that does not require grad unless ``create_graph`` is True.
+                None values can be specified for scalar Tensors or ones that
+                don't require grad. If a None value would be acceptable then
+                this argument is optional.
+            retain_graph (bool, optional): If ``False``, the graph used to compute
+                the grads will be freed. Note that in nearly all cases setting
+                this option to True is not needed and often can be worked around
+                in a much more efficient way. Defaults to the value of
+                ``create_graph``.
+            create_graph (bool, optional): If ``True``, graph of the derivative will
+                be constructed, allowing to compute higher order derivative
+                products. Defaults to ``False``.
+            inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be
+                accumulated into ``.grad``. All other Tensors will be ignored. If not
+                provided, the gradient is accumulated into all the leaf Tensors that were
+                used to compute this tensor.
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.backward,
+                (self,),
+                self,
+                gradient=gradient,
+                retain_graph=retain_graph,
+                create_graph=create_graph,
+                inputs=inputs,
+            )
+        # The actual work is done by torch.autograd.backward, with this tensor
+        # passed as the first argument; nothing is returned.
+        torch.autograd.backward(
+            self, gradient, retain_graph, create_graph, inputs=inputs
+        )
+
+    def register_hook(self, hook):
+        r"""Registers a backward hook.
+
+        The hook will be called every time a gradient with respect to the
+        Tensor is computed. The hook should have the following signature::
+
+            hook(grad) -> Tensor or None
+
+
+        The hook should not modify its argument, but it can optionally return
+        a new gradient which will be used in place of :attr:`grad`.
+
+        This function returns a handle with a method ``handle.remove()``
+        that removes the hook.
+
+        .. note::
+            See :ref:`backward-hooks-execution` for more information on when this hook
+            is executed, and how its execution is ordered relative to other hooks.
+
+        Raises:
+            RuntimeError: if this tensor does not require gradient.
+
+        Example::
+
+            >>> v = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> h = v.register_hook(lambda grad: grad * 2)  # double the gradient
+            >>> v.backward(torch.tensor([1., 2., 3.]))
+            >>> v.grad
+            tensor([2., 4., 6.])
+
+            >>> h.remove()  # removes the hook
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.register_hook, (self,), self, hook)
+        if not self.requires_grad:
+            raise RuntimeError(
+                "cannot register a hook on a tensor that doesn't require gradient"
+            )
+        # The hook dict is created lazily on first registration; for a tensor
+        # that has a grad_fn, the grad_fn is also told about the new dict.
+        if self._backward_hooks is None:
+            self._backward_hooks = OrderedDict()
+            if self.grad_fn is not None:
+                self.grad_fn._register_hook_dict(self)
+        # The handle's id is the key under which the hook is stored.
+        handle = hooks.RemovableHandle(self._backward_hooks)
+        self._backward_hooks[handle.id] = hook
+        return handle
+
+    def register_post_accumulate_grad_hook(self, hook):
+        r"""Registers a backward hook that runs after grad accumulation.
+
+        The hook will be called after all gradients for a tensor have been accumulated,
+        meaning that the .grad field has been updated on that tensor. The post
+        accumulate grad hook is ONLY applicable for leaf tensors (tensors without a
+        .grad_fn field). Registering this hook on a non-leaf tensor will error!
+
+        The hook should have the following signature::
+
+            hook(param: Tensor) -> None
+
+        Note that, unlike other autograd hooks, this hook operates on the tensor
+        that requires grad and not the grad itself. The hook can in-place modify
+        and access its Tensor argument, including its .grad field.
+
+        This function returns a handle with a method ``handle.remove()``
+        that removes the hook.
+
+        .. note::
+            See :ref:`backward-hooks-execution` for more information on when this hook
+            is executed, and how its execution is ordered relative to other hooks. Since
+            this hook runs during the backward pass, it will run in no_grad mode (unless
+            create_graph is True). You can use torch.enable_grad() to re-enable autograd
+            within the hook if you need it.
+
+        Raises:
+            RuntimeError: if this tensor does not require gradient, or if it
+                has a ``grad_fn`` (i.e. it is not a leaf).
+
+        Example::
+
+            >>> v = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> lr = 0.01
+            >>> # simulate a simple SGD update
+            >>> h = v.register_post_accumulate_grad_hook(lambda p: p.add_(p.grad, alpha=-lr))
+            >>> v.backward(torch.tensor([1., 2., 3.]))
+            >>> v
+            tensor([-0.0100, -0.0200, -0.0300], requires_grad=True)
+
+            >>> h.remove()  # removes the hook
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.register_post_accumulate_grad_hook, (self,), self, hook
+            )
+        if not self.requires_grad:
+            raise RuntimeError(
+                "cannot register a hook on a tensor that doesn't require gradient"
+            )
+        if self.grad_fn is not None:
+            raise RuntimeError(
+                "post accumulate grad hooks cannot be registered on non-leaf tensors"
+            )
+        # The hook dict is created lazily on first registration.
+        if self._post_accumulate_grad_hooks is None:
+            self._post_accumulate_grad_hooks: Dict[Any, Any] = OrderedDict()
+        # The handle's id is the key under which the hook is stored.
+        handle = hooks.RemovableHandle(self._post_accumulate_grad_hooks)
+        self._post_accumulate_grad_hooks[handle.id] = hook
+        return handle
+
+    def reinforce(self, reward):
+        def trim(str):
+            return "\n".join([line.strip() for line in str.split("\n")])
+
+        raise RuntimeError(
+            trim(
+                r"""reinforce() was removed.
+            Use torch.distributions instead.
+            See https://pytorch.org/docs/master/distributions.html
+
+            Instead of:
+
+            probs = policy_network(state)
+            action = probs.multinomial()
+            next_state, reward = env.step(action)
+            action.reinforce(reward)
+            action.backward()
+
+            Use:
+
+            probs = policy_network(state)
+            # NOTE: categorical is equivalent to what used to be called multinomial
+            m = torch.distributions.Categorical(probs)
+            action = m.sample()
+            next_state, reward = env.step(action)
+            loss = -m.log_prob(action) * reward
+            loss.backward()
+        """
+            )
+        )
+
+    # Bind ``_C.TensorBase.detach`` as a class attribute; ``_C._add_docstr``
+    # is called with the function and the documentation text that follows.
+    detach = _C._add_docstr(
+        _C.TensorBase.detach,
+        r"""
+    Returns a new Tensor, detached from the current graph.
+
+    The result will never require gradient.
+
+    This method also affects forward mode AD gradients and the result will never
+    have forward mode AD gradients.
+
+    .. note::
+
+      Returned Tensor shares the same storage with the original one.
+      In-place modifications on either of them will be seen, and may trigger
+      errors in correctness checks.
+    """,
+    )
+
+    # Bind ``_C.TensorBase.detach_`` (the in-place variant) as a class
+    # attribute; ``_C._add_docstr`` is called with the function and the
+    # documentation text that follows.
+    detach_ = _C._add_docstr(
+        _C.TensorBase.detach_,
+        r"""
+    Detaches the Tensor from the graph that created it, making it a leaf.
+    Views cannot be detached in-place.
+
+    This method also affects forward mode AD gradients and the result will never
+    have forward mode AD gradients.
+    """,
+    )
+
+    def is_shared(self):
+        r"""Checks if tensor is in shared memory.
+
+        This is always ``True`` for CUDA tensors.
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.is_shared, (self,), self)
+        return self._typed_storage()._is_shared()
+
+    def share_memory_(self):
+        r"""Moves the underlying storage to shared memory.
+
+        This is a no-op if the underlying storage is already in shared memory
+        and for CUDA tensors. Tensors in shared memory cannot be resized.
+
+        See :meth:`torch.UntypedStorage.share_memory_` for more details.
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.share_memory_, (self,), self)
+        self._typed_storage()._share_memory_()
+        return self
+
+    def module_load(self, other, assign=False):
+        r"""Defines how to transform ``other`` when loading it into ``self`` in :meth:`~nn.Module.load_state_dict`.
+
+        Used when :func:`~torch.__future__.get_swap_module_params_on_conversion` is ``True``.
+
+        It is expected that ``self`` is a parameter or buffer in an ``nn.Module`` and ``other`` is the
+        value in the state dictionary with the corresponding key, this method defines
+        how ``other`` is remapped before being swapped with ``self`` via
+        :func:`~torch.utils.swap_tensors`` in ``module.load_state_dict()``.
+
+        .. note::
+            This method should always return a new object that is not ``self`` or ``other``.
+            For example, the default implementation returns ``self.copy_(other).detach()``
+            if ``assign`` is ``False`` or ``other.detach()`` if ``assign`` is ``True``.
+
+        Args:
+            other (Tensor): value in state dict with key corresponding to ``self``
+            assign (bool): the assign argument passed to :meth:`nn.Module.load_state_dict`
+
+        """
+        if has_torch_function_variadic(self, other):
+            return handle_torch_function(
+                Tensor.module_load, (self, other), self, other, assign=assign
+            )
+
+        if assign:
+            return other.detach()
+        else:
+            return self.copy_(other).detach()
+
+    def __reversed__(self):
+        r"""Reverses the tensor along dimension 0."""
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__reversed__, (self,), self)
+        if self.dim() == 0:
+            return self
+        else:
+            return self.flip(0)
+
+    def norm(
+        self,
+        p: Optional[Union[float, str]] = "fro",
+        dim=None,
+        keepdim=False,
+        dtype=None,
+    ):
+        r"""See :func:`torch.norm`"""
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.norm, (self,), self, p=p, dim=dim, keepdim=keepdim, dtype=dtype
+            )
+        return torch.norm(self, p, dim, keepdim, dtype=dtype)
+
+    def solve(self, other):
+        """Delegate to ``solve`` from the sibling ``_linalg_utils`` module.
+
+        The helper is imported at call time and invoked as
+        ``solve(self, other)``; its result is returned unchanged.
+        """
+        from ._linalg_utils import solve
+
+        return solve(self, other)
+
+    def lstsq(self, other):
+        """Delegate to ``lstsq`` from the sibling ``_linalg_utils`` module.
+
+        The helper is imported at call time and invoked as
+        ``lstsq(self, other)``; its result is returned unchanged.
+        """
+        from ._linalg_utils import lstsq
+
+        return lstsq(self, other)
+
+    def eig(self, eigenvectors=False):
+        """Delegate to ``eig`` from the sibling ``_linalg_utils`` module.
+
+        The helper is imported at call time; ``eigenvectors`` is forwarded by
+        keyword and the helper's result is returned unchanged.
+        """
+        from ._linalg_utils import eig
+
+        return eig(self, eigenvectors=eigenvectors)
+
+    def symeig(self, eigenvectors=False):
+        """Delegate to ``_symeig`` from the sibling ``_linalg_utils`` module.
+
+        The helper is imported at call time; ``eigenvectors`` is forwarded by
+        keyword and the helper's result is returned unchanged.
+        """
+        from ._linalg_utils import _symeig
+
+        return _symeig(self, eigenvectors=eigenvectors)
+
+    def lu(self, pivot=True, get_infos=False):
+        r"""See :func:`torch.lu`"""
+        # If get_infos is True, then we don't need to check for errors and vice versa
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.lu, (self,), self, pivot=pivot, get_infos=get_infos
+            )
+
+        LU, pivots, infos = torch._lu_with_info(
+            self, pivot=pivot, check_errors=(not get_infos)
+        )
+        if get_infos:
+            return LU, pivots, infos
+        else:
+            return LU, pivots
+
+    def stft(
+        self,
+        n_fft: int,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        window: "Optional[Tensor]" = None,
+        center: bool = True,
+        pad_mode: str = "reflect",
+        normalized: bool = False,
+        onesided: Optional[bool] = None,
+        return_complex: Optional[bool] = None,
+    ):
+        r"""See :func:`torch.stft`
+
+        .. warning::
+          This function changed signature at version 0.4.1. Calling with
+          the previous signature may cause error or return incorrect result.
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.stft,
+                (self,),
+                self,
+                n_fft,
+                hop_length=hop_length,
+                win_length=win_length,
+                window=window,
+                center=center,
+                pad_mode=pad_mode,
+                normalized=normalized,
+                onesided=onesided,
+                return_complex=return_complex,
+            )
+        return torch.stft(
+            self,
+            n_fft,
+            hop_length,
+            win_length,
+            window,
+            center,
+            pad_mode,
+            normalized,
+            onesided,
+            return_complex=return_complex,
+        )
+
+    def istft(
+        self,
+        n_fft: int,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        window: "Optional[Tensor]" = None,
+        center: bool = True,
+        normalized: bool = False,
+        onesided: Optional[bool] = None,
+        length: Optional[int] = None,
+        return_complex: bool = False,
+    ):
+        r"""See :func:`torch.istft`"""
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.istft,
+                (self,),
+                self,
+                n_fft,
+                hop_length=hop_length,
+                win_length=win_length,
+                window=window,
+                center=center,
+                normalized=normalized,
+                onesided=onesided,
+                length=length,
+                return_complex=return_complex,
+            )
+        return torch.istft(
+            self,
+            n_fft,
+            hop_length,
+            win_length,
+            window,
+            center,
+            normalized,
+            onesided,
+            length,
+            return_complex=return_complex,
+        )
+
+    def resize(self, *sizes):
+        """Deprecated non-in-place resize.
+
+        Emits a "non-inplace resize is deprecated" warning and returns
+        ``Resize.apply(self, sizes)``, where ``sizes`` is the tuple of
+        positional arguments.
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.resize, (self,), self, *sizes)
+        warnings.warn("non-inplace resize is deprecated")
+        # Imported at call time rather than at module import.
+        from torch.autograd._functions import Resize
+
+        return Resize.apply(self, sizes)
+
+    def resize_as(self, tensor):
+        """Deprecated non-in-place resize to the size of ``tensor``.
+
+        Emits a "non-inplace resize_as is deprecated" warning and returns
+        ``Resize.apply(self, tensor.size())``.
+        """
+        if has_torch_function_variadic(self, tensor):
+            return handle_torch_function(Tensor.resize_as, (self, tensor), self, tensor)
+        warnings.warn("non-inplace resize_as is deprecated")
+        # Imported at call time rather than at module import.
+        from torch.autograd._functions import Resize
+
+        return Resize.apply(self, tensor.size())
+
+    def split(self, split_size, dim=0):
+        r"""See :func:`torch.split`"""
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.split, (self,), self, split_size, dim=dim
+            )
+        if isinstance(split_size, Tensor):
+            try:
+                split_size = int(split_size)
+            except ValueError:
+                pass
+
+        if isinstance(split_size, (int, torch.SymInt)):
+            return torch._VF.split(self, split_size, dim)  # type: ignore[attr-defined]
+        else:
+            return torch._VF.split_with_sizes(self, split_size, dim)
+
+    def unique(self, sorted=True, return_inverse=False, return_counts=False, dim=None):
+        r"""Returns the unique elements of the input tensor.
+
+        See :func:`torch.unique`
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.unique,
+                (self,),
+                self,
+                sorted=sorted,
+                return_inverse=return_inverse,
+                return_counts=return_counts,
+                dim=dim,
+            )
+        return torch.unique(
+            self,
+            sorted=sorted,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+            dim=dim,
+        )
+
+    def unique_consecutive(self, return_inverse=False, return_counts=False, dim=None):
+        r"""Eliminates all but the first element from every consecutive group of equivalent elements.
+
+        See :func:`torch.unique_consecutive`
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.unique_consecutive,
+                (self,),
+                self,
+                return_inverse=return_inverse,
+                return_counts=return_counts,
+                dim=dim,
+            )
+        return torch.unique_consecutive(
+            self, return_inverse=return_inverse, return_counts=return_counts, dim=dim
+        )
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rsub__(self, other):
+        """Reflected ``-``: returns ``_C._VariableFunctions.rsub(self, other)``."""
+        return _C._VariableFunctions.rsub(self, other)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rdiv__(self, other):
+        """Reflected division, computed as ``self.reciprocal() * other``."""
+        return self.reciprocal() * other
+
+    # Reflected true division reuses ``__rdiv__``; in-place true division is
+    # bound directly to ``_C.TensorBase.__idiv__``.
+    __rtruediv__ = __rdiv__
+    __itruediv__ = _C.TensorBase.__idiv__
+
+    # ``**`` and ``**=`` are ``_C.TensorBase.pow`` / ``pow_`` passed through
+    # the same wrapper that decorates the other binary operators here.
+    __pow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented(
+        _C.TensorBase.pow
+    )
+    __ipow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented(
+        _C.TensorBase.pow_
+    )
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rmod__(self, other):
+        """Reflected ``%``: returns ``torch.remainder(other, self)``."""
+        return torch.remainder(other, self)
+
+    def __format__(self, format_spec):
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__format__, (self,), self, format_spec)
+        if self.dim() == 0 and not self.is_meta and type(self) is Tensor:
+            return self.item().__format__(format_spec)
+        return object.__format__(self, format_spec)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rpow__(self, other):
+        """Reflected ``**``: returns ``torch.pow(other, self)``."""
+        return torch.pow(other, self)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __floordiv__(self, other):
+        """``//`` operator: returns ``torch.floor_divide(self, other)``."""
+        return torch.floor_divide(self, other)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rfloordiv__(self, other):
+        """Reflected ``//``: returns ``torch.floor_divide(other, self)``."""
+        return torch.floor_divide(other, self)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rlshift__(self, other):
+        """Reflected ``<<``: returns ``torch.bitwise_left_shift(other, self)``."""
+        return torch.bitwise_left_shift(other, self)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rrshift__(self, other):
+        """Reflected ``>>``: returns ``torch.bitwise_right_shift(other, self)``."""
+        return torch.bitwise_right_shift(other, self)
+
+    @_handle_torch_function_and_wrap_type_error_to_not_implemented
+    def __rmatmul__(self, other):
+        """Reflected ``@``: returns ``torch.matmul(other, self)``."""
+        return torch.matmul(other, self)
+
+    # Unary ``+``, unary ``-`` and ``abs()`` are bound directly to the
+    # corresponding ``_C.TensorBase`` methods.
+    __pos__ = _C.TensorBase.positive
+    __neg__ = _C.TensorBase.neg
+    __abs__ = _C.TensorBase.abs
+
+    def __len__(self):
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__len__, (self,), self)
+        if self.dim() == 0:
+            raise TypeError("len() of a 0-d tensor")
+        if torch._C._get_tracing_state():
+            warnings.warn(
+                "Using len to get tensor shape might cause the trace to be incorrect. "
+                "Recommended usage would be tensor.shape[0]. "
+                "Passing a tensor of different shape might lead to errors or silently give "
+                "incorrect results.",
+                category=torch.jit.TracerWarning,
+                stacklevel=2,
+            )
+        return self.shape[0]
+
+    def __iter__(self):
+        """Iterate over the slices of this tensor along dimension 0.
+
+        Returns an iterator over ``self.unbind(0)``.
+
+        Raises:
+            TypeError: for a 0-d tensor.
+        """
+        # NB: an earlier comment here described using 'imap' instead of 'map'
+        # (Python 2) to get a lazy generator; the code below no longer does
+        # that -- it calls self.unbind(0) and returns an iterator over the
+        # result.
+        # NB: We have intentionally skipped __torch_function__ dispatch here.
+        # See gh-54457
+        if self.dim() == 0:
+            raise TypeError("iteration over a 0-d tensor")
+        if torch._C._get_tracing_state():
+            warnings.warn(
+                "Iterating over a tensor might cause the trace to be incorrect. "
+                "Passing a tensor of different shape won't change the number of "
+                "iterations executed (and might lead to errors or silently give "
+                "incorrect results).",
+                category=torch.jit.TracerWarning,
+                stacklevel=2,
+            )
+        return iter(self.unbind(0))
+
+    def __hash__(self):
+        """Hash by object identity: returns ``id(self)``."""
+        # Do NOT handle __torch_function__ here as user's default
+        # implementation that handle most functions will most likely do it wrong.
+        # It can be easily overridden by defining this method on the user
+        # subclass if needed.
+        return id(self)
+
+    def __dir__(self):
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__dir__, (self,), self)
+        tensor_methods = dir(self.__class__)
+        tensor_methods.remove("volatile")  # deprecated
+        attrs = list(self.__dict__.keys())
+        keys = tensor_methods + attrs
+
+        # property only available dense, cuda tensors
+        if (not self.is_cuda) or self.is_sparse:
+            keys.remove("__cuda_array_interface__")
+
+        return sorted(keys)
+
+    # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray`
+    __array_priority__ = 1000  # prefer Tensor ops over numpy ones
+
+    def __array__(self, dtype=None):
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__array__, (self,), self, dtype=dtype)
+        if dtype is None:
+            return self.numpy()
+        else:
+            return self.numpy().astype(dtype, copy=False)
+
+    # Wrap Numpy array again in a suitable tensor when done, to support e.g.
+    # `numpy.sin(tensor) -> tensor` or `numpy.greater(tensor, 0) -> ByteTensor`
+    def __array_wrap__(self, array):
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.__array_wrap__, (self,), self, array=array
+            )
+        if array.dtype == bool:
+            # Workaround, torch has no built-in bool tensor
+            array = array.astype("uint8")
+        return torch.from_numpy(array)
+
+    def __contains__(self, element):
+        r"""Check if `element` is present in tensor
+
+        Args:
+            element (Tensor or scalar): element to be checked
+                for presence in current tensor"
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__contains__, (self,), self, element)
+        if isinstance(
+            element, (torch.Tensor, Number, torch.SymInt, torch.SymFloat, torch.SymBool)
+        ):
+            # type hint doesn't understand the __contains__ result array
+            return (element == self).any().item()  # type: ignore[union-attr]
+
+        raise RuntimeError(
+            f"Tensor.__contains__ only supports Tensor or scalar, but you passed in a {type(element)}."
+        )
+
+    @property
+    def __cuda_array_interface__(self):
+        """Array view description for cuda tensors.
+
+        See:
+        https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
+        """
+        if has_torch_function_unary(self):
+            # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185
+            return handle_torch_function(Tensor.__cuda_array_interface__.__get__, (self,), self)  # type: ignore[attr-defined]
+
+        # raise AttributeError for unsupported tensors, so that
+        # hasattr(cpu_tensor, "__cuda_array_interface__") is False.
+        if not self.is_cuda:
+            raise AttributeError(
+                "Can't get __cuda_array_interface__ on non-CUDA tensor type: %s "
+                "If CUDA data is required use tensor.cuda() to copy tensor to device memory."
+                % self.type()
+            )
+
+        if self.is_sparse:
+            raise AttributeError(
+                "Can't get __cuda_array_interface__ on sparse type: %s "
+                "Use Tensor.to_dense() to convert to a dense tensor first."
+                % self.type()
+            )
+
+        # RuntimeError, matching tensor.__array__() behavior.
+        if self.requires_grad:
+            raise RuntimeError(
+                "Can't get __cuda_array_interface__ on Variable that requires grad. "
+                "If gradients aren't required, use var.detach() to get Variable that doesn't require grad."
+            )
+
+        # CUDA devices are little-endian and tensors are stored in native byte
+        # order. 1-byte entries are endian-agnostic.
+        typestr = {
+            torch.complex64: "<c8",
+            torch.complex128: "<c16",
+            torch.float16: "<f2",
+            torch.float32: "<f4",
+            torch.float64: "<f8",
+            torch.uint8: "|u1",
+            torch.int8: "|i1",
+            torch.int16: "<i2",
+            torch.int32: "<i4",
+            torch.int64: "<i8",
+        }[self.dtype]
+
+        itemsize = self.element_size()
+
+        shape = tuple(self.shape)
+        if self.is_contiguous():
+            # __cuda_array_interface__ v2 requires the strides to be omitted
+            # (either not set or set to None) for C-contiguous arrays.
+            strides = None
+        else:
+            strides = tuple(s * itemsize for s in self.stride())
+        data_ptr = self.data_ptr() if self.numel() > 0 else 0
+        data = (data_ptr, False)  # read-only is false
+
+        return dict(typestr=typestr, shape=shape, strides=strides, data=data, version=2)
+
+    def storage_type(self):
+        r"""storage_type() -> type
+
+        Returns the type of the underlying storage.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.storage_type, (self,), self)
+
+        torch.storage._warn_typed_storage_removal()
+
+        return self._typed_storage()._get_legacy_storage_class()
+
+    def refine_names(self, *names):
+        r"""Refines the dimension names of :attr:`self` according to :attr:`names`.
+
+        Refining is a special case of renaming that "lifts" unnamed dimensions.
+        A ``None`` dim can be refined to have any name; a named dim can only be
+        refined to have the same name.
+
+        Because named tensors can coexist with unnamed tensors, refining names
+        gives a nice way to write named-tensor-aware code that works with both
+        named and unnamed tensors.
+
+        :attr:`names` may contain up to one Ellipsis (``...``).
+        The Ellipsis is expanded greedily; it is expanded in-place to fill
+        :attr:`names` to the same length as ``self.dim()`` using names from the
+        corresponding indices of ``self.names``.
+
+        Python 2 does not support Ellipsis but one may use a string literal
+        instead (``'...'``).
+
+        Args:
+            names (iterable of str): The desired names of the output tensor. May
+                contain up to one Ellipsis.
+
+        Examples::
+
+            >>> imgs = torch.randn(32, 3, 128, 128)
+            >>> named_imgs = imgs.refine_names('N', 'C', 'H', 'W')
+            >>> named_imgs.names
+            ('N', 'C', 'H', 'W')
+
+            >>> tensor = torch.randn(2, 3, 5, 7, 11)
+            >>> tensor = tensor.refine_names('A', ..., 'B', 'C')
+            >>> tensor.names
+            ('A', None, None, 'B', 'C')
+
+        .. warning::
+            The named tensor API is experimental and subject to change.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.refine_names, (self,), self, *names)
+        names = resolve_ellipsis(names, self.names, "refine_names")
+        return super().refine_names(names)
+
+    def align_to(self, *names):
+        r"""Permutes the dimensions of the :attr:`self` tensor to match the order
+        specified in :attr:`names`, adding size-one dims for any new names.
+
+        All of the dims of :attr:`self` must be named in order to use this method.
+        The resulting tensor is a view on the original tensor.
+
+        All dimension names of :attr:`self` must be present in :attr:`names`.
+        :attr:`names` may contain additional names that are not in ``self.names``;
+        the output tensor has a size-one dimension for each of those new names.
+
+        :attr:`names` may contain up to one Ellipsis (``...``).
+        The Ellipsis is expanded to be equal to all dimension names of :attr:`self`
+        that are not mentioned in :attr:`names`, in the order that they appear
+        in :attr:`self`.
+
+        Python 2 does not support Ellipsis but one may use a string literal
+        instead (``'...'``).
+
+        Args:
+            names (iterable of str): The desired dimension ordering of the
+                output tensor. May contain up to one Ellipsis that is expanded
+                to all unmentioned dim names of :attr:`self`.
+
+        Examples::
+
+            >>> tensor = torch.randn(2, 2, 2, 2, 2, 2)
+            >>> named_tensor = tensor.refine_names('A', 'B', 'C', 'D', 'E', 'F')
+
+            # Move the F and E dims to the front while keeping the rest in order
+            >>> named_tensor.align_to('F', 'E', ...)
+
+        .. warning::
+            The named tensor API is experimental and subject to change.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.align_to, (self,), self, *names)
+        ellipsis_idx = single_ellipsis_index(names, "align_to")
+        if ellipsis_idx is None:
+            return super().align_to(names)
+        return super().align_to(
+            [name for name in names if not is_ellipsis(name)], ellipsis_idx
+        )
+
+    def unflatten(self, dim, sizes):
+        r"""
+        unflatten(dim, sizes) -> Tensor
+
+        See :func:`torch.unflatten`.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.unflatten, (self,), self, dim, sizes)
+
+        if not sizes:
+            raise RuntimeError("unflatten: sizes must be non-empty")
+
+        names = None
+        if isinstance(sizes, OrderedDict) or (
+            isinstance(sizes, (tuple, list)) and isinstance(sizes[0], (tuple, list))
+        ):
+            names, sizes = unzip_namedshape(sizes)
+            return super().unflatten(dim, sizes, names)
+        else:
+            return super().unflatten(dim, sizes)
+
+    def rename_(self, *names, **rename_map):
+        """In-place version of :meth:`~Tensor.rename`."""
+
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.rename_, (self,), self, *names, **rename_map
+            )
+
+        # Note [rename_ / rename API]
+        # The Python API for these is different from the C++ API. In Python:
+        # 1) tensor.rename(*names) takes a vararglist of names
+        # 2) tensor.rename(**rename_map) takes a map of names to rename.
+        # C++ is static, making it difficult to implement similar behavior.
+        return update_names(self, names, rename_map, inplace=True)
+
+    def rename(self, *names, **rename_map):
+        """Renames dimension names of :attr:`self`.
+
+        There are two main usages:
+
+        ``self.rename(**rename_map)`` returns a view on tensor that has dims
+        renamed as specified in the mapping :attr:`rename_map`.
+
+        ``self.rename(*names)`` returns a view on tensor, renaming all
+        dimensions positionally using :attr:`names`.
+        Use ``self.rename(None)`` to drop names on a tensor.
+
+        One cannot specify both positional args :attr:`names` and keyword args
+        :attr:`rename_map`.
+
+        Examples::
+
+            >>> imgs = torch.rand(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
+            >>> renamed_imgs = imgs.rename(N='batch', C='channels')
+            >>> renamed_imgs.names
+            ('batch', 'channels', 'H', 'W')
+
+            >>> renamed_imgs = imgs.rename(None)
+            >>> renamed_imgs.names
+            (None, None, None, None)
+
+            >>> renamed_imgs = imgs.rename('batch', 'channel', 'height', 'width')
+            >>> renamed_imgs.names
+            ('batch', 'channel', 'height', 'width')
+
+        .. warning::
+            The named tensor API is experimental and subject to change.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor.rename, (self,), self, *names, **rename_map
+            )
+
+        # See Note [rename_ / rename API]
+        return update_names(self, names, rename_map, inplace=False)
+
+    def to_sparse_coo(self):
+        """Convert a tensor to :ref:`coordinate format <sparse-coo-docs>`.
+
+        Examples::
+
+             >>> dense = torch.randn(5, 5)
+             >>> sparse = dense.to_sparse_coo()
+             >>> sparse._nnz()
+             25
+
+        """
+        return self.to_sparse()
+
+    def dim_order(self):
+        """
+
+        dim_order() -> tuple
+
+        Returns a tuple of int describing the dim order or physical layout of :attr:`self`.
+
+        Args:
+            None
+
+        Dim order represents how dimensions are laid out in memory,
+        starting from the outermost to the innermost dimension.
+
+        Example::
+            >>> torch.empty((2, 3, 5, 7)).dim_order()
+            (0, 1, 2, 3)
+            >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).dim_order()
+            (0, 2, 3, 1)
+
+        .. warning::
+            The dim_order tensor API is experimental and subject to change.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.dim_order, (self,), self)
+
+        import torch._prims_common as utils
+
+        return tuple(utils.compute_elementwise_output_logical_to_physical_perm(self))
+
+    def _update_names(self, names, inplace):
+        if has_torch_function_unary(self):
+            return handle_torch_function(
+                Tensor._update_names, (self,), self, names, inplace
+            )
+
+        # See Note [rename_ / rename API]
+        if inplace:
+            return super().rename_(names)
+        else:
+            return super().rename(names)
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        """
+        This __torch_function__ implementation wraps subclasses such that
+        methods called on subclasses return a subclass instance instead of
+        a ``torch.Tensor`` instance.
+
+        One corollary to this is that you need coverage for torch.Tensor
+        methods if implementing __torch_function__ for subclasses.
+
+        We recommend always calling ``super().__torch_function__`` as the base
+        case when doing the above.
+
+        While not mandatory, we recommend making `__torch_function__` a classmethod.
+        """
+        if kwargs is None:
+            kwargs = {}
+
+        if not all(issubclass(cls, t) for t in types):
+            return NotImplemented
+
+        with _C.DisableTorchFunctionSubclass():
+            ret = func(*args, **kwargs)
+            if func in get_default_nowrap_functions():
+                return ret
+            else:
+                return _convert(ret, cls)
+
+    __torch_dispatch__ = _C._disabled_torch_dispatch_impl
+
+    def __dlpack__(self, stream=None):
+        """
+        Creates a DLPack `capsule <https://data-apis.org/array-api/latest/design_topics/data_interchange.html#data-interchange>`_
+        of the current tensor to be exported to other libraries.
+
+        This function will be called from the `from_dlpack` method
+        of the library that will consume the capsule. `from_dlpack` passes the current
+        stream to this method as part of the specification.
+
+        Args:
+            stream (integer or None): An optional Python integer representing a
+            pointer to a CUDA stream. The current stream is synchronized with
+            this stream before the capsule is created, and since the capsule
+            shares its storage with the tensor this makes it safe to access from
+            both streams.  If None or -1 is passed then no synchronization is performed.
+            If 1 (on CUDA) or 0 (on ROCM) then the default stream is used for
+            synchronization.
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__dlpack__, (self,), self, stream)
+
+        # DLPack capsules can't capture all of PyTorch's semantics,
+        # so we prohibit exporting tensors that would lose their properties like
+        # requires_grad and having the conjugate bit set.
+        if self.requires_grad:
+            raise RuntimeError(
+                "Can't export tensors that require gradient, use tensor.detach()"
+            )
+        if self.is_conj():
+            raise RuntimeError("Can't export tensors with the conjugate bit set")
+        if self.layout != torch.strided:
+            raise RuntimeError(
+                "Can't export tensors with layout other than torch.strided"
+            )
+
+        if stream is not None and type(stream) is not int:
+            # Stream pointers in CUDA/ROCm are uniquely numbered and can
+            # be retrieved from their integer value.
+            raise TypeError("stream must be ``int`` or ``none``")
+        elif stream is not None and stream != -1:
+            if self.device.type == "cuda":
+                # NB: This logic handles the special case values for default
+                # streams and must be kept in sync with from_dlpack in
+                # torch/utils/dlpack.py
+                if stream == 1 and torch.version.hip is None:
+                    stream = torch.cuda.default_stream()
+                elif stream == 0 and torch.version.hip is not None:
+                    stream = torch.cuda.default_stream()
+                else:
+                    stream = torch.cuda.ExternalStream(stream)
+                # Only synchronize on different streams
+                sync_stream = torch.cuda.current_stream()
+                if stream != sync_stream:
+                    event = torch.cuda.Event()
+                    event.record(sync_stream)
+                    stream.wait_event(event)
+        return torch.to_dlpack(self)
+
+    def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.__dlpack_device__, (self,), self)
+        device = self.device
+        idx = device.index if device.index is not None else 0
+        torch_device_type = device.type
+        if torch_device_type == "cuda" and torch.version.hip is not None:
+            device_type = DLDeviceType.kDLROCM
+        elif torch_device_type == "cpu" and self.is_pinned():
+            device_type = DLDeviceType.kDLCPUPinned
+        elif torch_device_type == "cuda":
+            device_type = DLDeviceType.kDLGPU
+        elif torch_device_type == "cpu":
+            device_type = DLDeviceType.kDLCPU
+        elif self.device.type == "xpu":
+            device_type = DLDeviceType.kDLOneAPI
+        else:
+            raise ValueError(f"Unknown device type {torch_device_type} for Dlpack")
+        return (device_type, idx)
+
+    __module__ = "torch"
+
+
+def _convert(ret, cls):
+    if cls is Tensor:
+        return ret
+
+    if isinstance(ret, Tensor) and not isinstance(ret, cls):
+        ret = ret.as_subclass(cls)
+
+    if isinstance(ret, (tuple, list)):
+        # Also handles things like namedtuples
+        ret = type(ret)(_convert(r, cls) for r in ret)
+
+    return ret
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_vmap_internals.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_vmap_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..8440abccb23904e935878e245b390465e04b5db0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_vmap_internals.py
@@ -0,0 +1,237 @@
+import functools
+import warnings
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from torch.utils._pytree import _broadcast_to_and_flatten, tree_flatten, tree_unflatten
+
+in_dims_t = Union[int, Tuple]
+out_dims_t = Union[int, Tuple[int, ...]]
+
+
+# Checks that all args-to-be-batched have the same batch dim size
+def _validate_and_get_batch_size(
+    flat_in_dims: List[Optional[int]], flat_args: List
+) -> int:
+    batch_sizes = [
+        arg.size(in_dim)
+        for in_dim, arg in zip(flat_in_dims, flat_args)
+        if in_dim is not None
+    ]
+    if batch_sizes and any(size != batch_sizes[0] for size in batch_sizes):
+        raise ValueError(
+            f"vmap: Expected all tensors to have the same size in the mapped "
+            f"dimension, got sizes {batch_sizes} for the mapped dimension"
+        )
+    return batch_sizes[0]
+
+
+def _num_outputs(batched_outputs: Union[Tensor, Tuple[Tensor, ...]]) -> int:
+    if isinstance(batched_outputs, tuple):
+        return len(batched_outputs)
+    return 1
+
+
+# If value is a tuple, check it has length `num_elements`.
+# If value is not a tuple, make a tuple with `value` repeated `num_elements` times
+def _as_tuple(
+    value: Any, num_elements: int, error_message_lambda: Callable[[], str]
+) -> Tuple:
+    if not isinstance(value, tuple):
+        return (value,) * num_elements
+    if len(value) != num_elements:
+        raise ValueError(error_message_lambda())
+    return value
+
+
+# Creates BatchedTensors for every Tensor in arg that should be batched.
+# Returns the (potentially) batched arguments and the batch_size.
+def _create_batched_inputs(
+    in_dims: in_dims_t, args: Tuple, vmap_level: int, func: Callable
+) -> Tuple[Tuple, int]:
+    if not isinstance(in_dims, int) and not isinstance(in_dims, tuple):
+        raise ValueError(
+            f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(<inputs>): "
+            f"expected `in_dims` to be int or a (potentially nested) tuple "
+            f"matching the structure of inputs, got: {type(in_dims)}."
+        )
+    if len(args) == 0:
+        raise ValueError(
+            f"vmap({_get_name(func)})(<inputs>): got no inputs. Maybe you forgot to add "
+            f"inputs, or you are trying to vmap over a function with no inputs. "
+            f"The latter is unsupported."
+        )
+
+    flat_args, args_spec = tree_flatten(args)
+    flat_in_dims = _broadcast_to_and_flatten(in_dims, args_spec)
+    if flat_in_dims is None:
+        raise ValueError(
+            f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(<inputs>): "
+            f"in_dims is not compatible with the structure of `inputs`. "
+            f"in_dims has structure {tree_flatten(in_dims)[1]} but inputs "
+            f"has structure {args_spec}."
+        )
+
+    for arg, in_dim in zip(flat_args, flat_in_dims):
+        if not isinstance(in_dim, int) and in_dim is not None:
+            raise ValueError(
+                f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(<inputs>): "
+                f"Got in_dim={in_dim} for an input but in_dim must be either "
+                f"an integer dimension or None."
+            )
+        if isinstance(in_dim, int) and not isinstance(arg, Tensor):
+            raise ValueError(
+                f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(<inputs>): "
+                f"Got in_dim={in_dim} for an input but the input is of type "
+                f"{type(arg)}. We cannot vmap over non-Tensor arguments, "
+                f"please use None as the respective in_dim"
+            )
+        if in_dim is not None and (in_dim < 0 or in_dim >= arg.dim()):
+            raise ValueError(
+                f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(<inputs>): "
+                f"Got in_dim={in_dim} for some input, but that input is a Tensor "
+                f"of dimensionality {arg.dim()} so expected in_dim to satisfy "
+                f"0 <= in_dim < {arg.dim()}."
+            )
+
+    batch_size = _validate_and_get_batch_size(flat_in_dims, flat_args)
+    # See NOTE [Ignored _remove_batch_dim, _add_batch_dim]
+    batched_inputs = [
+        arg if in_dim is None else torch._add_batch_dim(arg, in_dim, vmap_level)
+        for in_dim, arg in zip(flat_in_dims, flat_args)
+    ]
+    return tree_unflatten(batched_inputs, args_spec), batch_size
+
+
+# Undoes the batching (and any batch dimensions) associated with the `vmap_level`.
+def _unwrap_batched(
+    batched_outputs: Union[Tensor, Tuple[Tensor, ...]],
+    out_dims: out_dims_t,
+    vmap_level: int,
+    batch_size: int,
+    func: Callable,
+    allow_none_pass_through: bool = False,
+) -> Tuple:
+    num_outputs = _num_outputs(batched_outputs)
+    out_dims_as_tuple = _as_tuple(
+        out_dims,
+        num_outputs,
+        lambda: f"vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must "
+        f"have one dim per output (got {num_outputs} outputs) of {_get_name(func)}.",
+    )
+
+    # NOTE [Ignored _remove_batch_dim, _add_batch_dim]
+    # There is something wrong with our type bindings for functions that begin
+    # with '_', see #40397.
+    if isinstance(batched_outputs, Tensor):
+        out_dim = out_dims_as_tuple[0]
+        return torch._remove_batch_dim(batched_outputs, vmap_level, batch_size, out_dim)  # type: ignore[return-value]
+    if allow_none_pass_through:
+        return tuple(
+            (
+                torch._remove_batch_dim(out, vmap_level, batch_size, out_dim)
+                if out is not None
+                else None
+            )
+            for out, out_dim in zip(batched_outputs, out_dims_as_tuple)
+        )
+    else:
+        return tuple(
+            torch._remove_batch_dim(out, vmap_level, batch_size, out_dim)
+            for out, out_dim in zip(batched_outputs, out_dims_as_tuple)
+        )
+
+
+# Checks that `fn` returned one or more Tensors and nothing else.
+# NB: A Python function that returns multiple values returns a single tuple,
+# so we are effectively checking that `outputs` is a single Tensor or a tuple of
+# Tensors.
+def _validate_outputs(outputs: Any, func: Callable) -> None:
+    if isinstance(outputs, Tensor):
+        return
+    if not isinstance(outputs, tuple):
+        raise ValueError(
+            f"vmap({_get_name(func)}, ...): `{_get_name(func)}` must only return "
+            f"Tensors, got type {type(outputs)} as the return."
+        )
+    for idx, output in enumerate(outputs):
+        if isinstance(output, Tensor):
+            continue
+        raise ValueError(
+            f"vmap({_get_name(func)}, ...): `{_get_name(func)}` must only return "
+            f"Tensors, got type {type(output)} for return {idx}."
+        )
+
+
+def _check_out_dims_is_int_or_int_tuple(out_dims: out_dims_t, func: Callable) -> None:
+    if isinstance(out_dims, int):
+        return
+    if not isinstance(out_dims, tuple) or not all(
+        isinstance(out_dim, int) for out_dim in out_dims
+    ):
+        raise ValueError(
+            f"vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must be "
+            f"an int or a tuple of int representing where in the outputs the "
+            f"vmapped dimension should appear."
+        )
+
+
+def _get_name(func: Callable):
+    if hasattr(func, "__name__"):
+        return func.__name__
+
+    # Not all callables have __name__; in fact, only static functions/methods do.
+    # A callable created via functools.partial or an nn.Module instance, to name
+    # some examples, doesn't have a __name__.
+    return repr(func)
+
+
+# vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors,
+# sends those into func, and then unwraps the output BatchedTensors. Operations
+# on BatchedTensors perform the batched operations that the user is asking for.
+def vmap(func: Callable, in_dims: in_dims_t = 0, out_dims: out_dims_t = 0) -> Callable:
+    """
+    Please use torch.vmap instead of this API.
+    """
+    warnings.warn(
+        "Please use torch.vmap instead of torch._vmap_internals.vmap. ",
+        stacklevel=2,
+    )
+    return _vmap(func, in_dims, out_dims)
+
+
+# A version of vmap without the "Please use torch.vmap instead" warning emitted by `vmap`
+def _vmap(
+    func: Callable,
+    in_dims: in_dims_t = 0,
+    out_dims: out_dims_t = 0,
+    allow_none_pass_through: bool = False,
+) -> Callable:
+    # The `allow_none_pass_through` argument is a temporary workaround and may be removed.
+    # Currently it enables us to wrap the call in `autograd.grad` to the autograd engine,
+    # which may return None if any of the inputs are unused. See the issue discussing this:
+    # https://github.com/facebookresearch/functorch/issues/159.
+    @functools.wraps(func)
+    def wrapped(*args):
+        _check_out_dims_is_int_or_int_tuple(out_dims, func)
+        vmap_level = torch._C._vmapmode_increment_nesting()
+        try:
+            batched_inputs, batch_size = _create_batched_inputs(
+                in_dims, args, vmap_level, func
+            )
+            batched_outputs = func(*batched_inputs)
+            if not allow_none_pass_through:
+                _validate_outputs(batched_outputs, func)
+            return _unwrap_batched(
+                batched_outputs,
+                out_dims,
+                vmap_level,
+                batch_size,
+                func,
+                allow_none_pass_through=allow_none_pass_through,
+            )
+        finally:
+            torch._C._vmapmode_decrement_nesting()
+
+    return wrapped
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_weights_only_unpickler.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_weights_only_unpickler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8f24bef1b51d2b978eb1083ad405832057c2107
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_weights_only_unpickler.py
@@ -0,0 +1,306 @@
+# Unpickler restricted to loading only state dicts
+# Restrict constructing types to a list defined in _get_allowed_globals()
+# Restrict BUILD operation to `Tensor`, `Parameter` and `OrderedDict` types only
+# Restrict APPEND/APPENDS to `list`
+# In the `GLOBAL` operation, do not do class lookup by name, but rather rely on the
+# dictionary defined by the `_get_allowed_globals()` function, which contains:
+# - torch types (Storage, dtypes, Tensor, `torch.Size`),
+# - `torch._utils._rebuild` functions.
+# - `torch.nn.Parameter`
+# - `collections.OrderedDict`
+
+# Based on https://github.com/python/cpython/blob/main/Lib/pickle.py
+# Expected to be useful for loading PyTorch model weights
+# For example:
+# data = urllib.request.urlopen('https://download.pytorch.org/models/resnet50-0676ba61.pth').read()
+# buf = io.BytesIO(data)
+# weights = torch.load(buf, weights_only = True)
+
+import functools as _functools
+from collections import OrderedDict
+from pickle import (
+    APPEND,
+    APPENDS,
+    BINFLOAT,
+    BINGET,
+    BININT,
+    BININT1,
+    BININT2,
+    BINPERSID,
+    BINPUT,
+    BINUNICODE,
+    BUILD,
+    bytes_types,
+    decode_long,
+    EMPTY_DICT,
+    EMPTY_LIST,
+    EMPTY_SET,
+    EMPTY_TUPLE,
+    GLOBAL,
+    LONG1,
+    LONG_BINGET,
+    LONG_BINPUT,
+    MARK,
+    NEWFALSE,
+    NEWOBJ,
+    NEWTRUE,
+    NONE,
+    PROTO,
+    REDUCE,
+    SETITEM,
+    SETITEMS,
+    SHORT_BINSTRING,
+    STOP,
+    TUPLE,
+    TUPLE1,
+    TUPLE2,
+    TUPLE3,
+    UnpicklingError,
+)
+from struct import unpack
+from sys import maxsize
+from typing import Any, Dict, List
+
+import torch
+
+
+# Unpickling machinery
+@_functools.lru_cache(maxsize=1)
+def _get_allowed_globals():
+    rc: Dict[str, Any] = {
+        "collections.OrderedDict": OrderedDict,
+        "torch.nn.parameter.Parameter": torch.nn.Parameter,
+        "torch.serialization._get_layout": torch.serialization._get_layout,
+        "torch.Size": torch.Size,
+        "torch.Tensor": torch.Tensor,
+    }
+    # dtype
+    for t in [
+        torch.complex32,
+        torch.complex64,
+        torch.complex128,
+        torch.float8_e5m2,
+        torch.float8_e4m3fn,
+        torch.float8_e5m2fnuz,
+        torch.float8_e4m3fnuz,
+        torch.float16,
+        torch.float32,
+        torch.float64,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+    ]:
+        rc[str(t)] = t
+    # Tensor classes
+    for tt in torch._tensor_classes:
+        rc[f"{tt.__module__}.{tt.__name__}"] = tt
+    # Storage classes
+    for ts in torch._storage_classes:
+        if ts not in (torch.storage.TypedStorage, torch.storage.UntypedStorage):
+            # Wrap legacy storage types in a dummy class
+            rc[f"{ts.__module__}.{ts.__name__}"] = torch.serialization.StorageType(
+                ts.__name__
+            )
+        else:
+            rc[f"{ts.__module__}.{ts.__name__}"] = ts
+    # Rebuild functions
+    for f in [
+        torch._utils._rebuild_parameter,
+        torch._utils._rebuild_tensor,
+        torch._utils._rebuild_tensor_v2,
+        torch._utils._rebuild_tensor_v3,
+        torch._utils._rebuild_sparse_tensor,
+        torch._utils._rebuild_meta_tensor_no_storage,
+        torch._utils._rebuild_nested_tensor,
+    ]:
+        rc[f"torch._utils.{f.__name__}"] = f
+
+    # Handles Tensor subclasses and Tensors with attributes.
+    # NOTE: It calls into above rebuild functions for regular Tensor types.
+    rc["torch._tensor._rebuild_from_type_v2"] = torch._tensor._rebuild_from_type_v2
+    return rc
+
+
+class Unpickler:
+    def __init__(self, file, *, encoding: str = "bytes"):
+        self.encoding = encoding
+        self.readline = file.readline
+        self.read = file.read
+        self.memo: Dict[int, Any] = {}
+
+    def load(self):
+        """Read a pickled object representation from the open file.
+        Raises RuntimeError for any global/opcode outside the allow-list, EOFError on truncated input.
+        Return the reconstituted object hierarchy specified in the file.
+        """
+        self.metastack = []
+        self.stack: List[Any] = []
+        self.append = self.stack.append
+        read = self.read
+        readline = self.readline
+        while True:
+            key = read(1)
+            if not key:
+                raise EOFError
+            assert isinstance(key, bytes_types)
+            # Risky operators
+            if key[0] == GLOBAL[0]:
+                module = readline()[:-1].decode("utf-8")
+                name = readline()[:-1].decode("utf-8")
+                full_path = f"{module}.{name}"
+                if full_path in _get_allowed_globals():
+                    self.append(_get_allowed_globals()[full_path])
+                else:
+                    raise RuntimeError(f"Unsupported class {full_path}")
+            elif key[0] == NEWOBJ[0]:
+                args = self.stack.pop()
+                cls = self.stack.pop()
+                if cls is not torch.nn.Parameter:
+                    raise RuntimeError(f"Trying to instantiate unsupported class {cls}")
+                self.append(torch.nn.Parameter(*args))
+            elif key[0] == REDUCE[0]:
+                args = self.stack.pop()
+                func = self.stack[-1]
+                if func not in _get_allowed_globals().values():
+                    raise RuntimeError(
+                        f"Trying to call reduce for unrecognized function {func}"
+                    )
+                self.stack[-1] = func(*args)
+            elif key[0] == BUILD[0]:
+                state = self.stack.pop()
+                inst = self.stack[-1]
+                if type(inst) is torch.Tensor:
+                    # Legacy unpickling
+                    inst.set_(*state)
+                elif type(inst) is torch.nn.Parameter:
+                    inst.__setstate__(state)
+                elif type(inst) is OrderedDict:
+                    inst.__dict__.update(state)
+                else:
+                    raise RuntimeError(
+                        f"Can only build Tensor, parameter or dict objects, but got {type(inst)}"
+                    )
+            # Stack manipulation
+            elif key[0] == APPEND[0]:
+                item = self.stack.pop()
+                list_obj = self.stack[-1]
+                if type(list_obj) is not list:
+                    raise RuntimeError(
+                        f"Can only append to lists, but got {type(list_obj)}"
+                    )
+                list_obj.append(item)
+            elif key[0] == APPENDS[0]:
+                items = self.pop_mark()
+                list_obj = self.stack[-1]
+                if type(list_obj) is not list:
+                    raise RuntimeError(
+                        f"Can only extend lists, but got {type(list_obj)}"
+                    )
+                list_obj.extend(items)
+            elif key[0] == SETITEM[0]:
+                (v, k) = (self.stack.pop(), self.stack.pop())
+                self.stack[-1][k] = v
+            elif key[0] == SETITEMS[0]:
+                items = self.pop_mark()
+                for i in range(0, len(items), 2):
+                    self.stack[-1][items[i]] = items[i + 1]
+            elif key[0] == MARK[0]:
+                self.metastack.append(self.stack)
+                self.stack = []
+                self.append = self.stack.append
+            elif key[0] == TUPLE[0]:
+                items = self.pop_mark()
+                self.append(tuple(items))
+            elif key[0] == TUPLE1[0]:
+                self.stack[-1] = (self.stack[-1],)
+            elif key[0] == TUPLE2[0]:
+                self.stack[-2:] = [(self.stack[-2], self.stack[-1])]
+            elif key[0] == TUPLE3[0]:
+                self.stack[-3:] = [(self.stack[-3], self.stack[-2], self.stack[-1])]
+            # Basic types construction
+            elif key[0] == NONE[0]:
+                self.append(None)
+            elif key[0] == NEWFALSE[0]:
+                self.append(False)
+            elif key[0] == NEWTRUE[0]:
+                self.append(True)
+            elif key[0] == EMPTY_TUPLE[0]:
+                self.append(())
+            elif key[0] == EMPTY_LIST[0]:
+                self.append([])
+            elif key[0] == EMPTY_DICT[0]:
+                self.append({})
+            elif key[0] == EMPTY_SET[0]:
+                self.append(set())
+            elif key[0] == BININT[0]:
+                self.append(unpack("<i", read(4))[0])
+            elif key[0] == BININT1[0]:
+                self.append(self.read(1)[0])
+            elif key[0] == BININT2[0]:
+                self.append(unpack("<H", read(2))[0])
+            elif key[0] == BINFLOAT[0]:
+                self.append(unpack(">d", self.read(8))[0])
+            elif key[0] == BINUNICODE[0]:
+                strlen = unpack("<I", read(4))[0]
+                if strlen > maxsize:
+                    raise RuntimeError("String is too long")
+                strval = str(read(strlen), "utf-8", "surrogatepass")
+                self.append(strval)
+            elif key[0] == SHORT_BINSTRING[0]:
+                strlen = read(1)[0]
+                strdata = read(strlen)
+                if self.encoding != "bytes":
+                    strdata = strdata.decode(self.encoding, "strict")
+                self.append(strdata)
+            elif key[0] == BINPERSID[0]:
+                pid = self.stack.pop()
+                # Only allow persistent load of storage; the id itself must be a tuple or an int
+                if type(pid) is not tuple and type(pid) is not int:
+                    raise RuntimeError(
+                        f"persistent_load id must be tuple or int, but got {type(pid)}"
+                    )
+                if (
+                    type(pid) is tuple
+                    and len(pid) > 0
+                    and torch.serialization._maybe_decode_ascii(pid[0]) != "storage"
+                ):
+                    raise RuntimeError(
+                        f"Only persistent_load of storage is allowed, but got {pid[0]}"
+                    )
+                self.append(self.persistent_load(pid))
+            elif key[0] in [BINGET[0], LONG_BINGET[0]]:
+                idx = (read(1) if key[0] == BINGET[0] else unpack("<I", read(4)))[0]
+                self.append(self.memo[idx])
+            elif key[0] in [BINPUT[0], LONG_BINPUT[0]]:
+                i = (read(1) if key[0] == BINPUT[0] else unpack("<I", read(4)))[0]
+                if i < 0:
+                    raise ValueError("negative argument")
+                self.memo[i] = self.stack[-1]
+            elif key[0] == LONG1[0]:
+                n = read(1)[0]
+                data = read(n)
+                self.append(decode_long(data))
+            # First and last deserializer ops
+            elif key[0] == PROTO[0]:
+                # Read and ignore proto version
+                read(1)[0]
+            elif key[0] == STOP[0]:
+                rc = self.stack.pop()
+                return rc
+            else:
+                raise RuntimeError(f"Unsupported operand {key[0]}")
+
+    # Return a list of items pushed in the stack after last MARK instruction.
+    def pop_mark(self):
+        items = self.stack
+        self.stack = self.metastack.pop()
+        self.append = self.stack.append
+        return items
+
+    def persistent_load(self, pid):
+        raise UnpicklingError("unsupported persistent id encountered")
+
+
+def load(file, *, encoding: str = "ASCII"):
+    return Unpickler(file, encoding=encoding).load()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..688f475cfb0c09f1665478393cae5220251ac6c2
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/dynamic/modules/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8168b30406a8b0c27251d466b3a9195016eba64
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/dynamic/modules/__init__.py
@@ -0,0 +1,3 @@
+from .linear import Linear
+
+__all__ = ["Linear"]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/linear.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0b8e79a72753a0977d6da817d37b426e7977daf
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/linear.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fda5a58f2984ee05b0d167297b458f62c37fc59
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/__init__.py
@@ -0,0 +1 @@
+from . import quantized
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e200577528df2e8337db8dddd09a84799c72a90a
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..83a394f4df276171e7e5b2a1eb0cee843f9d4e99
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py
@@ -0,0 +1,5 @@
+from .linear import Linear
+
+__all__ = [
+    "Linear",
+]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae3b56d75c86e07cd99564a6a07c502415cd449b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..db48a825756b14c26b50b14cbc53d38c9545ac7f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/linear.py
@@ -0,0 +1,197 @@
+from typing import Optional
+
+import torch
+from torch.ao.nn.quantized.modules.utils import _quantize_weight, _hide_packed_params_repr
+
+__all__ = ['LinearPackedParams', 'Linear']
+
+# TODO (zaf): Inherit from `quantized.LinearPackedParams` (T83294430)
+class LinearPackedParams(torch.nn.Module):
+    _version = 1
+
+    def __init__(self, row_block_size=1, col_block_size=4, dtype=torch.qint8):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError("Linear prepacking only supports QINT8")
+        self.dtype = dtype
+        wq = torch._empty_affine_quantized([1, 1], scale=1.0, zero_point=0, dtype=torch.qint8)
+        self.set_weight_bias(wq, None, row_block_size, col_block_size)
+
+    def _get_name(self):
+        return "SparseQuantizedLinearPackedParams"
+
+    @torch.jit.export
+    def set_weight_bias(self, weight: torch.Tensor, bias: Optional[torch.Tensor],
+                        row_block_size: Optional[int], col_block_size: Optional[int]) -> None:
+        assert row_block_size is not None and col_block_size is not None
+        self._packed_params = torch.ops.sparse.qlinear_prepack(weight, bias, row_block_size, col_block_size)
+
+    @torch.jit.export
+    def _weight_bias(self):
+        (weight, bias, block_sizes) = torch.ops.sparse.qlinear_unpack(self._packed_params)
+        return (weight, bias, block_sizes[0], block_sizes[1])
+
+    def forward(self, x):
+        return x
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'dtype'] = self.dtype
+        destination[prefix + '_packed_params'] = self._weight_bias()
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        assert version is None or version <= self._version  # metadata may carry no version
+
+        self.dtype = state_dict.pop(prefix + 'dtype')
+        weight, bias, row_block_size, col_block_size = state_dict.pop(prefix + '_packed_params')
+        self.set_weight_bias(weight, bias, row_block_size, col_block_size)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    @torch.jit.export
+    def __getstate__(self):
+        return self._packed_params, self.training, self.dtype
+
+    @torch.jit.export
+    def __setstate__(self, state):
+        (self._packed_params, self.training, self.dtype) = state
+
+    def __repr__(self):
+        return self._weight_bias().__repr__()
+
+# TODO (zaf): Inherit from `quantized.Linear` (T83294430)
+class Linear(torch.nn.Module):
+    r"""
+    A quantized sparse linear module with quantized tensor as inputs and outputs.
+    """
+    _version = 1
+    _FLOAT_MODULE = torch.nn.Linear
+
+    def __init__(self, in_features, out_features, row_block_size, col_block_size, bias=True, dtype=torch.qint8):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError("Only QINT8 is supported for Sparse Quantized Linear")
+
+        self.in_features = in_features
+        self.out_features = out_features
+
+        if bias:
+            bias = torch.zeros(self.out_features, dtype=torch.float)
+        else:
+            bias = None
+
+        qweight = torch._empty_affine_quantized([out_features, in_features],
+                                                scale=1, zero_point=0, dtype=torch.qint8)
+        self._packed_params = LinearPackedParams(row_block_size=row_block_size,
+                                                 col_block_size=col_block_size,
+                                                 dtype=dtype)
+        self._packed_params.set_weight_bias(qweight, bias, row_block_size, col_block_size)
+        self.scale = 1.0
+        self.zero_point = 0
+
+    @classmethod
+    def _get_name(cls):
+        return 'SparseQuantizedLinear'
+
+    def extra_repr(self):
+        return 'in_features={}, out_features={}, scale={}, zero_point={}, qscheme={}'.format(
+            self.in_features, self.out_features, self.scale, self.zero_point, self.weight().qscheme()
+        )
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.sparse.qlinear(x, self._packed_params._packed_params, self.scale, self.zero_point)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'scale'] = torch.tensor(self.scale)
+        destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.scale = float(state_dict[prefix + 'scale'])
+        state_dict.pop(prefix + 'scale')
+
+        self.zero_point = int(state_dict[prefix + 'zero_point'])
+        state_dict.pop(prefix + 'zero_point')
+
+        # 'op_type' is not written by _save_to_state_dict; discard it only if present.
+        state_dict.pop(prefix + 'op_type', None)
+
+        version = local_metadata.get('version', None)
+        assert version is None or version <= self._version  # metadata may carry no version
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor],
+                        row_block_size: Optional[int], col_block_size: Optional[int]) -> None:
+        assert row_block_size is not None and col_block_size is not None
+        self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized sparse module from a float module.
+
+        We only care about the convert at this stage, no need for observers just yet.
+
+        TODO(zaf): Need to add the sparse params to the qconfig
+        """
+        assert type(mod) == cls._FLOAT_MODULE, cls._get_name() + \
+            '.from_float only works for ' + cls._FLOAT_MODULE.__name__
+        assert hasattr(mod, 'sparse_params'), \
+            ('Expecting the Linear to have `sparse_params`. Make sure you have provided arguments '
+             'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.')
+        sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None)  # type: ignore[operator, union-attr]
+        assert isinstance(sparse_block_shape, (tuple, list))
+        assert len(sparse_block_shape) == 2
+        # TODO: Need to add options to qconfig to avoid the calibration.
+        # TODO: Add calibration for the sparsity
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        activation_post_process = mod.activation_post_process
+        weight_post_process = mod.qconfig.weight()  # type: ignore[operator, union-attr]
+
+        # Assumption is that the weight is already sparsified by the
+        # `sparsifier.convert`
+        weight = mod.weight
+
+        weight_post_process(weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[operator, union-attr]
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        w_sc, w_zp = weight_post_process.calculate_qparams()
+        if isinstance(w_zp, torch.Tensor):
+            assert not torch.any(w_zp.bool()), "All weight zero points must map to 0"
+        else:
+            assert w_zp == 0, 'Weight zero point must map to 0'
+        qweight = _quantize_weight(weight.float(), weight_post_process)
+
+        row_block_size = mod.sparse_params['sparse_block_shape'][0]  # type: ignore[index]
+        col_block_size = mod.sparse_params['sparse_block_shape'][1]  # type: ignore[index]
+        qlinear = cls(mod.in_features,
+                      mod.out_features,
+                      row_block_size,
+                      col_block_size,
+                      dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias,
+                                row_block_size, col_block_size)  # type: ignore[arg-type]
+        qlinear.scale = float(act_scale)
+        qlinear.zero_point = int(act_zp)
+        return qlinear
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d934f57857436dd0ea7945e327cf3d0532c4c10
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/utils.py
@@ -0,0 +1,42 @@
+import threading
+
+__all__ = [
+    "LinearBlockSparsePattern"
+]
+
+def _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size):
+    return (row_block_size == 1 and col_block_size == 4) or \
+           (row_block_size == 8 and col_block_size == 1)
+
+# This is a stop-gap measure as current flow does not allow module
+# specific block sparse pattern.
+# Infact there is no way to convey sparse pattern via module config
+# of quantization flow. Thus using the global context to convey
+# sparsity pattern.
+# Once the flow supports it, this should be removed.
+class LinearBlockSparsePattern:
+    rlock = threading.RLock()
+    row_block_size = 1
+    col_block_size = 4
+    prev_row_block_size = 1
+    prev_col_block_size = 4
+
+    def __init__(self, row_block_size=1, col_block_size=4):
+        assert _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size)
+        LinearBlockSparsePattern.rlock.acquire()
+        LinearBlockSparsePattern.prev_row_block_size = LinearBlockSparsePattern.row_block_size
+        LinearBlockSparsePattern.prev_col_block_size = LinearBlockSparsePattern.col_block_size
+        LinearBlockSparsePattern.row_block_size = row_block_size
+        LinearBlockSparsePattern.col_block_size = col_block_size
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_value, backtrace):
+        LinearBlockSparsePattern.row_block_size = LinearBlockSparsePattern.prev_row_block_size
+        LinearBlockSparsePattern.col_block_size = LinearBlockSparsePattern.prev_col_block_size
+        LinearBlockSparsePattern.rlock.release()
+
+    @staticmethod
+    def block_size():
+        return LinearBlockSparsePattern.row_block_size, LinearBlockSparsePattern.col_block_size
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..206bfdae4cf52df70bb11401b6e54559a20e896f
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f4fcb461e22ac7c55d6f2c6b1e6298bd4827bb3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
@@ -0,0 +1,309 @@
+import abc
+import torch
+from typing import Optional, Tuple, List, Any, Dict
+from ...sparsifier import base_sparsifier
+from collections import defaultdict
+from torch import nn
+import copy
+from ...sparsifier import utils
+from torch.nn.utils import parametrize
+import sys
+import warnings
+
+if not sys.warnoptions:
+    # to suppress repeated warnings when being used in a training loop.
+    warnings.simplefilter("once")
+
+__all__ = ['BaseDataSparsifier']
+
+EMBEDDING_TYPES = {
+    nn.Embedding,
+    nn.EmbeddingBag,
+}
+
+SUPPORTED_TYPES = {
+    torch.Tensor,
+    nn.Parameter,
+    *EMBEDDING_TYPES,
+}
+
+
+class _Container(nn.Module):
+    pass
+
+
+class BaseDataSparsifier(base_sparsifier.BaseSparsifier):
+    r"""
+    Base Data Sparsifier class for all Data sparsifiers.
+    The abstract class accepts raw torch tensors / embedding / embedding bags (refer to SUPPORTED_TYPES above)
+    to prepare for sparsification.
+    In this case, mask (and parametrizations) is owned by the class and not by the user.
+    Specifically, the container object inside the class maintains the mask and parametrizations of the input data
+
+    Args:
+        data_list (list of tuples)
+            list of (name, data) tuples to sparsify. Lookup SUPPORTED_TYPES
+            for type of data. Internally, a container module handles the data sparsification.
+
+        defaults (dict)
+            default configurations will be attached to the
+            configuration. Only the keys that don't exist in the `config` will
+            be updated.
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> data_list = [('tensor_1', torch.randn(3,3)), ('tensor_2', torch.randn(4,4))]
+        >>> defaults = {'sparsity_level': 0.7}
+        >>> sparsifier = DerivedDataSparsifier(data_list = data_list, **defaults) # Some sparsifier that inherits BaseDataSparsifier
+        >>> new_tensor_to_add = {'name': 'tensor_3', 'data': torch.randn(5,5), 'sparsity_level': 0.3}
+        >>> sparsifier.add_data(**new_tensor_to_add)
+        >>> # tensor_1 and tensor_2 will have sparsity_level of 0.7 but tensor_3 will have sparsity_level=0.3
+    """
+    def __init__(self, data_list: Optional[List[Tuple[str, Any]]] = None, **defaults):
+        super().__init__(defaults=defaults)
+
+        self._container = _Container()
+
+        self.data_groups: Dict[str, Dict] = defaultdict(dict)  # name -> {**config}
+        if data_list is not None:
+            # add data with default config here
+            [self.add_data(name, data, **self.defaults) for name, data in data_list]
+
+    def prepare(self):
+        raise NotImplementedError("this function is undefined for this class")
+
+    def _extract_weight(self, data):
+        # extract the weight parameter instead of underlying data
+        if type(data) in [torch.Tensor, nn.Parameter]:
+            return data
+        elif type(data) in EMBEDDING_TYPES:
+            return data.weight
+
+    def add_data(self, name: str, data, reuse_mask=True, **config):
+        r""" Configures and parametrizes the internal container model with name and data.
+
+        **Note**:
+            1. If the data with name already exists, it replaces the data.
+            2. While replacing, the old mask is reused when `reuse_mask=True`
+            3. If `reuse_mask=True`, then the replacing data needs to have the same shape as that of old data.
+            4. By default, the config of the replaced data is used as config for the replacing data, unless something
+               is specified in the config dictionary.
+        """
+        assert type(data) in SUPPORTED_TYPES, \
+            "specified data type not supported at the moment"
+        local_args = copy.deepcopy(self.defaults)
+        local_args.update(config)
+        weight = self._extract_weight(data)
+
+        # Bookkeeping in the container class
+        mask = local_args.get('mask', torch.ones_like(weight))
+        param_class = local_args.get('parametrization', utils.FakeSparsity)
+
+        if name in self.state:
+            # If the named data already exists - replace
+            warnings.warn("Replacing existing data of the same name. - Did you mean a different name?")
+
+            # reuse old config
+            old_args = self.data_groups[name]
+            local_args = copy.deepcopy(old_args)
+            local_args.update(config)
+
+            if reuse_mask:
+                current_data = self.get_data(name=name)
+                assert weight.shape == current_data.shape, \
+                    "to retain the old mask, the shape of the new data must be the same as the previous one"
+                mask = self.get_mask(name=name)  # reuse mask instead of creating a new one
+
+            self._delete_data(name=name)
+
+        # parameter creates a deepcopy of the weight inside, so create a buffer
+        self._container.register_buffer(name=name, tensor=weight)
+        parametrize.register_parametrization(self._container, name, param_class(mask))
+        self.state[name]['mask'] = mask
+        self.data_groups[name] = local_args
+        return getattr(self._container, name)
+
+    def get_data(self, name: str, return_original: bool = True):
+        r"""Returns weight tensor (or data)
+        Args:
+            - name: name of the data to be returned
+            - return_original returns weight tensor without applying parametrization if True
+                else - returns the sparsified version (parametrized)
+        """
+        if name not in self.data_groups:
+            raise ValueError("data with specified name does not exist")
+
+        if return_original:
+            if not parametrize.is_parametrized(self._container, name):
+                raise ValueError("mask squashed - original mask value does not exist")
+            data = getattr(self._container.parametrizations, name).original
+            return data
+        else:
+            return getattr(self._container, name)
+
+    def _convert_mask(self, states, sparse_coo=True):
+        r"""Converts the mask to sparse coo or dense tensors depending on the `sparse_coo` argument.
+        """
+        states = copy.deepcopy(states)
+        for state in states.values():
+            if sparse_coo:
+                state['mask'] = state['mask'].to_sparse_coo()
+            else:
+                state['mask'] = state['mask'].to_dense()
+
+        return states
+
+    def state_dict(self):
+        r"""Returns the state of the optimizer as a :class:`dict`.
+
+        It contains:
+        * state - contains name -> mask mapping.
+        * data_groups - a list containing all sparsity configuration groups
+            with the key name specifying the name of the data
+        * container_state_dict - the state dictionary of the internal
+            container model used for sparsification
+        """
+        state = self._convert_mask(self.state)
+        return {
+            'state': state,
+            'data_groups': self.data_groups,
+            '_container': self._container.state_dict()
+        }
+
+    def _load_container_from_state(self, states, data_groups, container_state_dict):
+        r"""This restores the state of the container specifically based on the data present in state and data_groups
+        If the data was parametrized, then the data would be added to the container and then parametrized,
+        else it would just add the attribute the container.
+        """
+        for name, state in states.items():
+            config_name = data_groups.get(name, None)
+            if config_name is None:
+                raise RuntimeError(f"Error loading {name}")
+
+            # check if the data with such a name was parametrized, if so parametrize
+            # otherwise just set the attribute and continue
+            parametrized_name = f'parametrizations.{name}.original'
+            parametrized = False
+            data = container_state_dict.get(name, None)
+            if name in container_state_dict:
+                # the parametrization was probably removed for this
+                data = container_state_dict.get(name)
+
+            elif parametrized_name in container_state_dict:
+                # so the weight was parametrized
+                data = container_state_dict.get(parametrized_name)
+                parametrized = True
+
+            else:
+                raise RuntimeError(f"Error loading {name}")
+
+            self._container.register_buffer(name=name, tensor=data)
+
+            if parametrized:
+                # register parameter if parametrized
+                mask = state.get('mask', torch.ones_like(data))
+                param_class = config_name.get('parametrization', utils.FakeSparsity)  # per-name config, as stored by add_data
+                parametrize.register_parametrization(self._container, name, param_class(mask))
+
+    def load_state_dict(self, state_dict, strict=True):
+        r"""Restores the sparsifier from a dictionary with 'state', 'data_groups' and '_container'.
+
+        Args:
+        * state_dict - the dictionary to restore the sparsifier from; its three entries are
+            deep-copied before use
+        * strict - If True - the container is replaced by a fresh one before loading, so the
+            sparsifier is restored exactly to the state in state_dict.
+            If False - the current container is kept, and entries already present in
+            self.state / self.data_groups take precedence over the loaded ones.
+        """
+        loaded_states, loaded_groups, loaded_container = (
+            copy.deepcopy(state_dict[key]) for key in ('state', 'data_groups', '_container')
+        )
+        # convert sparse coo masks to dense
+        loaded_states = self._convert_mask(loaded_states, sparse_coo=False)
+        if strict:
+            self._container = _Container()  # strict load -> start from an empty container
+        self._load_container_from_state(loaded_states, loaded_groups, loaded_container)
+
+        if not strict:
+            loaded_states.update(self.state)
+            loaded_groups.update(self.data_groups)
+
+        self.__setstate__({'state': loaded_states, 'data_groups': loaded_groups})
+
+    def __setstate__(self, state):
+        """Updates ``__dict__`` from ``state``; a '_container' entry triggers a container rebuild."""
+        if '_container' in state:  # the saved container tensors are part of the state
+            saved_container = state.pop('_container')
+            self._container = _Container()
+            dense = state['state'] = self._convert_mask(state['state'], sparse_coo=False)  # sparse coo -> dense
+            self._load_container_from_state(dense, state['data_groups'], saved_container)
+        self.__dict__.update(state)
+
+    def __getstate__(self):
+        """Snapshot for pickling: defaults, converted state, data groups and container tensors."""
+        converted = self._convert_mask(self.state)
+        snapshot = dict(defaults=self.defaults, state=converted)
+        # the container object itself is not stored, only its state_dict()
+        snapshot['data_groups'] = self.data_groups
+        snapshot['_container'] = self._container.state_dict()
+        return snapshot
+
+    def __repr__(self):
+        """Class name followed by one 'Data Group' section per entry of ``self.data_groups``."""
+        pieces = [self.__class__.__name__, ' (']
+        for name, sparse_args in self.data_groups.items():
+            pieces.append(f'\n\tData Group\n\t    name: {name}\n')
+            # config entries in sorted key order; the 'data' entry is left out
+            pieces.extend(
+                f'\t    {key}: {sparse_args[key]}\n'
+                for key in sorted(sparse_args.keys()) if key != 'data'
+            )
+        pieces.append(')')
+        return ''.join(pieces)
+
+    def get_mask(self, name: str):
+        if name in self.state:  # known data -> hand back the mask stored in its state
+            return self.state[name]['mask']
+        raise ValueError("data with specified name does not exist")
+
+    def squash_mask(self, *args, leave_parametrized=True, names=None, **kwargs):
+        r"""Removes the parametrization of the given data names from the container.
+
+        kwargs:
+            * names: list of strings to squash the mask for; None (the default) means
+                     every key of self.data_groups
+            * leave_parametrized: forwarded unchanged to
+                     parametrize.remove_parametrizations
+        """
+        targets = list(self.data_groups.keys()) if names is None else names
+        for target in targets:
+            parametrize.remove_parametrizations(self._container, target, leave_parametrized=leave_parametrized)
+
+    def step(self):
+        """Runs ``update_mask`` once per data group; a no-op while ``enable_mask_update`` is falsy."""
+        if self.enable_mask_update:
+            with torch.no_grad():
+                for data_name, data_config in self.data_groups.items():
+                    # get_data supplies the (non-sparsified) data handed to update_mask; the name
+                    # tells update_mask which mask is meant, the config becomes keyword arguments
+                    raw = self.get_data(data_name)
+                    self.update_mask(data_name, raw, **data_config)
+
+    @abc.abstractmethod
+    def update_mask(self, name, data, **kwargs):
+        """Subclass hook; ``step`` calls it as ``update_mask(name, data, **config)`` per data group."""
+
+    def _delete_data(self, name):
+        """Detaches the data called ``name`` from the sparsifier.
+
+        Args:
+            name (str)
+                Name of the data to be removed from the sparsifier
+
+        Note:
+            Currently private. Serves as a helper when data of the same name gets replaced
+        """
+        self.squash_mask(leave_parametrized=False, names=[name])  # the mask is not applied on removal
+        delattr(self._container, name)
+        for registry in (self.state, self.data_groups):
+            registry.pop(name)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8110bb33822753837de9948b3ef4c0ac9aa9fc96
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c09f25c6111711be7ebf890d80fa60e895e43469
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..814df1bfc6ecb9e200279b1ee307548762f3442c
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75a3802516ca39262a0f1b40e7c646cede4885c9
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a0d74d6dc933552fefd47e0e950749079a627fb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py
@@ -0,0 +1,48 @@
+from typing import cast
+
+import torch
+from .base_structured_sparsifier import BaseStructuredSparsifier, FakeStructuredSparsity
+
+class LSTMSaliencyPruner(BaseStructuredSparsifier):
+    """
+    Saliency-based pruner for the packed weights of an LSTM.
+
+    Every layer {k} inside a LSTM holds two packed weight matrices
+    - weight_ih_l{k}
+    - weight_hh_l{k}
+
+    Each of these tensors packs the weights of the 4 linear layers together for efficiency.
+
+    [W_ii | W_if | W_ig | W_io]
+
+    Pruning this tensor directly will lead to weights being misassigned when unpacked.
+    To ensure that each packed linear layer is pruned the same amount:
+        1. The mask and the row saliencies are split into 4 consecutive pieces
+        2. Within each piece, the rows with the smallest saliency are masked out
+
+    The same procedure serves both weight_ih_l{k} and weight_hh_l{k}.
+    """
+
+    def update_mask(self, module, tensor_name, **kwargs):
+        weights = getattr(module, tensor_name)
+        fake_sparsities = (
+            p for p in getattr(module.parametrizations, tensor_name)
+            if isinstance(p, FakeStructuredSparsity)
+        )
+
+        for parametrization in fake_sparsities:
+            mask = cast(torch.Tensor, parametrization.mask)
+            if weights.dim() <= 1:
+                raise Exception("Structured pruning can only be applied to a 2+dim weight tensor!")
+            # saliency of a row = L1 norm taken over every dim except the first
+            reduce_dims = tuple(range(1, weights.dim()))
+            saliency = weights.norm(dim=reduce_dims, p=1)
+
+            # the 4 packed parts are handled one at a time
+            part_len = len(mask) // 4
+            parts = zip(torch.split(mask, part_len), torch.split(saliency, part_len))
+            for keep_mask, part_saliency in parts:
+                # indices of the k least salient rows of this part
+                k = int(len(keep_mask) * kwargs["sparsity_level"])
+                drop = part_saliency.topk(k, largest=False, sorted=False).indices
+                keep_mask.data[drop] = False  # modifies the underlying mask of the parametrization
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py
new file mode 100644
index 0000000000000000000000000000000000000000..df94f7093b53db9dba9106a53b4bab0a2b9bb961
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -0,0 +1,59 @@
+import torch
+from torch import nn
+from torch.nn.utils.parametrize import is_parametrized
+
+
+def module_contains_param(module, parametrization):
+    """True when a tensor of ``module`` has a parametrization of type ``parametrization`` attached."""
+    if not is_parametrized(module):
+        return False
+    # flatten the per-tensor parametrization lists into one stream of parametrizations
+    attached = (param for param_list in module.parametrizations.values() for param in param_list)
+    # a single isinstance match against the type passed in is enough
+    return any(isinstance(param, parametrization) for param in attached)
+
+
+# Structured Pruning Parameterizations
+class FakeStructuredSparsity(nn.Module):
+    r"""
+    Structured-pruning parametrization. As with FakeSparsity, attach it to 'weight' or to
+    any other parameter that requires a mask.
+
+    The bool mask is row-wise (one entry per index of dim 0) rather than element-wise.
+    """
+
+    def __init__(self, mask):
+        super().__init__()
+        self.register_buffer("mask", mask)
+
+    def forward(self, x):
+        assert isinstance(self.mask, torch.Tensor)
+        assert self.mask.shape[0] == x.shape[0]
+        # view the mask as (-1, 1, ..., 1) so that it broadcasts along dim 0 of x
+        broadcast_shape = [-1] + [1] * (len(x.shape) - 1)
+        return self.mask.reshape(broadcast_shape) * x
+
+    def state_dict(self, *args, **kwargs):
+        # avoid double saving masks
+        return {}
+
+
+class BiasHook:
+    """Callable taking ``(module, input, output)``; adds ``module._bias`` (if set) to ``output``."""
+
+    def __init__(self, parametrization, prune_bias):
+        self.param, self.prune_bias = parametrization, prune_bias
+
+    def __call__(self, module, input, output):
+        if getattr(module, "_bias", None) is None:
+            return output
+        bias = module._bias.data
+        if self.prune_bias:
+            # in place: zeroes module._bias wherever the parametrization's mask is False
+            bias[~self.param.mask] = 0
+
+        # bias is viewed as (1, -1, 1, ...) so that it broadcasts over the other output dims
+        target_shape = [1] * len(output.shape)
+        target_shape[1] = -1
+        output += bias.reshape(target_shape)
+        return output
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_mappings.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..726cbc6b0fc8af91f1651d3b0f0a56dbb7f21fe2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_mappings.py
@@ -0,0 +1,18 @@
+__all__ = [
+    "get_static_sparse_quantized_mapping",
+    "get_dynamic_sparse_quantized_mapping",
+]
+
+def get_static_sparse_quantized_mapping():
+    # torch.ao.nn.sparse is imported when the function is called, not at module import
+    import torch.ao.nn.sparse
+    return {
+        torch.nn.Linear: torch.ao.nn.sparse.quantized.Linear,
+    }
+
+def get_dynamic_sparse_quantized_mapping():
+    # torch.ao.nn.sparse is imported when the function is called, not at module import
+    import torch.ao.nn.sparse
+    return {
+        torch.nn.Linear: torch.ao.nn.sparse.quantized.dynamic.Linear,
+    }
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/functional.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c07ae348631b50612823c5d913d075d3aa23fe7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/functional.py
@@ -0,0 +1,1983 @@
+from typing import (
+    List, Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING
+)
+import operator
+import itertools
+
+import torch
+from torch._C import _add_docstr
+import torch.nn.functional as F
+from ._lowrank import svd_lowrank, pca_lowrank
+from .overrides import (
+    has_torch_function, has_torch_function_unary, has_torch_function_variadic,
+    handle_torch_function)
+from ._jit_internal import boolean_dispatch
+from ._jit_internal import _overload as overload
+
+Tensor = torch.Tensor
+from torch import _VF
+
+__all__ = [
+    'atleast_1d',
+    'atleast_2d',
+    'atleast_3d',
+    'align_tensors',
+    'broadcast_shapes',
+    'broadcast_tensors',
+    'cartesian_prod',
+    'block_diag',
+    'cdist',
+    'chain_matmul',
+    'einsum',
+    'istft',
+    'lu',
+    'norm',
+    'meshgrid',
+    'pca_lowrank',
+    'split',
+    'stft',
+    'svd_lowrank',
+    'tensordot',
+    'unique',
+    'unique_consecutive',
+    'unravel_index',
+]
+
+
+def broadcast_tensors(*tensors):
+    r"""broadcast_tensors(*tensors) -> List of Tensors
+
+    Broadcasts the given tensors according to :ref:`broadcasting-semantics`.
+
+    Args:
+        *tensors: any number of tensors of the same type
+
+    .. warning::
+
+        Several elements of a broadcasted tensor may refer to one single
+        memory location, so in-place operations (especially vectorized
+        ones) may give incorrect results. Clone the tensors first if you
+        need to write to them.
+
+    Example::
+
+        >>> x = torch.arange(3).view(1, 3)
+        >>> y = torch.arange(2).view(2, 1)
+        >>> a, b = torch.broadcast_tensors(x, y)
+        >>> a.size()
+        torch.Size([2, 3])
+        >>> a
+        tensor([[0, 1, 2],
+                [0, 1, 2]])
+    """
+    # Variadic wrapper: without torch-function overrides the call goes straight to _VF.
+    if not has_torch_function(tensors):
+        return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
+    return handle_torch_function(broadcast_tensors, tensors, *tensors)
+
+
+def broadcast_shapes(*shapes):
+    r"""broadcast_shapes(*shapes) -> Size
+
+    Similar to :func:`broadcast_tensors` but for shapes.
+
+    This is equivalent to
+    ``torch.broadcast_tensors(*map(torch.empty, shapes))[0].shape``
+    but avoids the need to create intermediate tensors. This is useful for
+    broadcasting tensors of common batch shape but different rightmost shape,
+    e.g. to broadcast mean vectors with covariance matrices.
+
+    Example::
+
+        >>> torch.broadcast_shapes((2,), (3, 1), (1, 1, 1))
+        torch.Size([1, 3, 2])
+
+    Args:
+        \*shapes (torch.Size): Shapes of tensors.
+
+    Returns:
+        shape (torch.Size): A shape compatible with all input shapes.
+
+    Raises:
+        RuntimeError: If shapes are incompatible, contain a negative
+            dimension, or are not ints, tuples of ints or lists of ints.
+    """
+    # This wrapper exists to support variadic args.
+    # TODO Move this to C++ once the jit has better support for torch.Size.
+    if not torch.jit.is_tracing():
+        max_len = 0
+        for shape in shapes:
+            if isinstance(shape, (int, torch.SymInt)):
+                max_len = max(max_len, 1)
+            elif isinstance(shape, (tuple, list)):
+                max_len = max(max_len, len(shape))
+        result = [1] * max_len
+
+        from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
+
+        for shape in shapes:
+            if isinstance(shape, (int, torch.SymInt)):
+                shape = (shape,)
+            if isinstance(shape, (tuple, list)):
+                # negative indices: shape and result are matched up from their last dim
+                for i in range(-1, -1 - len(shape), -1):
+                    if shape[i] < 0:
+                        raise RuntimeError(f"Trying to create tensor with negative dimension ({shape[i]}): ({shape[i]})")
+                    # NB: result is initialized to 1 so this is effectively an
+                    # equals one test
+                    if guard_size_oblivious(shape[i] == 1) or guard_size_oblivious(shape[i] == result[i]):
+                        continue
+                    if result[i] != 1:
+                        raise RuntimeError("Shape mismatch: objects cannot be broadcast to a single shape")
+                    result[i] = shape[i]
+            else:
+                # one formatted message (a second exception argument would render as a tuple)
+                raise RuntimeError(f"Input shapes should be of type ints, a tuple of ints, or a list of ints, got {shape}")
+        return torch.Size(result)
+    else:
+        # with implementation above, torch.jit.trace hardcodes the sizes which makes subsequent replays fail
+        with torch.no_grad():
+            scalar = torch.zeros((), device="cpu")
+            tensors = [scalar.expand(shape) for shape in shapes]
+            tensors = broadcast_tensors(*tensors)
+            return tensors[0].shape
+
+
+def split(
+    tensor: Tensor, split_size_or_sections: Union[int, List[int]], dim: int = 0
+) -> Tuple[Tensor, ...]:
+    r"""Splits the tensor into chunks. Each chunk is a view of the original tensor.
+
+    When :attr:`split_size_or_sections` is an integer, :attr:`tensor` is split
+    into equally sized chunks (if possible). The last chunk is smaller if the
+    tensor size along the given dimension :attr:`dim` is not divisible by
+    :attr:`split_size_or_sections`.
+
+    When :attr:`split_size_or_sections` is a list, :attr:`tensor` is split into
+    ``len(split_size_or_sections)`` chunks whose sizes along :attr:`dim` are
+    given by :attr:`split_size_or_sections`.
+
+    Args:
+        tensor (Tensor): tensor to split.
+        split_size_or_sections (int) or (list(int)): size of a single chunk or
+            list of sizes for each chunk
+        dim (int): dimension along which to split the tensor.
+
+    Example::
+
+        >>> a = torch.arange(10).reshape(5, 2)
+        >>> a
+        tensor([[0, 1],
+                [2, 3],
+                [4, 5],
+                [6, 7],
+                [8, 9]])
+        >>> torch.split(a, 2)
+        (tensor([[0, 1],
+                 [2, 3]]),
+         tensor([[4, 5],
+                 [6, 7]]),
+         tensor([[8, 9]]))
+        >>> torch.split(a, [1, 4])
+        (tensor([[0, 1]]),
+         tensor([[2, 3],
+                 [4, 5],
+                 [6, 7],
+                 [8, 9]]))
+    """
+    # Overwriting reason:
+    # This dispatches to two ATen functions depending on the type of
+    # split_size_or_sections. The branching code is in _tensor.py, which we
+    # call here.
+    if not has_torch_function_unary(tensor):
+        return tensor.split(split_size_or_sections, dim)
+    # ``tensor`` carries a torch-function override: hand the whole call over to it
+    return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim)
+
+
+def einsum(*args: Any) -> Tensor:
+    r"""einsum(equation, *operands) -> Tensor
+
+    Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation
+    based on the Einstein summation convention.
+
+    Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them
+    in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of
+    this format are described below, but the general idea is to label every dimension of the input :attr:`operands`
+    with some subscript and define which subscripts are part of the output. The output is then computed by summing
+    the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the
+    output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`.
+    Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why).
+
+    Equation:
+
+        The :attr:`equation` string specifies the subscripts (letters in `[a-zA-Z]`) for each dimension of
+        the input :attr:`operands` in the same order as the dimensions, separating subscripts for each operand by a
+        comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript
+        must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is
+        repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand
+        must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that
+        appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order.
+        The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based
+        on the subscripts, and then summing out the dimensions whose subscripts are not part of the output.
+
+        Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation
+        followed by the subscripts for the output. For instance, the following equation computes the transpose of a
+        matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and
+        at most once for the output.
+
+        Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis.
+        Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts,
+        e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth
+        dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the
+        'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not
+        explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions),
+        before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements
+        batch matrix multiplication `'...ij,...jk'`.
+
+        A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis,
+        arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands.
+
+    .. note::
+
+        ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions
+        covered by the ellipsis to be summed over, that is, ellipsis are not required to be part of the output.
+
+    .. note::
+
+        This function uses opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) to speed up computation or to
+        consume less memory by optimizing contraction order. This optimization occurs when there are at least three
+        inputs, since the order does not matter otherwise. Note that finding _the_ optimal path is an NP-hard problem,
+        thus, opt_einsum relies on different heuristics to achieve near-optimal results. If opt_einsum is not available,
+        the default order is to contract from left to right.
+
+        To bypass this default behavior, add the following line to disable the usage of opt_einsum and skip path
+        calculation: `torch.backends.opt_einsum.enabled = False`
+
+        To specify which strategy you'd like for opt_einsum to compute the contraction path, add the following line:
+        `torch.backends.opt_einsum.strategy = 'auto'`. The default strategy is 'auto', and we also support 'greedy' and
+        'optimal'. Disclaimer that the runtime of 'optimal' is factorial in the number of inputs! See more details in
+        the opt_einsum documentation (https://optimized-einsum.readthedocs.io/en/stable/path_finding.html).
+
+    .. note::
+
+        As of PyTorch 1.10 :func:`torch.einsum` also supports the sublist format (see examples below). In this format,
+        subscripts for each operand are specified by sublists, list of integers in the range [0, 52). These sublists
+        follow their operands, and an extra sublist can appear at the end of the input to specify the output's
+        subscripts, e.g. `torch.einsum(op1, sublist1, op2, sublist2, ..., [sublist_out])`. Python's `Ellipsis` object
+        may be provided in a sublist to enable broadcasting as described in the Equation section above.
+
+    Args:
+        equation (str): The subscripts for the Einstein summation.
+        operands (List[Tensor]): The tensors to compute the Einstein summation of.
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # trace
+        >>> torch.einsum('ii', torch.randn(4, 4))
+        tensor(-1.2104)
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # diagonal
+        >>> torch.einsum('ii->i', torch.randn(4, 4))
+        tensor([-0.1034,  0.7952, -0.2433,  0.4545])
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # outer product
+        >>> x = torch.randn(5)
+        >>> y = torch.randn(4)
+        >>> torch.einsum('i,j->ij', x, y)
+        tensor([[ 0.1156, -0.2897, -0.3918,  0.4963],
+                [-0.3744,  0.9381,  1.2685, -1.6070],
+                [ 0.7208, -1.8058, -2.4419,  3.0936],
+                [ 0.1713, -0.4291, -0.5802,  0.7350],
+                [ 0.5704, -1.4290, -1.9323,  2.4480]])
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # batch matrix multiplication
+        >>> As = torch.randn(3, 2, 5)
+        >>> Bs = torch.randn(3, 5, 4)
+        >>> torch.einsum('bij,bjk->bik', As, Bs)
+        tensor([[[-1.0564, -1.5904,  3.2023,  3.1271],
+                [-1.6706, -0.8097, -0.8025, -2.1183]],
+
+                [[ 4.2239,  0.3107, -0.5756, -0.2354],
+                [-1.4558, -0.3460,  1.5087, -0.8530]],
+
+                [[ 2.8153,  1.8787, -4.3839, -1.2112],
+                [ 0.3728, -2.1131,  0.0921,  0.8305]]])
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # with sublist format and ellipsis
+        >>> torch.einsum(As, [..., 0, 1], Bs, [..., 1, 2], [..., 0, 2])
+        tensor([[[-1.0564, -1.5904,  3.2023,  3.1271],
+                [-1.6706, -0.8097, -0.8025, -2.1183]],
+
+                [[ 4.2239,  0.3107, -0.5756, -0.2354],
+                [-1.4558, -0.3460,  1.5087, -0.8530]],
+
+                [[ 2.8153,  1.8787, -4.3839, -1.2112],
+                [ 0.3728, -2.1131,  0.0921,  0.8305]]])
+
+        >>> # batch permute
+        >>> A = torch.randn(2, 3, 4, 5)
+        >>> torch.einsum('...ij->...ji', A).shape
+        torch.Size([2, 3, 5, 4])
+
+        >>> # equivalent to torch.nn.functional.bilinear
+        >>> A = torch.randn(3, 5, 4)
+        >>> l = torch.randn(2, 5)
+        >>> r = torch.randn(2, 4)
+        >>> torch.einsum('bn,anm,bm->ba', l, A, r)
+        tensor([[-0.3430, -5.2405,  0.4494],
+                [ 0.3311,  5.5201, -3.0356]])
+    """
+    import torch.backends.opt_einsum as opt_einsum
+    # This wrapper exists to support variadic args.
+    if len(args) < 2:
+        raise ValueError('einsum(): must specify the equation string and at least one operand, '
+                         'or at least one operand and its subscripts list')
+
+    equation = None
+    operands = None
+
+    if isinstance(args[0], torch.Tensor):
+        # Convert the subscript list format which is an interleaving of operand and its subscripts
+        # list with an optional output subscripts list at the end (see documentation for more details on this)
+        # to the equation string format by creating the equation string from the subscripts list and grouping the
+        # input operands into a tensorlist (List[Tensor]).
+        def parse_subscript(n: int) -> str:
+            if n == Ellipsis:
+                return '...'
+            if n >= 0 and n < 26:
+                return chr(ord('A') + n)
+            if n >= 26 and n < 52:
+                return chr(ord('a') + n - 26)
+            raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52)')
+
+        # Parse subscripts for input operands
+        equation = ','.join(''.join(parse_subscript(s) for s in l) for l in args[1::2])
+
+        # Parse optional output subscripts (provided when the number of arguments is odd)
+        if len(args) % 2 == 1:
+            equation += '->' + ''.join(parse_subscript(s) for s in args[-1])
+            operands = args[:-1:2]
+        else:
+            operands = args[::2]
+    else:
+        equation = args[0]
+        operands = args[1:]
+
+    if has_torch_function(operands):
+        return handle_torch_function(einsum, operands, equation, *operands)
+
+    if len(operands) == 1 and isinstance(operands[0], (list, tuple)):
+        # the old interface of passing the operands as one list argument
+        _operands = operands[0]
+        # recurse in case operands contains a value that has torch function
+        # in the original implementation this line is omitted
+        return einsum(equation, *_operands)
+
+    if len(operands) <= 2 or not opt_einsum.enabled:
+        # the path for contracting 0 or 1 time(s) is already optimized
+        # or the user has disabled using opt_einsum
+        return _VF.einsum(equation, operands)  # type: ignore[attr-defined]
+
+    path = None
+    if opt_einsum.is_available():
+        _opt_einsum = opt_einsum.get_opt_einsum()
+        tupled_path = _opt_einsum.contract_path(equation, *operands, optimize=opt_einsum.strategy)[0]
+        # flatten path (a sequence of index pairs) into one flat list for dispatching to C++
+        path = [item for pair in tupled_path for item in pair]
+    return _VF.einsum(equation, operands, path=path)  # type: ignore[attr-defined]
+
+
+# This wrapper exists to support variadic args.
+if TYPE_CHECKING:
+    # The JIT doesn't understand Union, so only add type annotation for mypy
+    def meshgrid(*tensors: Union[Tensor, List[Tensor]],
+                 indexing: Optional[str] = None) -> Tuple[Tensor, ...]:
+        return _meshgrid(*tensors, indexing=indexing)
+else:
+    def meshgrid(*tensors, indexing: Optional[str] = None) -> Tuple[Tensor, ...]:
+        r"""Creates grids of coordinates specified by the 1D inputs in :attr:`tensors`.
+
+        This is helpful when you want to visualize data over some
+        range of inputs. See below for a plotting example.
+
+        Given :math:`N` 1D tensors :math:`T_0 \ldots T_{N-1}` as
+        inputs with corresponding sizes :math:`S_0 \ldots S_{N-1}`,
+        this creates :math:`N` N-dimensional tensors :math:`G_0 \ldots
+        G_{N-1}`, each with shape :math:`(S_0, ..., S_{N-1})` where
+        the output :math:`G_i` is constructed by expanding :math:`T_i`
+        to the result shape.
+
+        .. note::
+            0D inputs are treated equivalently to 1D inputs of a
+            single element.
+
+        .. warning::
+            `torch.meshgrid(*tensors)` currently has the same behavior
+            as calling `numpy.meshgrid(*arrays, indexing='ij')`.
+
+            In the future `torch.meshgrid` will transition to
+            `indexing='xy'` as the default.
+
+            https://github.com/pytorch/pytorch/issues/50276 tracks
+            this issue with the goal of migrating to NumPy's behavior.
+
+        .. seealso::
+
+            :func:`torch.cartesian_prod` has the same effect but it
+            collects the data in a tensor of vectors.
+
+        Args:
+            tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be
+                treated as tensors of size :math:`(1,)` automatically
+
+            indexing (str, optional): the indexing mode, either "xy"
+                or "ij", defaults to "ij". See warning for future changes.
+
+                If "xy" is selected, the first dimension corresponds
+                to the cardinality of the second input and the second
+                dimension corresponds to the cardinality of the first
+                input.
+
+                If "ij" is selected, the dimensions are in the same
+                order as the cardinality of the inputs.
+
+        Returns:
+            seq (sequence of Tensors): If the input has :math:`N`
+            tensors of size :math:`S_0 \ldots S_{N-1}`, then the
+            output will also have :math:`N` tensors, where each tensor
+            is of shape :math:`(S_0, ..., S_{N-1})`.
+
+        Example::
+
+            >>> x = torch.tensor([1, 2, 3])
+            >>> y = torch.tensor([4, 5, 6])
+
+            Observe the element-wise pairings across the grid, (1, 4),
+            (1, 5), ..., (3, 6). This is the same thing as the
+            cartesian product.
+            >>> grid_x, grid_y = torch.meshgrid(x, y, indexing='ij')
+            >>> grid_x
+            tensor([[1, 1, 1],
+                    [2, 2, 2],
+                    [3, 3, 3]])
+            >>> grid_y
+            tensor([[4, 5, 6],
+                    [4, 5, 6],
+                    [4, 5, 6]])
+
+            This correspondence can be seen when these grids are
+            stacked properly.
+            >>> torch.equal(torch.cat(tuple(torch.dstack([grid_x, grid_y]))),
+            ...             torch.cartesian_prod(x, y))
+            True
+
+            `torch.meshgrid` is commonly used to produce a grid for
+            plotting.
+            >>> # xdoctest: +REQUIRES(module:matplotlib)
+            >>> # xdoctest: +REQUIRES(env:DOCTEST_SHOW)
+            >>> import matplotlib.pyplot as plt
+            >>> xs = torch.linspace(-5, 5, steps=100)
+            >>> ys = torch.linspace(-5, 5, steps=100)
+            >>> x, y = torch.meshgrid(xs, ys, indexing='xy')
+            >>> z = torch.sin(torch.sqrt(x * x + y * y))
+            >>> ax = plt.axes(projection='3d')
+            >>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy())
+            >>> plt.show()
+
+        .. image:: ../_static/img/meshgrid.png
+            :width: 512
+
+        """
+        return _meshgrid(*tensors, indexing=indexing)
+
+
+def _meshgrid(*tensors, indexing: Optional[str]):
+    if has_torch_function(tensors):
+        return handle_torch_function(meshgrid, tensors, *tensors, indexing=indexing)
+    if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)):
+        # the old interface of passing the operands as one list argument
+        tensors = tensors[0]  # type: ignore[assignment]
+
+    # Continue allowing call of old method that takes no indexing
+    # kwarg for forward compatibility reasons.
+    #
+    # Remove this two weeks after landing.
+    kwargs = {} if indexing is None else {'indexing': indexing}
+    return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
+
+
+def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
+         win_length: Optional[int] = None, window: Optional[Tensor] = None,
+         center: bool = True, pad_mode: str = 'reflect', normalized: bool = False,
+         onesided: Optional[bool] = None,
+         return_complex: Optional[bool] = None) -> Tensor:
+    r"""Short-time Fourier transform (STFT).
+
+    .. warning::
+        From version 1.8.0, :attr:`return_complex` must always be given
+        explicitly for real inputs and `return_complex=False` has been
+        deprecated. Strongly prefer `return_complex=True` as in a future
+        pytorch release, this function will only return complex tensors.
+
+        Note that :func:`torch.view_as_real` can be used to recover a real
+        tensor with an extra last dimension for real and imaginary components.
+
+    .. warning::
+        From version 2.1, a warning will be provided if a :attr:`window` is
+        not specified. In a future release, this attribute will be required.
+        Not providing a window currently defaults to using a rectangular window,
+        which may result in undesirable artifacts. Consider using tapered windows,
+        such as :func:`torch.hann_window`.
+
+    The STFT computes the Fourier transform of short overlapping windows of the
+    input. This gives the frequency components of the signal as they change over
+    time. The interface of this function is modeled after (but *not* a drop-in
+    replacement for) librosa_ stft function.
+
+    .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html
+
+    Ignoring the optional batch dimension, this method computes the following
+    expression:
+
+    .. math::
+        X[\omega, m] = \sum_{k = 0}^{\text{win\_length-1}}%
+                            \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ %
+                            \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{n\_fft}}\right),
+
+    where :math:`m` is the index of the sliding window, and :math:`\omega` is
+    the frequency :math:`0 \leq \omega < \text{n\_fft}` for ``onesided=False``,
+    or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for ``onesided=True``.
+
+    * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time
+      sequences.
+
+    * If :attr:`hop_length` is ``None`` (default), it is treated as equal to
+      ``floor(n_fft / 4)``.
+
+    * If :attr:`win_length` is ``None`` (default), it is treated as equal to
+      :attr:`n_fft`.
+
+    * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from
+      :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is
+      treated as if having :math:`1` everywhere in the window. If
+      :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on
+      both sides to length :attr:`n_fft` before being applied.
+
+    * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on
+      both sides so that the :math:`t`-th frame is centered at time
+      :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame
+      begins at time  :math:`t \times \text{hop\_length}`.
+
+    * :attr:`pad_mode` determines the padding method used on :attr:`input` when
+      :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for
+      all available options. Default is ``"reflect"``.
+
+    * If :attr:`onesided` is ``True`` (default for real input), only values for
+      :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor
+      \frac{\text{n\_fft}}{2} \right\rfloor\right]` are returned because
+      the real-to-complex Fourier transform satisfies the conjugate symmetry,
+      i.e., :math:`X[\omega, m] = X[\text{n\_fft} - \omega, m]^*`.
+      Note if the input or window tensors are complex, then :attr:`onesided`
+      output is not possible.
+
+    * If :attr:`normalized` is ``True`` (default is ``False``), the function
+      returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`.
+
+    * If :attr:`return_complex` is ``True`` (default if input is complex), the
+      return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``,
+      the output is a ``input.dim() + 2`` dimensional real tensor where the last
+      dimension represents the real and imaginary components.
+
+    Returns either a complex tensor of size :math:`(* \times N \times T)` if
+    :attr:`return_complex` is true, or a real tensor of size :math:`(* \times N
+    \times T \times 2)` otherwise, where :math:`*` is the optional batch size of
+    :attr:`input`, :math:`N` is the number of frequencies where STFT is applied
+    and :math:`T` is the total number of frames used.
+
+    .. warning::
+      This function changed signature at version 0.4.1. Calling with the
+      previous signature may cause error or return incorrect result.
+
+    Args:
+        input (Tensor): the input tensor of shape `(B?, L)` where `B?` is an optional
+            batch dimension
+        n_fft (int): size of Fourier transform
+        hop_length (int, optional): the distance between neighboring sliding window
+            frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``)
+        win_length (int, optional): the size of window frame and STFT filter.
+            Default: ``None``  (treated as equal to :attr:`n_fft`)
+        window (Tensor, optional): the optional window function.
+            Shape must be 1d and `<= n_fft`
+            Default: ``None`` (treated as window of all :math:`1` s)
+        center (bool, optional): whether to pad :attr:`input` on both sides so
+            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+            Default: ``True``
+        pad_mode (str, optional): controls the padding method used when
+            :attr:`center` is ``True``. Default: ``"reflect"``
+        normalized (bool, optional): controls whether to return the normalized STFT results
+             Default: ``False``
+        onesided (bool, optional): controls whether to return half of results to
+            avoid redundancy for real inputs.
+            Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise.
+        return_complex (bool, optional): whether to return a complex tensor, or
+            a real tensor with an extra last dimension for the real and
+            imaginary components.
+
+            .. versionchanged:: 2.0
+               ``return_complex`` is now a required argument for real inputs,
+               as the default is being transitioned to ``True``.
+
+            .. deprecated:: 2.0
+               ``return_complex=False`` is deprecated, instead use ``return_complex=True``
+               Note that calling :func:`torch.view_as_real` on the output will
+               recover the deprecated output format.
+
+    Returns:
+        Tensor: A tensor containing the STFT result with shape `(B?, N, T, C?)` where
+           - `B?` is an optional batch dimension from the input.
+           - `N` is the number of frequency samples, `(n_fft // 2) + 1` for
+             `onesided=True`, or otherwise `n_fft`.
+           - `T` is the number of frames, `1 + L // hop_length`
+             for `center=True`, or `1 + (L - n_fft) // hop_length` otherwise.
+           - `C?` is an optional length-2 dimension of real and imaginary
+             components, present when `return_complex=False`.
+
+    """
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length,
+            window=window, center=center, pad_mode=pad_mode, normalized=normalized,
+            onesided=onesided, return_complex=return_complex)
+    # NOTE: Do not edit. This code will be removed once the forward-compatibility
+    #       period is over for PR #73432
+    if center:
+        signal_dim = input.dim()
+        extended_shape = [1] * (3 - signal_dim) + list(input.size())
+        pad = int(n_fft // 2)
+        input = F.pad(input.view(extended_shape), [pad, pad], pad_mode)
+        input = input.view(input.shape[-signal_dim:])
+    return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+                    normalized, onesided, return_complex)
+
+
+istft = _add_docstr(
+    torch.istft,
+    "istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, "
+    "normalized=False, onesided=None, length=None, return_complex=False) -> Tensor:\n"
+    r"""
+Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`.
+
+.. warning::
+    From version 2.1, a warning will be provided if a :attr:`window` is
+    not specified. In a future release, this attribute will be required.
+    Please provide the same window used in the stft call.
+
+It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the
+least squares estimation of the original signal. The algorithm will check using the NOLA condition (
+nonzero overlap).
+
+An important consideration for the parameters :attr:`window` and :attr:`center` is that the envelope
+created by the summation of all the windows must never be zero at any point in time. Specifically,
+:math:`\sum_{t=-\infty}^{\infty} |w|^2[n-t\times hop\_length] \neq 0`.
+
+Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame,
+``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False
+since the signal isn't padded). If `length` is given in the arguments and is longer than expected,
+``istft`` will pad zeros to the end of the returned signal.
+
+If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc.
+Left padding can be trimmed off exactly because it can be calculated, but right padding cannot be
+calculated without additional information.
+
+Example: Suppose the last window is:
+``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]``
+
+The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation
+of right padding. These additional values could be zeros or a reflection of the signal so providing
+:attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed
+(some loss of signal).
+
+[1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform,"
+IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984.
+
+Args:
+    input (Tensor): The input tensor. Expected to be in the format of :func:`~torch.stft`
+        output. That is, a complex tensor of shape `(B?, N, T)` where
+
+        - `B?` is an optional batch dimension
+        - `N` is the number of frequency samples, `(n_fft // 2) + 1`
+          for onesided input, or otherwise `n_fft`.
+        - `T` is the number of frames, `1 + length // hop_length` for centered stft,
+          or `1 + (length - n_fft) // hop_length` otherwise.
+
+        .. versionchanged:: 2.0
+            Real datatype inputs are no longer supported. Input must now have a
+            complex datatype, as returned by ``stft(..., return_complex=True)``.
+    n_fft (int): Size of Fourier transform
+    hop_length (Optional[int]): The distance between neighboring sliding window frames.
+        (Default: ``n_fft // 4``)
+    win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``)
+    window (Optional[torch.Tensor]): The optional window function.
+        Shape must be 1d and `<= n_fft`
+        (Default: ``torch.ones(win_length)``)
+    center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is
+        centered at time :math:`t \times \text{hop\_length}`.
+        (Default: ``True``)
+    normalized (bool): Whether the STFT was normalized. (Default: ``False``)
+    onesided (Optional[bool]): Whether the STFT was onesided.
+        (Default: ``True`` if `n_fft != fft_size` in the input size)
+    length (Optional[int]): The amount to trim the signal by (i.e. the
+        original signal length). Defaults to `(T - 1) * hop_length` for
+        centered stft, or `n_fft + (T - 1) * hop_length` otherwise, where `T`
+        is the number of input frames.
+    return_complex (Optional[bool]):
+        Whether the output should be complex, or if the input should be
+        assumed to derive from a real signal and window.
+        Note that this is incompatible with ``onesided=True``.
+        (Default: ``False``)
+
+Returns:
+    Tensor: Least squares estimation of the original signal of shape `(B?, length)` where
+        `B?` is an optional batch dimension from the input tensor.
+""")
+
+
+if TYPE_CHECKING:
+    # These _impl functions return a variable number of tensors as output with
+    # __torch_function__; tuple unpacking is done already rather than being
+    # done by the caller of the _impl function
+    _unique_impl_out = Any
+else:
+    _unique_impl_out = Tuple[Tensor, Tensor, Tensor]
+
+
+def _unique_impl(input: Tensor, sorted: bool = True,
+                 return_inverse: bool = False, return_counts: bool = False,
+                 dim: Optional[int] = None) -> _unique_impl_out:
+    r"""unique(input, sorted=True, return_inverse=False, return_counts=False, dim=None) -> Tuple[Tensor, Tensor, Tensor]
+
+    Returns the unique elements of the input tensor.
+
+    .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that
+        this function also eliminates non-consecutive duplicate values.
+
+    .. note:: Currently in the CUDA implementation and the CPU implementation,
+        `torch.unique` always sorts the tensor at the beginning regardless of the `sorted` argument.
+        Sorting could be slow, so if your input tensor is already sorted, it is recommended to use
+        :func:`torch.unique_consecutive` which avoids the sorting.
+
+    Args:
+        input (Tensor): the input tensor
+        sorted (bool): Whether to sort the unique elements in ascending order
+            before returning as output.
+        return_inverse (bool): Whether to also return the indices for where
+            elements in the original input ended up in the returned unique list.
+        return_counts (bool): Whether to also return the counts for each unique
+            element.
+        dim (int, optional): the dimension to operate upon. If ``None``, the
+            unique of the flattened input is returned. Otherwise, each of the
+            tensors indexed by the given dimension is treated as one of the
+            elements to apply the unique operation upon. See examples for more
+            details. Default: ``None``
+
+    Returns:
+        (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing
+
+            - **output** (*Tensor*): the output list of unique scalar elements.
+            - **inverse_indices** (*Tensor*): (optional) if
+              :attr:`return_inverse` is True, there will be an additional
+              returned tensor (same shape as input) representing the indices
+              for where elements in the original input map to in the output;
+              otherwise, this function will only return a single tensor.
+            - **counts** (*Tensor*): (optional) if
+              :attr:`return_counts` is True, there will be an additional
+              returned tensor (same shape as output or output.size(dim),
+              if dim was specified) representing the number of occurrences
+              for each unique value or tensor.
+
+    Example::
+
+        >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long))
+        >>> output
+        tensor([1, 2, 3])
+
+        >>> output, inverse_indices = torch.unique(
+        ...     torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True)
+        >>> output
+        tensor([1, 2, 3])
+        >>> inverse_indices
+        tensor([0, 2, 1, 2])
+
+        >>> output, inverse_indices = torch.unique(
+        ...     torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True)
+        >>> output
+        tensor([1, 2, 3])
+        >>> inverse_indices
+        tensor([[0, 2],
+                [1, 2]])
+
+        >>> a = torch.tensor([
+        ...     [
+        ...         [1, 1, 0, 0],
+        ...         [1, 1, 0, 0],
+        ...         [0, 0, 1, 1],
+        ...     ],
+        ...     [
+        ...         [0, 0, 1, 1],
+        ...         [0, 0, 1, 1],
+        ...         [1, 1, 1, 1],
+        ...     ],
+        ...     [
+        ...         [1, 1, 0, 0],
+        ...         [1, 1, 0, 0],
+        ...         [0, 0, 1, 1],
+        ...     ],
+        ... ])
+
+        >>> # If we call `torch.unique(a, dim=0)`, each of the tensors `a[idx, :, :]`
+        >>> # will be compared. We can see that `a[0, :, :]` and `a[2, :, :]` match
+        >>> # each other, so one of them will be removed.
+        >>> (a[0, :, :] == a[2, :, :]).all()
+        tensor(True)
+        >>> a_unique_dim0 = torch.unique(a, dim=0)
+        >>> a_unique_dim0
+        tensor([[[0, 0, 1, 1],
+                 [0, 0, 1, 1],
+                 [1, 1, 1, 1]],
+                [[1, 1, 0, 0],
+                 [1, 1, 0, 0],
+                 [0, 0, 1, 1]]])
+
+        >>> # Notice which sub-tensors from `a` match with the sub-tensors from
+        >>> # `a_unique_dim0`:
+        >>> (a_unique_dim0[0, :, :] == a[1, :, :]).all()
+        tensor(True)
+        >>> (a_unique_dim0[1, :, :] == a[0, :, :]).all()
+        tensor(True)
+
+        >>> # For `torch.unique(a, dim=1)`, each of the tensors `a[:, idx, :]` are
+        >>> # compared. `a[:, 0, :]` and `a[:, 1, :]` match each other, so one of
+        >>> # them will be removed.
+        >>> (a[:, 0, :] == a[:, 1, :]).all()
+        tensor(True)
+        >>> torch.unique(a, dim=1)
+        tensor([[[0, 0, 1, 1],
+                 [1, 1, 0, 0]],
+                [[1, 1, 1, 1],
+                 [0, 0, 1, 1]],
+                [[0, 0, 1, 1],
+                 [1, 1, 0, 0]]])
+
+        >>> # For `torch.unique(a, dim=2)`, the tensors `a[:, :, idx]` are compared.
+        >>> # `a[:, :, 0]` and `a[:, :, 1]` match each other. Also, `a[:, :, 2]` and
+        >>> # `a[:, :, 3]` match each other as well. So in this case, two of the
+        >>> # sub-tensors will be removed.
+        >>> (a[:, :, 0] == a[:, :, 1]).all()
+        tensor(True)
+        >>> (a[:, :, 2] == a[:, :, 3]).all()
+        tensor(True)
+        >>> torch.unique(a, dim=2)
+        tensor([[[0, 1],
+                 [0, 1],
+                 [1, 0]],
+                [[1, 0],
+                 [1, 0],
+                 [1, 1]],
+                [[0, 1],
+                 [0, 1],
+                 [1, 0]]])
+    """
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            unique, (input,), input, sorted=sorted, return_inverse=return_inverse,
+            return_counts=return_counts, dim=dim)
+
+    if dim is not None:
+        output, inverse_indices, counts = _VF.unique_dim(
+            input,
+            dim,
+            sorted=sorted,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+        )
+    else:
+        output, inverse_indices, counts = torch._unique2(
+            input,
+            sorted=sorted,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+        )
+    return output, inverse_indices, counts
+
+
+def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False,
+                             return_counts: bool = False,
+                             dim: Optional[int] = None) -> _unique_impl_out:
+    r"""Eliminates all but the first element from every consecutive group of equivalent elements.
+
+    .. note:: This function is different from :func:`torch.unique` in the sense that this function
+        only eliminates consecutive duplicate values. This semantics is similar to `std::unique`
+        in C++.
+
+    Args:
+        input (Tensor): the input tensor
+        return_inverse (bool): Whether to also return the indices for where
+            elements in the original input ended up in the returned unique list.
+        return_counts (bool): Whether to also return the counts for each unique
+            element.
+        dim (int, optional): the dimension to apply unique to. If ``None``, the unique of the
+            flattened input is returned. default: ``None``
+
+    Returns:
+        (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing
+
+            - **output** (*Tensor*): the output list of unique scalar elements.
+            - **inverse_indices** (*Tensor*): (optional) if
+              :attr:`return_inverse` is True, there will be an additional
+              returned tensor (same shape as input) representing the indices
+              for where elements in the original input map to in the output;
+              otherwise, this function will only return a single tensor.
+            - **counts** (*Tensor*): (optional) if
+              :attr:`return_counts` is True, there will be an additional
+              returned tensor (same shape as output or output.size(dim),
+              if dim was specified) representing the number of occurrences
+              for each unique value or tensor.
+
+    Example::
+
+        >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2])
+        >>> output = torch.unique_consecutive(x)
+        >>> output
+        tensor([1, 2, 3, 1, 2])
+
+        >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True)
+        >>> output
+        tensor([1, 2, 3, 1, 2])
+        >>> inverse_indices
+        tensor([0, 0, 1, 1, 2, 3, 3, 4])
+
+        >>> output, counts = torch.unique_consecutive(x, return_counts=True)
+        >>> output
+        tensor([1, 2, 3, 1, 2])
+        >>> counts
+        tensor([2, 2, 1, 2, 1])
+    """
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            unique_consecutive, (input,), input, return_inverse=return_inverse,
+            return_counts=return_counts, dim=dim)
+    output, inverse_indices, counts = _VF.unique_consecutive(  # type: ignore[attr-defined]
+        input, return_inverse=return_inverse, return_counts=return_counts, dim=dim)
+    return output, inverse_indices, counts
+
+
+def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_impl(input, sorted, return_inverse, return_counts, dim)
+
+    output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim)
+    return output, counts
+
+
+def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor
+
+    if has_torch_function_unary(input):
+        return _unique_impl(input, sorted, return_inverse, return_counts, dim)
+
+    output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim)
+    return output
+
+
+def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_impl(input, sorted, return_inverse, return_counts, dim)
+
+    output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim)
+    return output, inverse_indices
+
+
+_return_inverse_false = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=3,
+    default=False,
+    if_true=_return_counts,
+    if_false=_return_output,
+    module_name=__name__,
+    func_name='unique')
+
+_return_inverse_true = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=3,
+    default=False,
+    if_true=_unique_impl,
+    if_false=_return_inverse,
+    module_name=__name__,
+    func_name='unique')
+
+# The return type of unique depends on `return_inverse`, and `return_counts` so in order to
+# resolve the output type in TorchScript we need to statically know the value of both parameters
+
+unique = boolean_dispatch(
+    arg_name='return_inverse',
+    arg_index=2,
+    default=False,
+    if_true=_return_inverse_true,
+    if_false=_return_inverse_false,
+    module_name=__name__,
+    func_name='unique')
+unique.__doc__ = _unique_impl.__doc__
+
+
+def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+
+    output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+    return output, counts
+
+
+def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, Optional[int]) -> Tensor
+
+    if has_torch_function_unary(input):
+        return _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+
+    output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+    return output
+
+
+def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+
+    output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+    return output, inverse_indices
+
+
+_consecutive_return_inverse_false = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=2,
+    default=False,
+    if_true=_consecutive_return_counts,
+    if_false=_consecutive_return_output,
+    module_name=__name__,
+    func_name='unique_consecutive')
+
+_consecutive_return_inverse_true = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=2,
+    default=False,
+    if_true=_unique_consecutive_impl,
+    if_false=_consecutive_return_inverse,
+    module_name=__name__,
+    func_name='unique_consecutive')
+
+# The return type of unique_consecutive depends on `return_inverse` and `return_counts`, so
+# TorchScript needs both statically. `arg_index` is the flag's positional slot in
+# (input, return_inverse, return_counts, dim): 1 and 2, mirroring 2 and 3 for `unique`.
+unique_consecutive = boolean_dispatch(
+    arg_name='return_inverse',
+    arg_index=1,
+    default=False,
+    if_true=_consecutive_return_inverse_true,
+    if_false=_consecutive_return_inverse_false,
+    module_name=__name__,
+    func_name='unique_consecutive')
+unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__
+
+if TYPE_CHECKING:
+    pass
+    # There's no good way to use this type annotation without breaking JIT
+    # overloads, so tensordot is left untyped for mypy for now.
+else:
+    @overload
+    def tensordot(a, b, dims: int = 2, out: Optional[torch.Tensor] = None):
+        pass  # overload stub: `dims` given as a single int
+
+    @overload  # noqa: F811
+    def tensordot(a, b, dims: Tuple[List[int], List[int]], out: Optional[torch.Tensor] = None):  # noqa: F811
+        pass  # overload stub: `dims` given as a tuple of two int lists
+
+    @overload  # noqa: F811
+    def tensordot(a, b, dims: List[List[int]], out: Optional[torch.Tensor] = None):  # noqa: F811
+        pass  # overload stub: `dims` given as a list of int lists
+
+    @overload  # noqa: F811
+    def tensordot(a, b, dims: torch.Tensor, out: Optional[torch.Tensor] = None):  # noqa: F811
+        pass  # overload stub: `dims` given as a Tensor
+
+
+def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None):  # noqa: F811
+    r"""Returns a contraction of a and b over multiple dimensions.
+
+    :attr:`tensordot` implements a generalized matrix product.
+
+    Args:
+      a (Tensor): Left tensor to contract
+      b (Tensor): Right tensor to contract
+      dims (int or Tuple[List[int], List[int]] or List[List[int]] containing two lists or Tensor): number of dimensions to
+         contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively
+      out (Tensor, optional): the output tensor, passed through to the underlying op when not ``None``.
+
+    When called with a non-negative integer argument :attr:`dims` = :math:`d`, and
+    the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`,
+    respectively, :func:`~torch.tensordot` computes
+
+    .. math::
+        r_{i_0,...,i_{m-d}, i_d,...,i_n}
+          = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}.
+
+    When called with :attr:`dims` of the list form, the given dimensions will be contracted
+    in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :attr:`b`. The sizes
+    in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted
+    dimensions.
+
+    Examples::
+
+        >>> a = torch.arange(60.).reshape(3, 4, 5)
+        >>> b = torch.arange(24.).reshape(4, 3, 2)
+        >>> torch.tensordot(a, b, dims=([1, 0], [0, 1]))
+        tensor([[4400., 4730.],
+                [4532., 4874.],
+                [4664., 5018.],
+                [4796., 5162.],
+                [4928., 5306.]])
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> a = torch.randn(3, 4, 5, device='cuda')
+        >>> b = torch.randn(4, 5, 6, device='cuda')
+        >>> c = torch.tensordot(a, b, dims=2).cpu()
+        tensor([[ 8.3504, -2.5436,  6.2922,  2.7556, -1.0732,  3.2741],
+                [ 3.3161,  0.0704,  5.0187, -0.4079, -4.3126,  4.8744],
+                [ 0.8223,  3.9445,  3.2168, -0.2400,  3.4117,  1.7780]])
+
+        >>> a = torch.randn(3, 5, 4, 6)
+        >>> b = torch.randn(6, 4, 5, 3)
+        >>> torch.tensordot(a, b, dims=([2, 1, 3], [1, 2, 0]))
+        tensor([[  7.7193,  -2.4867, -10.3204],
+                [  1.5513, -14.4737,  -6.5113],
+                [ -0.2850,   4.2573,  -3.5997]])
+    """
+    if has_torch_function_variadic(a, b):
+        return handle_torch_function(tensordot, (a, b), a, b, dims=dims, out=out)
+
+    if not isinstance(dims, (tuple, list, torch.Tensor, int, torch.SymInt)):
+        raise RuntimeError("tensordot expects dims to be int or "
+                           + "Tuple[List[int], List[int]] or "
+                           + "List[List[int]] containing two lists, but got "
+                           + f"dims={dims}")
+
+    dims_a: List[int] = []  # axes of `a` to contract; filled in by whichever branch below matches
+    dims_b: List[int] = []  # axes of `b` to contract, paired with dims_a
+
+    if isinstance(dims, (tuple, list)):
+        dims_a, dims_b = dims  # explicit pair: (axes of a, axes of b)
+
+    if isinstance(dims, torch.Tensor):
+        num_elements = dims.numel()
+        if num_elements > 1:  # multi-element tensor: row 0 holds axes of a, row 1 axes of b
+            assert dims.size()[0] == 2
+            dims_a = torch.jit.annotate(List[int], dims[0].tolist())
+            dims_b = torch.jit.annotate(List[int], dims[1].tolist())
+        else:  # single-element tensor: handled like the int form
+            dims_val = int(dims.item())
+            if dims_val < 0:  # NOTE(review): unlike the int branch below, no upper bound vs a.dim()/b.dim() here
+                raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}")
+            dims_a = list(range(-dims_val, 0))  # the last dims_val axes of a
+            dims_b = list(range(dims_val))  # the first dims_val axes of b
+
+    if isinstance(dims, (int, torch.SymInt)):
+        if dims < 0:
+            raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}")
+        if dims > min(a.dim(), b.dim()):  # NOTE(review): dims == min(ndim) passes, so the "<" below acts as "<="
+            raise RuntimeError(f"tensordot expects dims < ndim_a or ndim_b, but got dims={dims}")
+        dims_a = list(range(-dims, 0))  # the last `dims` axes of a
+        dims_b = list(range(dims))  # the first `dims` axes of b
+
+    if out is None:
+        return _VF.tensordot(a, b, dims_a, dims_b)  # type: ignore[attr-defined]
+    else:
+        return _VF.tensordot(a, b, dims_a, dims_b, out=out)  # type: ignore[attr-defined]
+
+
+def cartesian_prod(*tensors: Tensor) -> Tensor:
+    """Compute the cartesian product of the given sequence of tensors, mirroring what
+    python's `itertools.product` does for lists.
+
+    Args:
+        *tensors (Tensor): one-dimensional tensors; any number of them may be passed.
+
+    Returns:
+        Tensor: the tensor you would get by turning every input tensor into a list,
+        running `itertools.product` over those lists, and converting the resulting
+        list back into a tensor.
+
+    Example::
+
+        >>> import itertools
+        >>> a = [1, 2, 3]
+        >>> b = [4, 5]
+        >>> list(itertools.product(a, b))
+        [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]
+        >>> tensor_a = torch.tensor(a)
+        >>> tensor_b = torch.tensor(b)
+        >>> torch.cartesian_prod(tensor_a, tensor_b)
+        tensor([[1, 4],
+                [1, 5],
+                [2, 4],
+                [2, 5],
+                [3, 4],
+                [3, 5]])
+    """
+    # Variadic arguments are the reason this Python-level wrapper exists.
+    if not has_torch_function(tensors):
+        return _VF.cartesian_prod(tensors)  # type: ignore[attr-defined]
+    return handle_torch_function(cartesian_prod, tensors, *tensors)
+
+
+def block_diag(*tensors):
+    """Assemble the provided tensors into a single block diagonal matrix.
+
+    Args:
+        *tensors: At least one tensor, each having 0, 1, or 2 dimensions.
+
+    Returns:
+        Tensor: A 2 dimensional tensor in which the inputs are laid out in the
+        order given, the lower right corner of each one diagonally adjacent to
+        the upper left corner of the next. Every other element is 0.
+
+    Example::
+
+        >>> import torch
+        >>> A = torch.tensor([[0, 1], [1, 0]])
+        >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]])
+        >>> C = torch.tensor(7)
+        >>> D = torch.tensor([1, 2, 3])
+        >>> E = torch.tensor([[4], [5], [6]])
+        >>> torch.block_diag(A, B, C, D, E)
+        tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
+                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 3, 4, 5, 0, 0, 0, 0, 0],
+                [0, 0, 6, 7, 8, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 7, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 1, 2, 3, 0],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 4],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]])
+    """
+    # Variadic arguments are the reason this Python-level wrapper exists.
+    if not has_torch_function(tensors):
+        return torch._C._VariableFunctions.block_diag(tensors)  # type: ignore[attr-defined]
+    return handle_torch_function(block_diag, tensors, *tensors)
+
+
+def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'):
+    # type: (Tensor, Tensor, float, str) -> (Tensor)
+    r"""Computes the batched p-norm distance between each pair of row vectors taken from the two collections.
+
+    Args:
+        x1 (Tensor): input tensor of shape :math:`B \times P \times M`.
+        x2 (Tensor): input tensor of shape :math:`B \times R \times M`.
+        p: p value of the p-norm distance computed for every pair of vectors,
+            :math:`\in [0, \infty]`.
+        compute_mode:
+            'use_mm_for_euclid_dist_if_necessary' - the matrix multiplication approach is used for
+            euclidean distance (p = 2) if P > 25 or R > 25
+            'use_mm_for_euclid_dist' - the matrix multiplication approach is always used for
+            euclidean distance (p = 2)
+            'donot_use_mm_for_euclid_dist' - the matrix multiplication approach is never used for
+            euclidean distance (p = 2)
+            Default: use_mm_for_euclid_dist_if_necessary.
+
+    Given x1 of shape :math:`B \times P \times M` and x2 of shape :math:`B \times R \times M`, the
+    output has shape :math:`B \times P \times R`.
+
+    This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)`
+    if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to
+    `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest
+    scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`.
+
+    Example::
+
+        >>> a = torch.tensor([[0.9041,  0.0196], [-0.3108, -2.4423], [-0.4821,  1.059]])
+        >>> a
+        tensor([[ 0.9041,  0.0196],
+                [-0.3108, -2.4423],
+                [-0.4821,  1.0590]])
+        >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986,  1.3702]])
+        >>> b
+        tensor([[-2.1763, -0.4713],
+                [-0.6986,  1.3702]])
+        >>> torch.cdist(a, b, p=2)
+        tensor([[3.1193, 2.0959],
+                [2.7138, 3.8322],
+                [2.2830, 0.3791]])
+    """
+    if has_torch_function_variadic(x1, x2):
+        return handle_torch_function(cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode)
+    # Each accepted mode string is translated into the flag handed to the underlying op
+    # (None, 1 or 2); any other string is rejected.
+    if compute_mode == 'use_mm_for_euclid_dist_if_necessary':
+        return _VF.cdist(x1, x2, p, None)  # type: ignore[attr-defined]
+    if compute_mode == 'use_mm_for_euclid_dist':
+        return _VF.cdist(x1, x2, p, 1)  # type: ignore[attr-defined]
+    if compute_mode == 'donot_use_mm_for_euclid_dist':
+        return _VF.cdist(x1, x2, p, 2)  # type: ignore[attr-defined]
+    raise ValueError(f"{compute_mode} is not a valid value for compute_mode")
+
+
+def atleast_1d(*tensors):
+    r"""
+    Returns a 1-dimensional view of every zero-dimensional input tensor.
+    Input tensors that already have one or more dimensions are returned as-is.
+
+    Args:
+        *tensors (Tensor or sequence of Tensors): inputs, passed separately or as one sequence
+
+    Returns:
+        output (Tensor or tuple of Tensors)
+
+    Example::
+
+        >>> x = torch.arange(2)
+        >>> x
+        tensor([0, 1])
+        >>> torch.atleast_1d(x)
+        tensor([0, 1])
+        >>> x = torch.tensor(1.)
+        >>> x
+        tensor(1.)
+        >>> torch.atleast_1d(x)
+        tensor([1.])
+        >>> x = torch.tensor(0.5)
+        >>> y = torch.tensor(1.)
+        >>> torch.atleast_1d((x, y))
+        (tensor([0.5000]), tensor([1.]))
+    """
+    # Variadic arguments are the reason this Python-level wrapper exists.
+    if has_torch_function(tensors):
+        return handle_torch_function(atleast_1d, tensors, *tensors)
+    # A lone positional argument reaches the op unwrapped rather than as a 1-tuple.
+    unpacked = tensors[0] if len(tensors) == 1 else tensors
+    return _VF.atleast_1d(unpacked)  # type: ignore[attr-defined]
+
+
+def atleast_2d(*tensors):
+    r"""
+    Returns a 2-dimensional view of every zero-dimensional input tensor.
+    Input tensors that already have two or more dimensions are returned as-is.
+
+    Args:
+        *tensors (Tensor or sequence of Tensors): inputs, passed separately or as one sequence
+
+    Returns:
+        output (Tensor or tuple of Tensors)
+
+    Example::
+
+        >>> x = torch.tensor(1.)
+        >>> x
+        tensor(1.)
+        >>> torch.atleast_2d(x)
+        tensor([[1.]])
+        >>> x = torch.arange(4).view(2, 2)
+        >>> x
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.atleast_2d(x)
+        tensor([[0, 1],
+                [2, 3]])
+        >>> x = torch.tensor(0.5)
+        >>> y = torch.tensor(1.)
+        >>> torch.atleast_2d((x, y))
+        (tensor([[0.5000]]), tensor([[1.]]))
+    """
+    # Variadic arguments are the reason this Python-level wrapper exists.
+    if has_torch_function(tensors):
+        return handle_torch_function(atleast_2d, tensors, *tensors)
+    # A lone positional argument reaches the op unwrapped rather than as a 1-tuple.
+    unpacked = tensors[0] if len(tensors) == 1 else tensors
+    return _VF.atleast_2d(unpacked)  # type: ignore[attr-defined]
+
+
+def atleast_3d(*tensors):
+    r"""
+    Returns a 3-dimensional view of every zero-dimensional input tensor.
+    Input tensors that already have three or more dimensions are returned as-is.
+
+    Args:
+        *tensors (Tensor or sequence of Tensors): inputs, passed separately or as one sequence
+
+    Returns:
+        output (Tensor or tuple of Tensors)
+
+    Example::
+
+        >>> x = torch.tensor(0.5)
+        >>> x
+        tensor(0.5000)
+        >>> torch.atleast_3d(x)
+        tensor([[[0.5000]]])
+        >>> y = torch.arange(4).view(2, 2)
+        >>> y
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.atleast_3d(y)
+        tensor([[[0],
+                 [1]],
+                <BLANKLINE>
+                [[2],
+                 [3]]])
+        >>> x = torch.tensor(1).view(1, 1, 1)
+        >>> x
+        tensor([[[1]]])
+        >>> torch.atleast_3d(x)
+        tensor([[[1]]])
+        >>> x = torch.tensor(0.5)
+        >>> y = torch.tensor(1.)
+        >>> torch.atleast_3d((x, y))
+        (tensor([[[0.5000]]]), tensor([[[1.]]]))
+    """
+    # Variadic arguments are the reason this Python-level wrapper exists.
+    if has_torch_function(tensors):
+        return handle_torch_function(atleast_3d, tensors, *tensors)
+    # A lone positional argument reaches the op unwrapped rather than as a 1-tuple.
+    unpacked = tensors[0] if len(tensors) == 1 else tensors
+    return _VF.atleast_3d(unpacked)  # type: ignore[attr-defined]
+
+
+if TYPE_CHECKING:
+    pass
+    # There's no good way to use this type annotation; norm() cannot be renamed to
+    # _norm_impl() in a way that doesn't break JIT overloads. So it is left untyped
+    # for mypy for now. The signature that would otherwise be declared:
+    #    def norm(input: Tensor,
+    #             p: Optional[Union[str, Number]] = "fro",
+    #             dim: Optional[Union[int, List[int]]] = None,
+    #             keepdim: bool = False,
+    #             out: Optional[Tensor] = None,
+    #             dtype: _dtype = None) -> Tensor:
+    #        return _norm_impl(input, p, dim, keepdim, out, dtype)
+else:
+    # TODO: type dim as BroadcastingList when
+    # https://github.com/pytorch/pytorch/issues/33782 is fixed
+    @overload
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):
+        # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass  # overload stub: string p, dim as a list of ints
+
+    @overload  # noqa: F811
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+        # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass  # overload stub: numeric p, dim as a list of ints
+
+    @overload  # noqa: F811
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+        # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass  # overload stub: numeric p, dim as a single int
+
+    @overload  # noqa: F811
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+        # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass  # overload stub: string p, dim as a single int
+
+
+def norm(input, p: Optional[Union[float, str]] = "fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+    r"""Returns the matrix norm or vector norm of a given tensor.
+
+    .. warning::
+
+        torch.norm is deprecated and may be removed in a future PyTorch release.
+        Its documentation and behavior may be incorrect, and it is no longer
+        actively maintained.
+
+        Use :func:`torch.linalg.vector_norm` when computing vector norms and
+        :func:`torch.linalg.matrix_norm` when computing matrix norms.
+        For a function with a similar behavior as this one see :func:`torch.linalg.norm`.
+        Note, however, the signature for these functions is slightly different than the
+        signature for ``torch.norm``.
+
+    Args:
+        input (Tensor): The input tensor. Its data type must be either a floating
+            point or complex type. For complex inputs, the norm is calculated using the
+            absolute value of each element. If the input is complex and neither
+            :attr:`dtype` nor :attr:`out` is specified, the result's data type will
+            be the corresponding floating point type (e.g. float if :attr:`input` is
+            complexfloat).
+
+        p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. Default: ``'fro'``
+            The following norms can be calculated:
+
+            ======  ==============  ==========================
+            ord     matrix norm     vector norm
+            ======  ==============  ==========================
+            'fro'   Frobenius norm  --
+            'nuc'   nuclear norm    --
+            Number  --              sum(abs(x)**ord)**(1./ord)
+            ======  ==============  ==========================
+
+            The vector norm can be calculated across any number of dimensions.
+            The corresponding dimensions of :attr:`input` are flattened into
+            one dimension, and the norm is calculated on the flattened
+            dimension.
+
+            Frobenius norm produces the same result as ``p=2`` in all cases
+            except when :attr:`dim` is a list of three or more dims, in which
+            case Frobenius norm throws an error.
+
+            Nuclear norm can only be calculated across exactly two dimensions.
+
+        dim (int, tuple of ints, list of ints, optional):
+            Specifies which dimension or dimensions of :attr:`input` to
+            calculate the norm across. If :attr:`dim` is ``None``, the norm will
+            be calculated across all dimensions of :attr:`input`. If the norm
+            type indicated by :attr:`p` does not support the specified number of
+            dimensions, an error will occur.
+        keepdim (bool, optional): whether the output tensors have :attr:`dim`
+            retained or not. Ignored if :attr:`dim` = ``None`` and
+            :attr:`out` = ``None``. Default: ``False``
+        out (Tensor, optional): the output tensor. Ignored if
+            :attr:`dim` = ``None`` and :attr:`out` = ``None``.
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor. If specified, the input tensor is cast to
+            :attr:`dtype` while performing the operation. Default: None.
+
+    .. note::
+        Even though ``p='fro'`` supports any number of dimensions, the true
+        mathematical definition of Frobenius norm only applies to tensors with
+        exactly two dimensions. :func:`torch.linalg.matrix_norm` with ``ord='fro'``
+        aligns with the mathematical definition, since it can only be applied across
+        exactly two dimensions.
+
+    Example::
+
+        >>> import torch
+        >>> a = torch.arange(9, dtype= torch.float) - 4
+        >>> b = a.reshape((3, 3))
+        >>> torch.norm(a)
+        tensor(7.7460)
+        >>> torch.norm(b)
+        tensor(7.7460)
+        >>> torch.norm(a, float('inf'))
+        tensor(4.)
+        >>> torch.norm(b, float('inf'))
+        tensor(4.)
+        >>> c = torch.tensor([[ 1, 2, 3], [-1, 1, 4]] , dtype=torch.float)
+        >>> torch.norm(c, dim=0)
+        tensor([1.4142, 2.2361, 5.0000])
+        >>> torch.norm(c, dim=1)
+        tensor([3.7417, 4.2426])
+        >>> torch.norm(c, p=1, dim=1)
+        tensor([6., 6.])
+        >>> d = torch.arange(8, dtype=torch.float).reshape(2, 2, 2)
+        >>> torch.norm(d, dim=(1, 2))
+        tensor([ 3.7417, 11.2250])
+        >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :])
+        (tensor(3.7417), tensor(11.2250))
+    """
+
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype)
+
+    # NB. All the repeated code and weird python is to please TorchScript.
+    #     For a more compact implementation see the relevant function in `_refs/__init__.py`
+
+    # We don't do this for MPS or sparse tensors
+    if input.layout == torch.strided and input.device.type in \
+            ("cpu", "cuda", "meta", torch.utils.backend_registration._privateuse1_backend_name):
+        if dim is not None:
+            if isinstance(dim, (int, torch.SymInt)):
+                _dim = [dim]
+            else:
+                _dim = dim
+        else:
+            _dim = None  # type: ignore[assignment]
+
+        if isinstance(p, str):
+            if p == "fro" and (dim is None or isinstance(dim, (int, torch.SymInt)) or len(dim) <= 2):
+                if out is None:
+                    return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype)
+                else:
+                    return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype, out=out)
+
+            # Here we either call the nuclear norm, or we call matrix_norm with some arguments
+            # that will throw an error
+            if _dim is None:
+                _dim = list(range(input.ndim))
+            if out is None:
+                return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype)
+            else:
+                return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype, out=out)
+        else:
+            # NB. p should be Union[str, number], not Optional!
+            _p = 2.0 if p is None else p
+            if out is None:
+                return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype)
+            else:
+                return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype, out=out)
+
+    ndim = input.dim()
+
+    # catch default case
+    if dim is None and out is None and dtype is None and p is not None:
+        if isinstance(p, str):
+            if p == "fro":
+                return _VF.frobenius_norm(input, dim=(), keepdim=keepdim)
+        if not isinstance(p, str):
+            _dim = list(range(ndim))  # reduce over every dimension, same form as the branches below
+            return _VF.norm(input, p, dim=_dim, keepdim=keepdim)  # type: ignore[attr-defined]
+
+    # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed
+    # remove the overloads where dim is an int and replace with BroadcastingList1
+    # and remove next four lines, replace _dim with dim
+    if dim is not None:
+        if isinstance(dim, (int, torch.SymInt)):
+            _dim = [dim]
+        else:
+            _dim = dim
+    else:
+        _dim = None  # type: ignore[assignment]
+
+    if isinstance(p, str):
+        if p == "fro":
+            if dtype is not None:
+                raise ValueError("dtype argument is not supported in frobenius norm")
+
+            if _dim is None:
+                _dim = list(range(ndim))
+            if out is None:
+                return _VF.frobenius_norm(input, _dim, keepdim=keepdim)  # type: ignore[arg-type]
+            else:
+                return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out)  # type: ignore[arg-type]
+        elif p == "nuc":
+            if dtype is not None:
+                raise ValueError("dtype argument is not supported in nuclear norm")
+            if _dim is None:
+                if out is None:
+                    return _VF.nuclear_norm(input, keepdim=keepdim)  # type: ignore[arg-type]
+                else:
+                    return _VF.nuclear_norm(input, keepdim=keepdim, out=out)  # type: ignore[arg-type]
+            else:
+                if out is None:
+                    return _VF.nuclear_norm(input, _dim, keepdim=keepdim)  # type: ignore[arg-type]
+                else:
+                    return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out)  # type: ignore[arg-type]
+        raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}")
+    else:
+        if _dim is None:
+            _dim = list(range(ndim))
+
+        if out is None:
+            if dtype is None:
+                return _VF.norm(input, p, _dim, keepdim=keepdim)  # type: ignore[attr-defined]
+            else:
+                return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype)  # type: ignore[attr-defined]
+        else:
+            if dtype is None:
+                return _VF.norm(input, p, _dim, keepdim=keepdim, out=out)  # type: ignore[attr-defined]
+            else:
+                return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out)  # type: ignore[attr-defined]
+
+def unravel_index(indices: Tensor, shape: Union[int, Sequence[int], torch.Size]) -> Tuple[Tensor, ...]:
+    r"""Translates flat indices into a tuple of coordinate tensors that together
+    index into an arbitrary tensor of the specified shape.
+
+    Args:
+        indices (Tensor): An integer tensor whose elements are indices into the
+            flattened version of an arbitrary tensor of shape :attr:`shape`.
+            Every element must lie in the range ``[0, prod(shape) - 1]``.
+
+        shape (int, sequence of ints, or torch.Size): The shape of the arbitrary
+            tensor. No element may be negative.
+
+    Returns:
+        tuple of Tensors: The ``i``-th tensor of the output belongs to
+        dimension ``i`` of :attr:`shape`. It has the same shape as
+        ``indices`` and holds, for every flat index in ``indices``, the
+        matching index into dimension ``i``.
+
+    Example::
+
+        >>> import torch
+        >>> torch.unravel_index(torch.tensor(4), (3, 2))
+        (tensor(2),
+         tensor(0))
+
+        >>> torch.unravel_index(torch.tensor([4, 1]), (3, 2))
+        (tensor([2, 0]),
+         tensor([0, 1]))
+
+        >>> torch.unravel_index(torch.tensor([0, 1, 2, 3, 4, 5]), (3, 2))
+        (tensor([0, 0, 1, 1, 2, 2]),
+         tensor([0, 1, 0, 1, 0, 1]))
+
+        >>> torch.unravel_index(torch.tensor([1234, 5678]), (10, 10, 10, 10))
+        (tensor([1, 5]),
+         tensor([2, 6]),
+         tensor([3, 7]),
+         tensor([4, 8]))
+
+        >>> torch.unravel_index(torch.tensor([[1234], [5678]]), (10, 10, 10, 10))
+        (tensor([[1], [5]]),
+         tensor([[2], [6]]),
+         tensor([[3], [7]]),
+         tensor([[4], [8]]))
+
+        >>> torch.unravel_index(torch.tensor([[1234], [5678]]), (100, 100))
+        (tensor([[12], [56]]),
+         tensor([[34], [78]]))
+    """
+    if not has_torch_function_unary(indices):
+        # The helper's result is split along its last dimension into the returned tuple.
+        return _unravel_index(indices, shape).unbind(-1)
+    return handle_torch_function(
+        unravel_index, (indices,), indices, shape=shape)
+
+def _unravel_index(indices: Tensor, shape: Union[int, Sequence[int]]) -> Tensor:
+    # Validates both arguments, then returns the coordinates stacked along a new last dimension.
+    is_non_integer = indices.is_complex() or indices.is_floating_point() or indices.dtype == torch.bool
+    torch._check_type(
+        not is_non_integer, lambda: f"expected 'indices' to be integer dtype, but got {indices.dtype}")
+    torch._check_type(
+        isinstance(shape, (int, torch.SymInt, Sequence)),
+        lambda: f"expected 'shape' to be int or sequence of ints, but got {type(shape)}")
+
+    if not isinstance(shape, (int, torch.SymInt)):
+        for extent in shape:
+            torch._check_type(
+                isinstance(extent, (int, torch.SymInt)),
+                lambda: f"expected 'shape' sequence to only contain ints, but got {type(extent)}")
+        size = torch.Size(shape)
+    else:
+        size = torch.Size([shape])
+    torch._check_value(
+        all(extent >= 0 for extent in size),
+        lambda: f"'shape' cannot have negative values, but got {tuple(size)}")
+
+    # Per-dimension divisors: each one is the product of all extents to its right (the last is 1).
+    divisors = list(itertools.accumulate(reversed(size[1:] + torch.Size([1])), func=operator.mul))[::-1]
+    divisor_t = torch.tensor(divisors, device=indices.device, dtype=torch.int64)
+    size_t = torch.tensor(size, device=indices.device, dtype=torch.int64)
+    return indices.unsqueeze(-1).floor_divide(divisor_t) % size_t
+
+def chain_matmul(*matrices, out=None):
+    r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed
+    using the matrix chain order algorithm which selects the order in which incurs the lowest cost in terms
+    of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N`
+    needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned.
+    If :math:`N` is 1, then this is a no-op - the original matrix is returned as is.
+
+    .. warning::
+
+        :func:`torch.chain_matmul` is deprecated and will be removed in a future PyTorch release.
+        Use :func:`torch.linalg.multi_dot` instead, which accepts a list of two or more tensors
+        rather than multiple arguments.
+
+    Args:
+        matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined.
+        out (Tensor, optional): the output tensor. Ignored if :attr:`out` = ``None``.
+
+    Returns:
+        Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product
+        would be of dimensions :math:`p_{1} \times p_{N + 1}`.
+
+    Example::
+
+        >>> # xdoctest: +SKIP
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> a = torch.randn(3, 4)
+        >>> b = torch.randn(4, 5)
+        >>> c = torch.randn(5, 6)
+        >>> d = torch.randn(6, 7)
+        >>> # will raise a deprecation warning
+        >>> torch.chain_matmul(a, b, c, d)
+        tensor([[ -2.3375,  -3.9790,  -4.1119,  -6.6577,   9.5609, -11.5095,  -3.2614],
+                [ 21.4038,   3.3378,  -8.4982,  -5.2457, -10.2561,  -2.4684,   2.7163],
+                [ -0.9647,  -5.8917,  -2.3213,  -5.2284,  12.8615, -12.2816,  -2.5095]])
+
+    .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(matrices):
+        # Forward ``out`` only when it was given, so overrides written without it keep working.
+        out_kwarg = {} if out is None else {"out": out}
+        return handle_torch_function(chain_matmul, matrices, *matrices, **out_kwarg)
+    if out is None:
+        return _VF.chain_matmul(matrices)  # type: ignore[attr-defined]
+    return _VF.chain_matmul(matrices, out=out)  # type: ignore[attr-defined]
+
+
+def _lu_impl(A, pivot=True, get_infos=False, out=None):
+    # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor]
+    r"""Computes the LU factorization of a matrix or batches of matrices
+    :attr:`A`. Returns a tuple containing the LU factorization and
+    pivots of :attr:`A`.  Pivoting is done if :attr:`pivot` is set to
+    ``True``.
+
+    .. warning::
+
+        :func:`torch.lu` is deprecated in favor of :func:`torch.linalg.lu_factor`
+        and :func:`torch.linalg.lu_factor_ex`. :func:`torch.lu` will be removed in a
+        future PyTorch release.
+        ``LU, pivots = torch.lu(A, compute_pivots)`` should be replaced with
+
+        .. code:: python
+
+            LU, pivots = torch.linalg.lu_factor(A, compute_pivots)
+
+        ``LU, pivots, info = torch.lu(A, compute_pivots, get_infos=True)`` should be replaced with
+
+        .. code:: python
+
+            LU, pivots, info = torch.linalg.lu_factor_ex(A, compute_pivots)
+
+    .. note::
+        * The returned permutation matrix for every matrix in the batch is
+          represented by a 1-indexed vector of size ``min(A.shape[-2], A.shape[-1])``.
+          ``pivots[i] == j`` represents that in the ``i``-th step of the algorithm,
+          the ``i``-th row was permuted with the ``j-1``-th row.
+        * LU factorization with :attr:`pivot` = ``False`` is not available
+          for CPU, and attempting to do so will throw an error. However,
+          LU factorization with :attr:`pivot` = ``False`` is available for
+          CUDA.
+        * This function does not check if the factorization was successful
+          or not if :attr:`get_infos` is ``True`` since the status of the
+          factorization is present in the third element of the return tuple.
+        * In the case of batches of square matrices with size less than or equal
+          to 32 on a CUDA device, the LU factorization is repeated for
+          singular matrices due to the bug in the MAGMA library
+          (see magma issue 13).
+        * ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`.
+
+    .. warning::
+        The gradients of this function will only be finite when :attr:`A` is full rank.
+        This is because the LU decomposition is only differentiable at full rank matrices.
+        Furthermore, if :attr:`A` is close to not being full rank,
+        the gradient will be numerically unstable as it depends on the computation of :math:`L^{-1}` and :math:`U^{-1}`.
+
+    Args:
+        A (Tensor): the tensor to factor of size :math:`(*, m, n)`
+        pivot (bool, optional): controls whether pivoting is done. Default: ``True``
+        get_infos (bool, optional): if set to ``True``, returns an info IntTensor.
+                                    Default: ``False``
+        out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``,
+                               then the elements in the tuple are Tensor, IntTensor,
+                               and IntTensor. If :attr:`get_infos` is ``False``, then the
+                               elements in the tuple are Tensor, IntTensor. Default: ``None``
+
+    Returns:
+        (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing
+
+            - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)`
+
+            - **pivots** (*IntTensor*): the pivots of size :math:`(*, \text{min}(m, n))`.
+              ``pivots`` stores all the intermediate transpositions of rows.
+              The final permutation ``perm`` could be reconstructed by
+              applying ``swap(perm[i], perm[pivots[i] - 1])`` for ``i = 0, ..., pivots.size(-1) - 1``,
+              where ``perm`` is initially the identity permutation of :math:`m` elements
+              (essentially this is what :func:`torch.lu_unpack` is doing).
+
+            - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of
+              size :math:`(*)` where a non-zero value indicates that the factorization of the
+              corresponding matrix in the batch has failed
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> A = torch.randn(2, 3, 3)
+        >>> A_LU, pivots = torch.lu(A)
+        >>> A_LU
+        tensor([[[ 1.3506,  2.5558, -0.0816],
+                 [ 0.1684,  1.1551,  0.1940],
+                 [ 0.1193,  0.6189, -0.5497]],
+
+                [[ 0.4526,  1.2526, -0.3285],
+                 [-0.7988,  0.7175, -0.9701],
+                 [ 0.2634, -0.9255, -0.3459]]])
+        >>> pivots
+        tensor([[ 3,  3,  3],
+                [ 3,  3,  3]], dtype=torch.int32)
+        >>> A_LU, pivots, info = torch.lu(A, get_infos=True)
+        >>> if info.nonzero().size(0) == 0:
+        ...     print('LU factorization succeeded for all samples!')
+        LU factorization succeeded for all samples!
+    """
+    # Errors are checked only when infos are not returned; `out` is not used here (the callers copy into it).
+    return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos))
+
+if not TYPE_CHECKING:
+    _ListOrSeq = List[Tensor]
+else:
+    _ListOrSeq = Sequence[Tensor]
+
+
+def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None:
+    expected_len = 3 if get_infos else 2  # (A_LU, pivots), plus infos when requested
+    if out_len != expected_len:
+        raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}")
+    if not isinstance(out, (tuple, list)):
+        raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}")
+
+
+def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
+    # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor]
+    if has_torch_function_unary(A):  # hand off to handle_torch_function before computing anything
+        return handle_torch_function(
+            lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out)
+    result = _lu_impl(A, pivot, get_infos, out)
+    if out is not None:
+        _check_list_size(len(out), get_infos, out)  # raises TypeError on a wrong-sized / wrong-typed `out`
+        for i in range(len(out)):
+            out[i].resize_as_(result[i]).copy_(result[i])  # copy each result into the caller's tensor
+        return out
+    else:
+        return result  # A_LU, pivots, infos
+
+
+def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
+    # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor]
+    # check has_torch_function_unary first so that we hand off (and return) before any computation
+    if has_torch_function_unary(A):
+        return handle_torch_function(
+            lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out)
+    result = _lu_impl(A, pivot, get_infos, out)
+    if out is not None:
+        _check_list_size(len(out), get_infos, out)  # raises TypeError on a wrong-sized / wrong-typed `out`
+        for i in range(len(out)):
+            out[i].resize_as_(result[i]).copy_(result[i])  # copy each result into the caller's tensor
+        return out
+    else:
+        return result[0], result[1]  # A_LU, pivots
+
+# `lu` returns (A_LU, pivots) or (A_LU, pivots, infos) depending on `get_infos`, so to resolve the
+# output type of lu in TorchScript the value of `get_infos` must be known statically.
+lu = boolean_dispatch(
+    arg_name='get_infos',
+    arg_index=2,
+    default=False,
+    if_true=_lu_with_infos,
+    if_false=_lu_no_infos,
+    module_name=__name__,
+    func_name='lu')
+lu.__doc__ = _lu_impl.__doc__
+
+
+def align_tensors(*tensors):  # placeholder: ignores its arguments and always raises
+    raise RuntimeError('`align_tensors` not yet implemented.')
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/library.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/library.py
new file mode 100644
index 0000000000000000000000000000000000000000..350aa8995c090ee0af8933d5ddeafc3011e84559
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/library.py
@@ -0,0 +1,532 @@
+from ._ops import OpOverload
+from typing import Any, Optional, Set, List
+import traceback
+import torch
+import weakref
+import functools
+import inspect
+import re
+import contextlib
+import sys
+
+__all__ = [
+    'Library',
+    'impl',
+    'define',
+    'fallthrough_kernel',
+    'impl_abstract',
+    'get_ctx',
+]
+
+# Set containing the combination of (namespace, operator, DispatchKey) for which a new kernel has been registered
+# The keys in the set are of the form `namespace + "/" + op_name + "/" + dispatch_key`.
+# This set is maintained to ensure that two libraries don't try to override the exact same functionality to avoid
+# libraries calling into kernels not intended to be called.
+_impls: Set[str] = set()
+_defs: Set[str] = set()
+
+# prim is reserved by TorchScript interpreter
+_reserved_namespaces = ['prim']
+
+def fallthrough_kernel():
+    """
+    A dummy function to pass to ``Library.impl`` in order to register a fallthrough; raises NotImplementedError if called.
+    """
+    raise NotImplementedError("fallthrough_kernel() should never be called.")
+
+class Library:
+    """
+    A class to create libraries that can be used to register new operators or
+    override operators in existing libraries from Python.
+    A user can optionally pass in a dispatch keyname if they only want to register
+    kernels corresponding to only one specific dispatch key.
+
+    To create a library to override operators in an existing library (with name ns), set the kind to "IMPL".
+    To create a new library (with name ns) to register new operators, set the kind to "DEF".
+    To create a fragment of a possibly existing library to register operators (and bypass
+    the limitation that there is only one library for a given namespace), set the kind to
+    "FRAGMENT".
+
+    Args:
+        ns: library name
+        kind: "DEF", "IMPL" or "FRAGMENT" (required; any other value raises ValueError)
+        dispatch_key: PyTorch dispatch key (default: "")
+    """
+    def __init__(self, ns, kind, dispatch_key=""):
+        if kind not in ('IMPL', 'DEF', 'FRAGMENT'):
+            raise ValueError(f"Unsupported kind: {kind}")
+
+        if ns in _reserved_namespaces and (kind == "DEF" or kind == 'FRAGMENT'):
+            raise ValueError(f"{ns} is a reserved namespace. Please try creating a library with another name.")
+
+        frame = traceback.extract_stack(limit=3)[0]
+        filename, lineno = frame.filename, frame.lineno
+        self.m: Optional[Any] = torch._C._dispatch_library(kind, ns, dispatch_key, filename, lineno)
+        self.ns = ns
+        self._op_defs: Set[str] = set()
+        self._op_impls: Set[str] = set()
+        self._registration_handles: List["torch._library.utils.RegistrationHandle"] = []
+        self.kind = kind
+        self.dispatch_key = dispatch_key
+        # Use a finalizer to setup the "destructor" instead of __del__.
+        # Python __del__ can lead to weird things (globals and locals may already
+        # be gone when __del__ actually gets called!). finalizers help the
+        # situation because it lets us capture references and keeps them alive
+        weakref.finalize(self, _del_library, _impls, self._op_impls, _defs, self._op_defs, self._registration_handles)
+
+    def __repr__(self):
+        return f"Library(kind={self.kind}, ns={self.ns}, dispatch_key={self.dispatch_key})"
+
+    def define(self, schema, alias_analysis="", *, tags=()):
+        r'''Defines a new operator and its semantics in the ns namespace.
+
+        Args:
+            schema: function schema to define a new operator.
+            alias_analysis (optional): Indicates if the aliasing properties of the operator arguments can be
+                                       inferred from the schema (default behavior) or not ("CONSERVATIVE").
+            tags (Tag | Sequence[Tag]): one or more torch.Tag to apply to this
+                                       operator. Tagging an operator changes the operator's behavior
+                                       under various PyTorch subsystems; please read the docs for the
+                                       torch.Tag carefully before applying it.
+
+        Returns:
+            name of the operator as inferred from the schema.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LIBRARY)
+            >>> my_lib = Library("foo", "DEF")
+            >>> my_lib.define("sum(Tensor self) -> Tensor")
+        '''
+        # This is added because we also want to disallow PURE_FUNCTION alias analysis which is a valid
+        # AliasAnalysis type in C++
+        if alias_analysis not in ["", "FROM_SCHEMA", "CONSERVATIVE"]:
+            raise RuntimeError(f"Invalid alias_analysis type {alias_analysis}")
+        assert self.m is not None
+        if isinstance(tags, torch.Tag):
+            tags = (tags,)
+        result = self.m.define(schema, alias_analysis, tuple(tags))
+        qualname = self.ns + "::" + schema.split("(")[0]
+        self._op_defs.add(qualname)
+        _defs.add(qualname)
+        return result
+
+    def impl(self, op_name, fn, dispatch_key=''):
+        r'''Registers the function implementation for an operator defined in the library.
+
+        Args:
+            op_name: operator name (along with the overload) or OpOverload object.
+            fn: function that's the operator implementation for the input dispatch key or :func:`~fallthrough_kernel`
+                to register a fallthrough.
+            dispatch_key: dispatch key that the input function should be registered for. By default, it uses
+                          the dispatch key that the library was created with.
+
+        Example::
+            >>> my_lib = Library("aten", "IMPL")
+            >>> def div_cpu(self, other):
+            >>>     return self * (1 / other)
+            >>> my_lib.impl("div.Tensor", div_cpu, "CPU")
+        '''
+        if not callable(fn):
+            raise TypeError(f"Input function is required to be a callable but found type {type(fn)}")
+        if dispatch_key == '':
+            dispatch_key = self.dispatch_key
+
+        if isinstance(op_name, str):
+            name = op_name
+        elif isinstance(op_name, OpOverload):
+            name = op_name._schema.name
+            overload_name = op_name._schema.overload_name
+            if overload_name != '':
+                name = name + '.' + overload_name
+        else:
+            raise RuntimeError("impl should be passed either a name or an OpOverload object as the first argument")
+
+        key = self.ns + "/" + name.split("::")[-1] + "/" + dispatch_key
+        if key in _impls:
+            # TODO: in future, add more info about where the existing function is registered (this info is
+            # today already returned by the C++ warning when impl is called but we error out before that)
+            raise RuntimeError("This is not allowed since there's already a kernel registered from python overriding {}"
+                               "'s behavior for {} dispatch key and {} namespace.".
+                               format(name.split("::")[-1], dispatch_key, self.ns))
+
+        if dispatch_key == "Meta":
+            dispatcher_op_name = name
+            if '::' not in dispatcher_op_name:
+                dispatcher_op_name = f'{self.ns}::{dispatcher_op_name}'
+
+            # Internally, we shouldn't be registering meta kernels for any operators that
+            # have CompositeImplicitAutograd kernels.
+            # Instead, we should be letting those decompositions run, and writing meta kernels
+            # only for the base operators.
+            if torch._C._dispatch_has_kernel_for_dispatch_key(dispatcher_op_name, "CompositeImplicitAutograd"):
+                raise RuntimeError(
+                    f"We should not register a meta kernel directly to the operator '{name}',"
+                    " because it has a CompositeImplicitAutograd kernel in core."
+                    " Instead we should let the operator decompose, and ensure that we have meta kernels"
+                    " for the base ops that it decomposes into.")
+
+        assert self.m is not None
+        self.m.impl(name, dispatch_key if dispatch_key != "" else "CompositeImplicitAutograd", fn)
+
+        _impls.add(key)
+        self._op_impls.add(key)
+
+    def _destroy(self):  # resets the underlying library, destroys handles, drops cached torch.ops attributes
+        if self.m is not None:
+            self.m.reset()
+        self.m = None
+        for handle in self._registration_handles:
+            handle.destroy()
+        self._registration_handles.clear()
+        for name in self._op_defs:
+            # Delete the cached torch.ops.ns.foo if it was registered.
+            # Otherwise, accessing it leads to a segfault.
+            # It's possible that we only registered an overload in this Library
+            # and another library owns an alive overload.
+            # That's OK - the next time torch.ops.ns.foo gets called, it'll be
+            # recomputed to point at the right collection of overloads.
+            ns, name_with_overload = name.split("::")
+            name = name_with_overload.split(".")[0]
+            if not hasattr(torch.ops, ns):
+                continue
+            namespace = getattr(torch.ops, ns)
+            if not hasattr(namespace, name):
+                continue
+            delattr(namespace, name)
+
+
+def _del_library(captured_impls, op_impls, captured_defs, op_defs, registration_handles):
+    captured_impls.difference_update(op_impls)  # drop this library's kernel keys from the shared set
+    captured_defs.difference_update(op_defs)  # drop this library's operator definitions as well
+    for registration in registration_handles:
+        registration.destroy()
+
+
+@contextlib.contextmanager
+def _scoped_library(*args, **kwargs):
+    lib = Library(*args, **kwargs)  # built outside `try`: a constructor error must not hit `finally` with `lib` unbound
+    try:
+        yield lib
+    finally:
+        lib._destroy()  # always tear the library down when the `with` block exits
+
+
+_keep_alive: List[Library] = []
+
+
+NAMELESS_SCHEMA = re.compile(r"\(.*\) -> .*")
+
+
+@functools.singledispatch
+def define(qualname, schema, *, lib=None, tags=()):
+    r"""Defines a new operator.
+
+    In PyTorch, defining an op (short for "operator") is a two-step process:
+    - we need to define the op (by providing an operator name and schema)
+    - we need to implement behavior for how the operator interacts with
+      various PyTorch subsystems, like CPU/CUDA Tensors, Autograd, etc.
+
+    This entrypoint defines the custom operator (the first step);
+    you must then perform the second step by calling various
+    ``impl_*`` APIs, like :func:`torch.library.impl` or
+    :func:`torch.library.impl_abstract`.
+
+    Args:
+        qualname (str): The qualified name for the operator. Should be
+            a string that looks like "namespace::name", e.g. "aten::sin".
+            Operators in PyTorch need a namespace to
+            avoid name collisions; a given operator may only be created once.
+            If you are writing a Python library, we recommend the namespace to
+            be the name of your top-level module.
+        schema (str): The schema of the operator. E.g. "(Tensor x) -> Tensor"
+            for an op that accepts one Tensor and returns one Tensor. It does
+            not contain the operator name (that is passed in ``qualname``).
+        lib (Optional[Library]): If provided, the lifetime of this operator
+            will be tied to the lifetime of the Library object.
+        tags (Tag | Sequence[Tag]): one or more torch.Tag to apply to this
+            operator. Tagging an operator changes the operator's behavior
+            under various PyTorch subsystems; please read the docs for the
+            torch.Tag carefully before applying it.
+
+    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LIBRARY)
+        >>> import torch
+        >>> import numpy as np
+        >>>
+        >>> # Define the operator
+        >>> torch.library.define("mylib::sin", "(Tensor x) -> Tensor")
+        >>>
+        >>> # Add implementations for the operator
+        >>> @torch.library.impl("mylib::sin", "cpu")
+        >>> def f(x):
+        >>>     return torch.from_numpy(np.sin(x.numpy()))
+        >>>
+        >>> # Call the new operator from torch.ops.
+        >>> x = torch.randn(3)
+        >>> y = torch.ops.mylib.sin(x)
+        >>> assert torch.allclose(y, x.sin())
+
+    """
+    if not isinstance(qualname, str):
+        raise ValueError(
+            f"define(qualname, schema): expected qualname "
+            f"to be instance of str, got {type(qualname)}")
+    namespace, name = torch._library.utils.parse_namespace(qualname)
+    if not NAMELESS_SCHEMA.fullmatch(schema):
+        raise ValueError(
+            f"define(qualname, schema, ...): expected schema "
+            f"to look like e.g. \"(Tensor x) -> Tensor\" but "
+            f"got \"{schema}\"")
+    if lib is None:  # only after validation, so a bad schema does not leave a kept-alive Library behind
+        lib = Library(namespace, "FRAGMENT")
+        _keep_alive.append(lib)
+    lib.define(name + schema, alias_analysis="", tags=tags)
+
+
+@define.register
+def _(lib: Library, schema, alias_analysis=""):
+    """The old torch.library.define: returns a decorator that defines ``schema`` on
+    ``lib`` and registers the decorated function as its impl. Kept around for BC reasons
+    """
+    def wrap(f):
+        name = lib.define(schema, alias_analysis)
+        lib.impl(name, f)
+        return f
+    return wrap
+
+
+@functools.singledispatch
+def impl(qualname, types, func=None, *, lib=None):
+    """Register an implementation for a device type for this operator.
+
+    You may pass "default" for ``types`` to register this implementation as the
+    default implementation for ALL device types.
+    Please only use this if the implementation truly supports all device types;
+    for example, this is true if it is a composition of built-in PyTorch operators.
+
+    Some valid types are: "cpu", "cuda", "xla", "mps", "ipu", "xpu".
+
+    Args:
+        qualname (str): Should be a string that looks like "namespace::operator_name".
+        types (str | Sequence[str]): The device types to register an impl to.
+        func (Callable, optional): The implementation. If omitted, a decorator is returned.
+        lib (Optional[Library]): If provided, the lifetime of this registration
+            will be tied to the lifetime of the Library object.
+
+    Examples:
+        >>> import torch
+        >>> import numpy as np
+        >>>
+        >>> # Define the operator
+        >>> torch.library.define("mylibrary::sin", "(Tensor x) -> Tensor")
+        >>>
+        >>> # Add implementations for the cpu device
+        >>> @torch.library.impl("mylibrary::sin", "cpu")
+        >>> def f(x):
+        >>>     return torch.from_numpy(np.sin(x.numpy()))
+        >>>
+        >>> x = torch.randn(3)
+        >>> y = torch.ops.mylibrary.sin(x)
+        >>> assert torch.allclose(y, x.sin())
+    """
+    if isinstance(types, str):
+        types = (types,)
+    keys = set()
+    for typ in types:
+        is_dispatch_key = torch._C._parse_dispatch_key(typ)
+        if is_dispatch_key:
+            # We also support passing a DispatchKey to impl. Please prefer the
+            # higher-level torch.library APIs and only pass a DispatchKey here
+            # with caution (or even better, don't use this option and file an
+            # issue on GitHub for what you need). We don't advertise this to
+            # users because it is very easy to shoot yourself in the foot.
+            keys.add(typ)
+        else:
+            keys.add(_device_type_to_key(typ))
+
+    def register(func):
+        namespace, _ = torch._library.utils.parse_namespace(qualname)
+        if lib is None:
+            use_lib = Library(namespace, "FRAGMENT")
+            _keep_alive.append(use_lib)
+        else:
+            use_lib = lib
+        for key in keys:
+            use_lib.impl(qualname, func, key)
+        return func  # decorator form: keep the decorated name bound to the function, not None
+
+    if func is None:
+        return register
+    register(func)  # direct-call form still returns None, as before
+
+
+def _device_type_to_key(device_type: str) -> str:
+    if device_type != "default":
+        return torch._C._dispatch_key_for_device(device_type)
+    # "default" maps to CompositeExplicitAutograd. This is technically not
+    # correct: every device_type DispatchKey is included in that key, but not
+    # everything in CompositeExplicitAutograd is associated with a device_type.
+    # The difference is deliberately ignored here.
+    return "CompositeExplicitAutograd"
+
+
+@impl.register
+def _(lib: Library, name, dispatch_key=""):
+    """Legacy torch.library.impl API: returns a decorator that calls ``lib.impl(name, f, dispatch_key)``. Kept around for BC"""
+    def wrap(f):
+        lib.impl(name, f, dispatch_key)
+        return f
+    return wrap
+
+
+
+def impl_abstract(qualname, func=None, *, lib=None, _stacklevel=1):
+    r"""Register an abstract implementation for this operator.
+
+    An "abstract implementation" specifies the behavior of this operator on
+    Tensors that carry no data. Given some input Tensors with certain properties
+    (sizes/strides/storage_offset/device), it specifies what the properties of
+    the output Tensors are.
+
+    The abstract implementation has the same signature as the operator.
+    It is run for both FakeTensors and meta tensors. To write an abstract
+    implementation, assume that all Tensor inputs to the operator are
+    regular CPU/CUDA/Meta tensors, but they do not have storage, and
+    you are trying to return regular CPU/CUDA/Meta tensor(s) as output.
+    The abstract implementation must consist of only PyTorch operations
+    (and may not directly access the storage or data of any input or
+    intermediate Tensors).
+
+    This API may be used as a decorator (see examples).
+
+    For a detailed guide on custom ops, please see
+    https://docs.google.com/document/d/1W--T6wz8IY8fOI0Vm8BF44PdBgs283QvpelJZWieQWQ/edit
+
+    Examples:
+        >>> import torch
+        >>> import numpy as np
+        >>> from torch import Tensor
+        >>>
+        >>> # Example 1: an operator without data-dependent output shape
+        >>> torch.library.define(
+        >>>     "mylib::custom_linear",
+        >>>     "(Tensor x, Tensor weight, Tensor bias) -> Tensor")
+        >>>
+        >>> @torch.library.impl_abstract("mylib::custom_linear")
+        >>> def custom_linear_abstract(x, weight, bias):
+        >>>     assert x.dim() == 2
+        >>>     assert weight.dim() == 2
+        >>>     assert bias.dim() == 1
+        >>>     assert x.shape[1] == weight.shape[1]
+        >>>     assert weight.shape[0] == bias.shape[0]
+        >>>     assert x.device == weight.device
+        >>>
+        >>>     return (x @ weight.t()) + bias
+        >>>
+        >>> # Example 2: an operator with data-dependent output shape
+        >>> torch.library.define("mylib::custom_nonzero", "(Tensor x) -> Tensor")
+        >>>
+        >>> @torch.library.impl_abstract("mylib::custom_nonzero")
+        >>> def custom_nonzero_abstract(x):
+        >>>     # Number of nonzero-elements is data-dependent.
+        >>>     # Since we cannot peek at the data in an abstract impl,
+        >>>     # we use the ctx object to construct a new symint that
+        >>>     # represents the data-dependent size.
+        >>>     ctx = torch.library.get_ctx()
+        >>>     nnz = ctx.new_dynamic_size()
+        >>>     shape = [nnz, x.dim()]
+        >>>     result = x.new_empty(shape, dtype=torch.int64)
+        >>>     return result
+        >>>
+        >>> @torch.library.impl("mylib::custom_nonzero", "cpu")
+        >>> def custom_nonzero_cpu(x):
+        >>>     x_np = x.numpy()
+        >>>     res = np.stack(np.nonzero(x_np), axis=1)
+        >>>     return torch.tensor(res, device=x.device)
+
+    """
+    source = torch._library.utils.get_source(_stacklevel + 1)
+    frame = sys._getframe(_stacklevel)
+    caller_module = inspect.getmodule(frame)
+    # Can be None if inspect.getmodule cannot resolve a module for the calling
+    # frame; in that case the pystub check below is skipped.
+    caller_module_name = None if caller_module is None else caller_module.__name__
+
+    # TODO(rzou): We're gonna need to stage this change with torchvision,
+    # since torchvision is github first.
+    if caller_module_name is not None and caller_module_name.startswith("torchvision."):
+        caller_module_name = None
+
+    def inner(func):
+        entry = torch._library.simple_registry.singleton.find(qualname)
+        if caller_module_name is not None:
+            func_to_register = _check_pystubs_once(func, qualname, caller_module_name)
+        else:
+            func_to_register = func
+
+        handle = entry.abstract_impl.register(func_to_register, source)
+        if lib is not None:
+            lib._registration_handles.append(handle)
+        return func
+
+    if func is None:
+        return inner
+    return inner(func)
+
+
+# If the op was defined in C++, then we want to make sure there was an
+# m.impl_abstract_pystub(module, ...) call and that the module is the
+# same as the module that called torch.library.impl_abstract.
+def _check_pystubs_once(func, qualname, actual_module_name):
+    checked = False  # flipped to True after the first successful validation
+
+    def inner(*args, **kwargs):
+        nonlocal checked
+        if checked:
+            return func(*args, **kwargs)
+
+        op = torch._library.utils.lookup_op(qualname)
+        if op._defined_in_python:
+            checked = True
+            return func(*args, **kwargs)
+
+        maybe_pystub = torch._C._dispatch_pystub(
+            op._schema.name,
+            op._schema.overload_name)
+        if not maybe_pystub:
+            namespace = op.namespace
+            cpp_filename = op._handle().debug()
+            raise RuntimeError(
+                f"Operator '{qualname}' was defined in C++ and has a Python "
+                f"abstract impl. In this situation, we require there to also be a "
+                f"companion C++ `m.impl_abstract_pystub(\"{actual_module_name}\")` "
+                f"call, but we could not find one. Please add that "
+                f"to the top of the C++ TORCH_LIBRARY({namespace}, ...) block the "
+                f"operator was registered in ({cpp_filename})")
+        pystub_module = maybe_pystub[0]
+        if actual_module_name != pystub_module:
+            cpp_filename = op._handle().debug()
+            raise RuntimeError(
+                f"Operator '{qualname}' specified that its python abstract impl "
+                f"is in the Python module '{pystub_module}' but it was actually found "
+                f"in '{actual_module_name}'. Please either move the abstract impl "
+                f"or correct the m.impl_abstract_pystub call ({cpp_filename})")
+        checked = True
+        return func(*args, **kwargs)
+    return inner
+
+
+# NOTE [ctx inside the fake implementation]
+# If a user has an operator with data-dependent output shape, then when writing
+# a fake implementation they must query the current ctx and use methods on the
+# ctx to construct a new unbacked symint.
+#
+# This is done via us setting the global_ctx_getter function every time a fake
+# implementation is invoked.
+def get_ctx() -> "torch._library.abstract_impl.AbstractImplCtx":
+    """get_ctx() returns the current AbstractImplCtx object.
+
+    Calling ``get_ctx()`` is only valid inside of an abstract impl
+    (see :func:`torch.library.impl_abstract` for more usage details).
+    """
+    return torch._library.abstract_impl.global_ctx_getter()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/overrides.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/overrides.py
new file mode 100644
index 0000000000000000000000000000000000000000..8076802e48a7ca9a11340045ea516f243bc6ad05
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/overrides.py
@@ -0,0 +1,1973 @@
+"""
+Python implementation of ``__torch_function__``
+
+While most of the torch API and handling for ``__torch_function__`` happens
+at the C++ level, some of the torch API is written in Python so we need
+python-level handling for ``__torch_function__`` overrides as well. The main
+developer-facing functions in this file are ``handle_torch_function`` and
+``has_torch_function``. See torch/functional.py and test/test_overrides.py
+for usage examples.
+
+Note
+----
+heavily inspired by NumPy's ``__array_function__`` (see:
+https://github.com/pytorch/pytorch/issues/24015 and
+https://www.numpy.org/neps/nep-0018-array-function-protocol.html
+)
+
+If changing this file in a way that can affect ``__torch_function__`` overhead,
+please report the benchmarks in ``benchmarks/overrides_benchmark``. See the
+instructions in the ``README.md`` in that directory.
+"""
+
+import __future__  # noqa: F404
+
+import collections
+import functools
+import types
+import warnings
+from typing import Dict, Set, List, Any, Callable, Iterable, Type, Tuple
+from functools import wraps
+import contextlib
+
+import torch
+from torch._C import (
+    _has_torch_function, _has_torch_function_unary,
+    _has_torch_function_variadic, _add_docstr,
+    _push_on_torch_function_stack, _pop_torch_function_stack, _get_function_stack_at, _len_torch_function_stack,
+    _is_torch_function_mode_enabled)
+
+__all__ = [
+    "get_ignored_functions",
+    "get_overridable_functions",
+    "get_testing_overrides",
+    "handle_torch_function",
+    "has_torch_function",
+    "resolve_name",
+    "is_tensor_like",
+    "is_tensor_method_or_property",
+    "wrap_torch_function",
+    "enable_reentrant_dispatch",
+]
+
+
+def _disable_user_warnings(
+        func: Callable, regex: str = '.*is deprecated, please use.*', module: str = 'torch') -> Callable:
+    """
+    Decorator that suppresses ``UserWarning``s issued from ``module`` while ``func`` runs, if the warning message
+    matches the given ``regex`` pattern.
+
+    Arguments
+    ---------
+    func : function
+        Function to disable the warnings for.
+    regex : str
+        A regex pattern compilable by ``re.compile``. It must match the start of the ``UserWarning`` message.
+    module : str
+        A regex that must match the start of the name of the module issuing the warning.
+
+    Returns
+    -------
+    function
+        The wrapped function.
+    """
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        with warnings.catch_warnings():  # the filter added below is discarded when this context exits
+            warnings.filterwarnings("ignore", category=UserWarning, message=regex, module=module)
+            return func(*args, **kwargs)
+    return wrapper
+
+
+@functools.lru_cache(None)  # unbounded cache: the set is built once and every caller shares it
+@_disable_user_warnings
+def get_ignored_functions() -> Set[Callable]:
+    """
+    Return public functions that cannot be overridden by ``__torch_function__``.
+
+    Returns
+    -------
+    Set[Callable]
+        A set of functions that are publicly available in the torch API but cannot
+        be overridden with ``__torch_function__``. Mostly this is because none of the
+        arguments of these functions are tensors or tensor-likes.
+
+    Examples
+    --------
+    >>> torch.Tensor.as_subclass in torch.overrides.get_ignored_functions()
+    True
+    >>> torch.add in torch.overrides.get_ignored_functions()
+    False
+    """
+    Tensor = torch.Tensor  # short local alias for the Tensor.* entries below
+    return {
+        torch.typename,
+        torch.is_tensor,
+        torch.is_storage,
+        torch.set_default_tensor_type,
+        torch.set_default_device,
+        torch.get_default_device,
+        torch.set_rng_state,
+        torch.get_rng_state,
+        torch.manual_seed,
+        torch.initial_seed,
+        torch.seed,
+        torch.save,
+        torch.load,
+        torch.set_printoptions,
+        torch.fork,
+        torch.get_default_dtype,
+        torch.get_num_interop_threads,
+        torch.get_num_threads,
+        torch.init_num_threads,
+        torch.import_ir_module,
+        torch.import_ir_module_from_buffer,
+        torch.is_anomaly_enabled,
+        torch.is_anomaly_check_nan_enabled,
+        torch.is_grad_enabled,
+        torch.merge_type_from_type_comment,
+        torch.parse_ir,
+        torch.parse_schema,
+        torch.parse_type_comment,
+        torch.set_anomaly_enabled,
+        torch.set_flush_denormal,
+        torch.set_num_interop_threads,
+        torch.set_num_threads,
+        torch.wait,
+        torch.as_tensor,
+        torch.from_numpy,
+        torch.get_device,
+        torch.tensor,
+        torch.default_generator,
+        torch.has_cuda,
+        torch.has_cudnn,
+        torch.has_lapack,
+        torch.device,
+        torch.dtype,
+        torch.finfo,
+        torch.has_mkl,
+        torch.has_mps,
+        torch.has_mkldnn,
+        torch.has_openmp,
+        torch.iinfo,
+        torch.memory_format,
+        torch.qscheme,
+        torch.set_grad_enabled,
+        torch.no_grad,
+        torch.enable_grad,
+        torch.inference_mode,
+        torch.is_inference_mode_enabled,
+        torch.layout,
+        torch.align_tensors,
+        torch.arange,
+        torch.as_strided,
+        torch.bartlett_window,
+        torch.blackman_window,
+        torch.broadcast_shapes,
+        torch.can_cast,
+        torch.compile,
+        torch.cudnn_affine_grid_generator,
+        torch.cudnn_batch_norm,
+        torch.cudnn_convolution,
+        torch.cudnn_convolution_transpose,
+        torch.cudnn_convolution_relu,
+        torch.cudnn_convolution_add_relu,
+        torch.cudnn_grid_sampler,
+        torch.cudnn_is_acceptable,
+        torch.empty,
+        torch.empty_permuted,
+        torch.empty_strided,
+        torch.empty_quantized,
+        torch.export.dynamic_dim,
+        torch.export.export,
+        torch.export.load,
+        torch.export.register_dataclass,
+        torch.export.save,
+        torch.eye,
+        torch.fft.fftfreq,
+        torch.fft.rfftfreq,
+        torch.from_file,
+        torch.full,
+        torch.fill,
+        torch.hamming_window,
+        torch.hann_window,
+        torch.kaiser_window,
+        torch.linspace,
+        torch.logspace,
+        torch.mkldnn_adaptive_avg_pool2d,
+        torch.mkldnn_convolution,
+        torch.mkldnn_max_pool2d,
+        torch.mkldnn_max_pool3d,
+        torch.mkldnn_linear_backward_weights,
+        torch.mkldnn_rnn_layer,
+        torch.normal,
+        torch.ones,
+        torch.promote_types,
+        torch.rand,
+        torch.randn,
+        torch.randint,
+        torch.randperm,
+        torch.range,
+        torch.result_type,
+        torch.scalar_tensor,
+        torch.sparse_coo_tensor,
+        torch.sparse_compressed_tensor,
+        torch.sparse_csr_tensor,
+        torch.sparse_csc_tensor,
+        torch.sparse_bsr_tensor,
+        torch.sparse_bsc_tensor,
+        torch.sym_constrain_range,
+        torch.sym_constrain_range_for_size,
+        torch.tril_indices,
+        torch.triu_indices,
+        torch.vander,
+        torch.zeros,
+        torch._jit_internal.boolean_dispatch,
+        torch.nn.functional.assert_int_or_pair,
+        torch.nn.functional.upsample,
+        torch.nn.functional.upsample_bilinear,
+        torch.nn.functional.upsample_nearest,
+        torch.nn.functional.has_torch_function,
+        torch.nn.functional.has_torch_function_unary,
+        torch.nn.functional.has_torch_function_variadic,
+        torch.nn.functional.handle_torch_function,
+        torch.nn.functional.sigmoid,
+        torch.nn.functional.hardsigmoid,
+        torch.nn.functional.tanh,
+        torch.nn.functional._canonical_mask,
+        torch.nn.functional._none_or_dtype,
+        # Doesn't actually take or return tensor arguments
+        torch.nn.init.calculate_gain,
+        # These are deprecated; don't test them
+        torch.nn.init.uniform,
+        torch.nn.init.normal,
+        torch.nn.init.constant,
+        torch.nn.init.eye,
+        torch.nn.init.dirac,
+        torch.nn.init.xavier_uniform,
+        torch.nn.init.xavier_normal,
+        torch.nn.init.kaiming_uniform,
+        torch.nn.init.kaiming_normal,
+        torch.nn.init.orthogonal,
+        torch.nn.init.sparse,
+        torch.nested.to_padded_tensor,
+        has_torch_function,
+        handle_torch_function,
+        torch.set_autocast_enabled,
+        torch.is_autocast_enabled,
+        torch.clear_autocast_cache,
+        torch.set_autocast_cpu_enabled,
+        torch.is_autocast_cpu_enabled,
+        torch.set_autocast_xla_enabled,
+        torch.is_autocast_xla_enabled,
+        torch.set_autocast_ipu_enabled,
+        torch.is_autocast_ipu_enabled,
+        torch.set_autocast_cpu_dtype,
+        torch.get_autocast_cpu_dtype,
+        torch.set_autocast_ipu_dtype,
+        torch.get_autocast_ipu_dtype,
+        torch.get_autocast_gpu_dtype,
+        torch.set_autocast_gpu_dtype,
+        torch.get_autocast_xla_dtype,
+        torch.set_autocast_xla_dtype,
+        torch.autocast_increment_nesting,
+        torch.autocast_decrement_nesting,
+        torch.is_autocast_cache_enabled,
+        torch.set_autocast_cache_enabled,
+        torch.nn.functional.hardswish,
+        torch.is_vulkan_available,
+        torch.are_deterministic_algorithms_enabled,
+        torch.use_deterministic_algorithms,
+        torch.is_deterministic_algorithms_warn_only_enabled,
+        torch.set_deterministic_debug_mode,
+        torch.get_deterministic_debug_mode,
+        torch.set_float32_matmul_precision,
+        torch.get_float32_matmul_precision,
+        torch.unify_type_list,
+        torch.is_warn_always_enabled,
+        torch.set_warn_always,
+        torch.vitals_enabled,
+        torch.set_vital,
+        torch.read_vitals,
+        torch.vmap,
+        torch.cond,
+        torch.frombuffer,
+        torch.asarray,
+        torch._functional_sym_constrain_range,
+        torch._make_dep_token,
+        Tensor.__delitem__,
+        Tensor.__dir__,
+        Tensor.__getattribute__,
+        Tensor.__init__,
+        Tensor.__iter__,
+        Tensor.__init_subclass__,
+        Tensor.__delattr__,
+        Tensor.__setattr__,
+        Tensor.__torch_function__,
+        Tensor.__torch_dispatch__,
+        Tensor.__new__,
+        Tensor.__class__,
+        Tensor.__subclasshook__,
+        Tensor.__hash__,
+        Tensor.as_subclass,
+        Tensor.eig,
+        Tensor.lstsq,
+        Tensor.reinforce,
+        Tensor.new,
+        Tensor.new_tensor,
+        Tensor.new_empty,
+        Tensor.new_empty_strided,
+        Tensor.new_zeros,
+        Tensor.new_ones,
+        Tensor.new_full,
+        Tensor._make_subclass,
+        Tensor.solve,
+        Tensor.symeig,
+        Tensor.stride,
+        Tensor.unflatten,
+        Tensor.to_sparse_coo,
+        Tensor.to_sparse_csr,
+        Tensor.to_sparse_csc,
+        Tensor.to_sparse_bsr,
+        Tensor.to_sparse_bsc,
+        Tensor._to_sparse,
+        Tensor._to_sparse_csr,
+        Tensor._to_sparse_csc,
+        Tensor._to_sparse_bsr,
+        Tensor._to_sparse_bsc,
+        Tensor._typed_storage,
+        Tensor._reduce_ex_internal,
+        Tensor._fix_weakref,
+        Tensor._view_func,
+        Tensor._view_func_unsafe,
+        Tensor._rev_view_func_unsafe,
+        Tensor._make_wrapper_subclass,
+        Tensor._python_dispatch.__get__,
+        Tensor._has_symbolic_sizes_strides.__get__,
+        Tensor._conj,
+        Tensor._conj_physical,
+        Tensor._lazy_clone,
+        Tensor._neg_view,
+        Tensor._is_zerotensor,
+        Tensor._is_all_true,
+        Tensor._is_any_true,
+        Tensor._addmm_activation,
+        Tensor.to_padded_tensor,
+    }
+
+
+@functools.lru_cache(None)  # unbounded cache: the set is built once and every caller shares it
+def get_default_nowrap_functions() -> Set[Callable]:
+    """
+    Return public functions that do not wrap in a subclass when invoked by
+    the default ``Tensor.__torch_function__`` that preserves subclasses.  Typically,
+    these functions represent field accesses (i.e., retrieving a Tensor that
+    is stored somewhere on the Tensor) as opposed to computation.  Users of
+    these functions expect object identity to be preserved over multiple accesses
+    (e.g., ``a.grad is a.grad``) which cannot be upheld if we're wrapping on
+    the fly every time (furthermore, the tensor stored here might already be
+    the subclass, in which case wrapping really ought not to happen).
+
+    Not ALL property accessors have this property; for example ``Tensor.T`` actually
+    just creates a new transposed tensor on the fly, and so we SHOULD interpose on
+    these calls (you need to check the implementation of the function to see if
+    this is the case or not).  Additionally, if a property accessor doesn't return a Tensor,
+    it doesn't have to be on this list (though it is harmless if it is).
+    """
+    Tensor = torch.Tensor  # short local alias for the entries below
+    return {
+        Tensor._base.__get__,  # entries are the descriptors' ``__get__`` callables, not the attributes themselves
+        Tensor.grad.__get__,
+        Tensor._grad.__get__,
+    }
+
+
+@functools.lru_cache(None)
+@_disable_user_warnings
+def get_testing_overrides() -> Dict[Callable, Callable]:
+    """Return a dict containing dummy overrides for all overridable functions
+
+    Returns
+    -------
+    Dict[Callable, Callable]
+        A dictionary that maps overridable functions in the PyTorch API to
+        lambda functions that have the same signature as the real function
+        and unconditionally return -1. These lambda functions are useful
+        for testing API coverage for a type that defines ``__torch_function__``.
+
+    Examples
+    --------
+    >>> import inspect
+    >>> my_add = torch.overrides.get_testing_overrides()[torch.add]
+    >>> inspect.signature(my_add)
+    <Signature (input, other, out=None)>
+    """
+    # Every function in the PyTorch API that can be overridden needs an entry
+    # in this dict.
+    #
+    # Optimally we would use inspect to get the function signature and define
+    # the lambda function procedurally but that is blocked by generating
+    # function signatures for native kernels that can be consumed by inspect.
+    # See Issue #28233.
+    Tensor = torch.Tensor
+    ret: Dict[Callable, Callable] = {
+        torch.abs: lambda input, out=None: -1,
+        torch.absolute: lambda input, out=None: -1,
+        torch.adaptive_avg_pool1d: lambda input, output_size: -1,
+        torch.adaptive_max_pool1d: lambda inputs, output_size: -1,
+        torch.acos: lambda input, out=None: -1,
+        torch.adjoint: lambda input: -1,
+        torch.arccos: lambda input, out=None: -1,
+        torch.acosh: lambda input, out=None: -1,
+        torch.arccosh: lambda input, out=None: -1,
+        torch.add: lambda input, other, out=None: -1,
+        torch.addbmm: lambda input, batch1, batch2, alpha=1, beta=1, out=None: -1,
+        torch.addcdiv: lambda input, tensor1, tensor2, value=1, out=None: -1,
+        torch.addcmul: lambda input, tensor1, tensor2, value=1, out=None: -1,
+        torch.addmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1,
+        torch.addmv: lambda input, mat, vec, beta=1, alpha=1, out=None: -1,
+        torch.addr: lambda input, vec1, vec2, beta=1, alpha=1, out=None: -1,
+        torch.affine_grid_generator: lambda theta, size, align_corners: -1,
+        torch.all: lambda input, dim=None: -1,
+        torch.allclose: lambda input, other, trol=1e-05, atol=1e-08, equal_nan=False: -1,
+        torch.alpha_dropout: lambda input, p, train, inplace=False: -1,
+        torch.amax: lambda input, dim=None: -1,
+        torch.amin: lambda input, dim=None: -1,
+        torch.aminmax: lambda input, dim=None, keepdim=False, out=None: -1,
+        torch.angle: lambda input, out=None: -1,
+        torch.any: lambda input, dim=None, keepdim=False, out=None: -1,
+        torch.argmax: lambda input: -1,
+        torch.argmin: lambda input: -1,
+        torch.argsort: lambda input, dim=None: -1,
+        torch.asin: lambda input, out=None: -1,
+        torch._assert_async: lambda input, msg: -1,
+        torch.arcsin: lambda input, out=None: -1,
+        torch.asinh: lambda input, out=None: -1,
+        torch.arcsinh: lambda input, out=None: -1,
+        torch.atan: lambda input, out=None: -1,
+        torch.arctan: lambda input, out=None: -1,
+        torch.atan2: lambda input, other, out=None: -1,
+        torch.arctan2: lambda input, other, out=None: -1,
+        torch.atanh: lambda input, out=None: -1,
+        torch.arctanh: lambda input, out=None: -1,
+        torch.atleast_1d: lambda *tensors: -1,
+        torch.atleast_2d: lambda *tensors: -1,
+        torch.atleast_3d: lambda *tensors: -1,
+        torch.avg_pool1d: lambda input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True: -1,
+        torch.baddbmm: lambda input, batch1, batch2, alpha=1, beta=1, out=None: -1,
+        torch.batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled: -1,
+        torch.batch_norm_backward_elemt: lambda grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count_tensor: -1,
+        torch.batch_norm_backward_reduce: lambda grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g: -1,
+        torch.batch_norm_elemt: lambda input, weight, bias, mean, invstd, eps: -1,
+        torch.batch_norm_gather_stats: lambda input, mean, invstd, running_mean, running_var, momentum, eps, count: -1,
+        torch.batch_norm_gather_stats_with_counts: lambda input, mean, invstd, running_mean, running_var, momentum, eps, count: -1,
+        torch.batch_norm_stats: lambda input, eps: -1,
+        torch.batch_norm_update_stats: lambda input, running_mean, running_var, momentum: -1,
+        torch.bernoulli: lambda input, generator=None, out=None: -1,
+        torch.bilinear: lambda input1, input2, weight, bias: -1,
+        torch.binary_cross_entropy_with_logits: (lambda input, target, weight=None, size_average=None, reduce=None,
+                                                 reduction='mean', pos_weight=None: -1),
+        torch.bincount: lambda input, weights=None, minlength=0: -1,
+        torch.binomial: lambda count, prob, generator=None: -1,
+        torch.bitwise_and: lambda input, other, out=None: -1,
+        torch.bitwise_not: lambda input, out=None: -1,
+        torch.bitwise_or: lambda input, other, out=None: -1,
+        torch.bitwise_xor: lambda input, other, out=None: -1,
+        torch.bitwise_left_shift: lambda input, other, out=None: -1,
+        torch.bitwise_right_shift: lambda input, other, out=None: -1,
+        torch.block_diag: lambda *tensors: -1,
+        torch.bmm: lambda input, mat2, out=None: -1,
+        torch.broadcast_tensors: lambda *tensors: -1,
+        torch.broadcast_to: lambda self, size: -1,
+        torch.bucketize: lambda input, boundaries, out_int32=False, right=False, out=None: -1,
+        torch.cartesian_prod: lambda *tensors: -1,
+        torch.cat: lambda tensors, dim=0, out=None: -1,
+        torch.concat: lambda tensors, dim=0, out=None: -1,  # alias for torch.cat
+        torch.concatenate: lambda tensors, dim=0, out=None: -1,  # alias for torch.cat
+        torch.cdist: lambda x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary': -1,
+        torch.ceil: lambda input, out=None: -1,
+        torch.celu: lambda input, alpha=1., inplace=False: -1,
+        torch.chain_matmul: lambda *matrices, out=None: -1,
+        torch.channel_shuffle: lambda input, groups : -1,
+        torch.cholesky: lambda input, upper=False, out=None: -1,
+        torch.linalg.cholesky: lambda input, out=None: -1,
+        torch.linalg.cholesky_ex: lambda input, check_errors=False, out=None: -1,
+        torch.cholesky_inverse: lambda input, upper=False, out=None: -1,
+        torch.cholesky_solve: lambda input1, input2, upper=False, out=None: -1,
+        torch.choose_qparams_optimized: lambda input, numel, n_bins, ratio, bit_width: -1,
+        torch.chunk: lambda input, chunks, dim=0: -1,
+        torch.clamp: lambda input, min=None, max=None, out=None: -1,
+        torch.clip: lambda input, min=None, max=None, out=None: -1,
+        torch.clamp_min: lambda input, min, out=None: -1,
+        torch.clamp_max: lambda input, max, out=None: -1,
+        torch.column_stack: lambda tensors, out=None: -1,
+        torch.cov: lambda input, correction=1, fweights=None, aweights=None: -1,
+        torch.clone: lambda input: -1,
+        torch.combinations: lambda input, r=2, with_replacement=False: -1,
+        torch.complex: lambda real, imag: -1,
+        torch.copysign: lambda input, other, out=None: -1,
+        torch.polar: lambda abs, ang: -1,
+        torch.linalg.cond: lambda input, ord=None: -1,
+        torch.conj: lambda input, out=None: -1,
+        torch.conj_physical: lambda input, out=None: -1,
+        torch.resolve_conj: lambda input, out=None: -1,
+        torch.resolve_neg: lambda input, out=None: -1,
+        torch.constant_pad_nd: lambda input, pad, value=0: -1,
+        torch.conv1d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1,
+        torch.conv2d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1,
+        torch.conv3d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1,
+        torch.convolution: lambda input, weight, bias, stride, padding, dilation, transposed, output_adding, groups: -1,
+        torch.conv_tbc: lambda input, weight, bias, pad=0: -1,
+        torch.conv_transpose1d: lambda input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1: -1,
+        torch.conv_transpose2d: lambda input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1: -1,
+        torch.conv_transpose3d: lambda input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1: -1,
+        torch.corrcoef: lambda input: -1,
+        torch.cos: lambda input, out=None: -1,
+        torch.cosine_embedding_loss: lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1,
+        torch.cosh: lambda input, out=None: -1,
+        torch.cosine_similarity: lambda x1, x2, dim=1, eps=1e-8: -1,
+        torch.count_nonzero: lambda input: -1,
+        torch.cross: lambda input, other, dim=None, out=None: -1,
+        torch.linalg.cross: lambda input, other, dim=-1, out=None: -1,
+        torch.ctc_loss: (lambda log_probs, targets, input_lengths, target_lengths, blank=0, reduction='mean',
+                         zero_infinity=False: -1),
+        torch.cummax: lambda input, dim, out=None: -1,
+        torch.cummin: lambda input, dim, out=None: -1,
+        torch.cumprod: lambda input, dim, out=None, dtype=None: -1,
+        torch.cumsum: lambda input, dim, out=None, dtype=None: -1,
+        torch.cumulative_trapezoid: lambda y, x=None, dim=-1: -1,
+        torch.logcumsumexp: lambda input, dim, out=None: -1,
+        torch.deg2rad: lambda input, out=None: -1,
+        torch.dequantize: lambda input: -1,
+        torch.det: lambda input: -1,
+        torch.linalg.det: lambda input: -1,  # alias for torch.det  # type: ignore[attr-defined]
+        torch.detach: lambda input: -1,
+        torch.diag: lambda input, diagonal=0, out=None: -1,
+        torch.diag_embed: lambda input, diagonal=0, out=None: -1,
+        torch.diagflat: lambda input, offset=0: -1,
+        torch.diff: lambda input, n=1, dim=-1, prepend=None, append=None, out=None: -1,
+        torch.diagonal: lambda input, offset=0, dim1=0, dim2=1: -1,
+        torch.linalg.diagonal: lambda input, offset=0, dim1=-2, dim2=-1: -1,
+        torch.diagonal_scatter: lambda input, src, offset=0, dim1=0, dim2=1: -1,
+        torch.as_strided_scatter: lambda self, src, size, stride, storage_offset=None: -1,
+        torch.digamma: lambda input, out=None: -1,
+        torch.dist: lambda input, other, p=2: -1,
+        torch.div: lambda input, other, rounding_mode=None, out=None: -1,
+        torch.divide: lambda input, other, rounding_mode=None, out=None: -1,
+        torch.dot: lambda input, other, out=None: -1,
+        torch.dropout: lambda input, p, train, inplace=False: -1,
+        torch.dsmm: lambda input, mat2: -1,
+        torch.hsmm: lambda mat1, mat2: -1,
+        torch.dsplit: lambda input, indices_or_sections: -1,
+        torch.dstack: lambda tensors, out=None: -1,
+        torch.linalg.eig: lambda input, out=None: -1,
+        torch.linalg.eigvals: lambda input, out=None: -1,
+        torch.linalg.eigh: lambda input, UPLO="L", out=None: -1,
+        torch.linalg.eigvalsh: lambda input, UPLO="L", out=None: -1,
+        torch.einsum: lambda equation, *operands: -1,
+        torch.embedding: (lambda input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False,
+                          sparse=False: -1),
+        torch.embedding_bag: (lambda input, weight, offsets, max_norm=None, norm_type=2, scale_grad_by_freq=False,
+                              mode='mean', sparse=False, per_sample_weights=None, padding_idx=None: -1),
+        torch.empty_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1,
+        torch.eq: lambda input, other, out=None: -1,
+        torch.equal: lambda input, other: -1,
+        torch.erf: lambda input, out=None: -1,
+        torch.erfc: lambda input, out=None: -1,
+        torch.erfinv: lambda input, out=None: -1,
+        torch.exp: lambda input, out=None: -1,
+        torch.exp2: lambda input, out=None: -1,
+        torch.expm1: lambda input, out=None: -1,
+        torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1,
+        torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1,
+        torch.fused_moving_avg_obs_fake_quant: (lambda x, observer_on, fake_quant_on, averaging_const, running_min,
+                                                running_max, scale, zero_point, quant_min, quant_max, ch_axis,
+                                                per_row_fake_quant=False, symmetric_quant=False: -1),
+        torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1,
+        torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1,
+        torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1,
+        torch.fbgemm_linear_int8_weight_fp32_activation: (lambda input, weight, packed, col_offsets, weight_scale,
+                                                          weight_zero_point, bias: -1),
+        torch.fbgemm_linear_quantize_weight: lambda input: -1,
+        torch.fbgemm_pack_gemm_matrix_fp16: lambda input: -1,
+        torch.fbgemm_pack_quantized_matrix: lambda input, a, b: -1,
+        torch.feature_alpha_dropout: lambda input, p, train: -1,
+        torch.feature_dropout: lambda input, p, train: -1,
+        torch.fft.ifft: lambda input, n=None, dim=-1, norm=None: -1,
+        torch.fft.rfft: lambda input, n=None, dim=-1, norm=None: -1,
+        torch.fft.irfft: lambda input, n=None, dim=-1, norm=None: -1,
+        torch.fft.hfft: lambda input, n=None, dim=-1, norm=None: -1,
+        torch.fft.ihfft: lambda input, n=None, dim=-1, norm=None: -1,
+        torch.fft.hfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1,
+        torch.fft.ihfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1,
+        torch.fft.hfftn: lambda input, s=None, dim=-1, norm=None: -1,
+        torch.fft.ihfftn: lambda input, s=None, dim=-1, norm=None: -1,
+        torch.fft.fftn: lambda input, s=None, dim=None, norm=None: -1,
+        torch.fft.ifftn: lambda input, s=None, dim=None, norm=None: -1,
+        torch.fft.rfftn: lambda input, s=None, dim=None, norm=None: -1,
+        torch.fft.irfftn: lambda input, s=None, dim=None, norm=None: -1,
+        torch.fft.fft2: lambda input, s=None, dim=(-2, -1), norm=None: -1,
+        torch.fft.ifft2: lambda input, s=None, dim=(-2, -1), norm=None: -1,
+        torch.fft.rfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1,
+        torch.fft.irfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1,
+        torch.fft.fftshift: lambda input, dim=None: -1,
+        torch.fft.ifftshift: lambda input, dim=None: -1,
+        torch.fft.fft: lambda input, n=None, dim=-1, norm=None: -1,
+        torch.fix: lambda input, out=None: -1,
+        torch.flatten: lambda input, start_dim=0, end_dim=-1: -1,
+        torch.flip: lambda input, dims: -1,
+        torch.fliplr: lambda input: -1,
+        torch.flipud: lambda input: -1,
+        torch.frobenius_norm: lambda input, dim=None, keepdim=False, out=None: -1,
+        torch.floor: lambda input, out=None: -1,
+        torch.floor_divide: lambda input, other: -1,
+        torch.float_power: lambda input, exponent, out=None: -1,
+        torch.fmod: lambda input, other, out=None: -1,
+        torch.frac: lambda input, out=None: -1,
+        torch.frexp: lambda input, out=None: -1,
+        torch.full_like: lambda input, fill_value, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False: -1,
+        torch._functional_assert_async: lambda input, msg, dep_token: -1,
+        torch.lu_unpack: lambda LU_data, LU_pivots, unpack_data=True, unpack_pivots=True: -1,
+        torch.gather: lambda input, dim, index, out=None, sparse_grad=False: -1,
+        torch.gcd: lambda input, other, out=None: -1,
+        torch.ge: lambda input, other, out=None: -1,
+        torch.greater_equal: lambda input, other, out=None: -1,
+        torch.geqrf: lambda input, out=None: -1,
+        torch.i0: lambda input, out=None: -1,
+        torch.inner: lambda input, other, out=None: -1,
+        torch.outer: lambda input, vec2, out=None: -1,
+        torch.ger: lambda input, vec2, out=None: -1,  # alias for torch.outer
+        torch.gradient: lambda input, spacing=None, dim=None, edge_order=1: -1,
+        torch.grid_sampler: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1,
+        torch.grid_sampler_2d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1,
+        torch.grid_sampler_3d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1,
+        torch.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05, cudnn_enabled=True: -1,
+        torch.gru: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1,
+        torch.gru_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1,
+        torch.gt: lambda input, other, out=None: -1,
+        torch.greater: lambda input, other, out=None: -1,
+        torch.hardshrink: lambda input, lambd=0.5: -1,
+        torch.heaviside: lambda input, values, out=None: -1,
+        torch.hinge_embedding_loss: lambda input, target, margin=1.0, size_average=None, reduce=None, reduction='mean': -1,
+        torch.histc: lambda input, bins=100, min=0, max=0, out=None: -1,
+        torch.histogram: lambda input, bins=100, min=None, max=None, weight=None, density=False, out=None: -1,
+        torch.histogramdd: lambda input, bins, range=None, weight=None, density=False: -1,
+        torch.linalg.householder_product: lambda input, tau: -1,
+        torch.hspmm: lambda mat1, mat2, out=None: -1,
+        torch.hsplit: lambda input, indices_or_sections: -1,
+        torch.hstack: lambda tensors, out=None: -1,
+        torch.hypot: lambda input, other, out=None: -1,
+        torch.igamma: lambda input, other, out=None: -1,
+        torch.igammac: lambda input, other, out=None: -1,
+        torch.imag: lambda input, out=None: -1,
+        torch.index_add: lambda input, dim, index, source: -1,
+        torch.index_copy: lambda input, dim, index, source: -1,
+        torch.index_put: lambda input, indices, values, accumulate=False: -1,
+        torch.index_select: lambda input, dim, index, out=None: -1,
+        torch.index_fill: lambda input, dim, index, value: -1,
+        torch.index_reduce: lambda input, dim, index, source, reduce, include_input=True: -1,
+        torch.isfinite: lambda tensor: -1,
+        torch.isin: lambda e, te, assume_unique=False, invert=False: -1,
+        torch.isinf: lambda tensor: -1,
+        torch.isreal: lambda tensor: -1,
+        torch.isposinf: lambda input, out=None: -1,
+        torch.isneginf: lambda input, out=None: -1,
+        torch.instance_norm: (lambda input, running_mean, running_var, weight, bias, use_input_stats, momentum, eps,
+                              cudnn_enabled: -1),
+        torch.int_repr: lambda input: -1,
+        torch.inverse: lambda input, out=None: -1,
+        torch.linalg.inv: lambda input, out=None: -1,
+        torch.linalg.inv_ex: lambda input, check_errors=False, out=None: -1,
+        torch.is_complex: lambda input: -1,
+        torch.is_conj: lambda input: -1,
+        torch.is_neg: lambda input: -1,
+        torch.is_distributed: lambda input: -1,
+        torch.is_inference: lambda input: -1,
+        torch.is_floating_point: lambda input: -1,
+        torch.is_nonzero: lambda input: -1,
+        torch.is_same_size: lambda input, other: -1,
+        torch.is_signed: lambda input: -1,
+        torch.isclose: lambda input, other, rtol=1e-05, atol=1e-08, equal_nan=False: -1,
+        torch.isnan: lambda input: -1,
+        torch.istft: (lambda input, n_fft, hop_length=None, win_length=None, window=None, center=True,
+                      normalized=False, onesided=None, length=None, return_complex=False: -1),
+        torch.kl_div: lambda input, target, size_average=None, reduce=None, reduction='mean', log_target=False: -1,
+        torch.kron: lambda input, other: -1,
+        torch.kthvalue: lambda input, k, dim=None, keepdim=False, out=None: -1,
+        torch.linalg.ldl_factor_ex: lambda input, hermitian=False, check_errors=False, out=None: -1,
+        torch.linalg.ldl_factor: lambda input, hermitian=False, out=None: -1,
+        torch.linalg.ldl_solve: lambda LD, pivots, B, hermitian=False, out=None: -1,
+        torch.layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05, cudnn_enabled=True: -1,
+        torch.lcm: lambda input, other, out=None: -1,
+        torch.ldexp: lambda input, other, out=None: -1,
+        torch.le: lambda input, other, out=None: -1,
+        torch.less_equal: lambda input, other, out=None: -1,
+        torch.lerp: lambda input, end, weight, out=None: -1,
+        torch.lgamma: lambda input, out=None: -1,
+        torch.lobpcg: lambda input, k=None, B=None, X=None, n=None, iK=None, niter=None, tol=None, largest=None, method=None,
+        tracker=None, ortho_iparams=None, ortho_fparams=None, ortho_bparams=None: -1,
+        torch.log: lambda input, out=None: -1,
+        torch.log_softmax: lambda input, dim, dtype=None: -1,
+        torch.log10: lambda input, out=None: -1,
+        torch.log1p: lambda input, out=None: -1,
+        torch.log2: lambda input, out=None: -1,
+        torch.logaddexp: lambda input, other, out=None: -1,
+        torch.logaddexp2: lambda input, other, out=None: -1,
+        torch.logdet: lambda input: -1,
+        torch.xlogy: lambda x, y, out=None: -1,
+        torch.logical_and: lambda input, other, out=None: -1,
+        torch.logical_not: lambda input, out=None: -1,
+        torch.logical_or: lambda input, other, out=None: -1,
+        torch.logical_xor: lambda input, other, out=None: -1,
+        torch.logit: lambda input, eps=None: -1,
+        torch.logsumexp: lambda input, names, keepdim=False, out=None: -1,
+        torch.lstm: lambda data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional: -1,
+        torch.lstm_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1,
+        torch.lt: lambda input, other, out=None: -1,
+        torch.less: lambda input, other, out=None: -1,
+        torch.lu: lambda A, pivot=True, get_infos=False, out=None: -1,
+        torch.lu_solve: lambda b, LU_data, LU_pivots, out=None: -1,
+        torch.margin_ranking_loss: lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1,  # type: ignore[attr-defined]  # noqa: B950
+        torch.masked_fill: lambda input, mask, value: -1,
+        torch.masked_scatter: lambda input, mask, source: -1,
+        torch.masked_select: lambda input, mask, out=None: -1,
+        torch.matmul: lambda input, other, out=None: -1,
+        torch.linalg.lu: lambda input, pivot=True, out=None: -1,
+        torch.linalg.lu_factor: lambda input, pivot=True, out=None: -1,
+        torch.linalg.lu_factor_ex: lambda input, pivot=True, check_errors=False, out=None: -1,
+        torch.linalg.lu_solve: lambda LU, pivots, B, left=True, adjoint=False, out=None: -1,
+        torch.linalg.matmul: lambda input, other, out=None: -1,  # alias for torch.matmul
+        torch.matrix_power: lambda input, n: -1,
+        torch.linalg.matrix_power: lambda input, n, out=None: -1,
+        torch.linalg.matrix_rank: lambda input, tol=None, hermitian=False: -1,
+        torch.linalg.multi_dot: lambda tensors, out=None: -1,
+        torch.matrix_exp: lambda input: -1,
+        torch.linalg.matrix_exp: lambda input: -1,
+        torch.max: lambda input, out=None: -1,
+        torch.maximum: lambda input, other, out=None: -1,
+        torch.fmax: lambda input, other, out=None: -1,
+        torch.max_pool1d: lambda input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False: -1,
+        torch.max_pool2d: lambda input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False: -1,
+        torch.max_pool3d: lambda input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False: -1,
+        torch.max_pool1d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                        return_indices=False, ceil_mode=False: -1),
+        torch.mean: lambda input, dim=None: -1,
+        torch.nanmean: lambda input, dim=None, keepdim=False, dtype=None, out=None: -1,
+        torch.median: lambda input, dim=None: -1,
+        torch.nanmedian: lambda input, dim=None: -1,
+        torch.meshgrid: lambda *tensors, **kwargs: -1,
+        torch.min: lambda input, out=None: -1,
+        torch.minimum: lambda input, other, out=None: -1,
+        torch.fmin: lambda input, other, out=None: -1,
+        torch.miopen_batch_norm: (lambda input, weight, bias, running_mean, running_var, training,
+                                  exponential_average_factor, epsilon: -1),
+        torch.miopen_convolution: lambda input, weight, bias, padding, stride, dilation, groups, benchmark, deterministic: -1,
+        torch.miopen_convolution_add_relu: lambda input, weight, z, alpha, bias, stride, padding, dilation, groups: -1,
+        torch.miopen_convolution_relu: lambda input, weight, bias, stride, padding, dilation, groups: -1,
+        torch.miopen_convolution_transpose: (lambda input, weight, bias, padding, output_padding, stride, dilation,
+                                             groups, benchmark, deterministic: -1),
+        torch.miopen_depthwise_convolution: (lambda input, weight, bias, padding, stride, dilation, groups, benchmark,
+                                             deterministic: -1),
+        torch.miopen_rnn: (lambda input, weight, weight_stride0, hx, cx, mode, hidden_size, num_layers, batch_first,
+                           dropout, train, bidirectional, batch_sizes, dropout_state: -1),
+        torch.mm: lambda input, mat2, out=None: -1,
+        torch.mode: lambda input, dim=-1, keepdim=False, out=None: -1,
+        torch.movedim: lambda input, source, destination: -1,
+        torch.moveaxis: lambda input, source, destination: -1,
+        torch.msort: lambda input, descending=False, out=None: -1,
+        torch.mul: lambda input, other, out=None: -1,
+        torch.multiply: lambda input, other, out=None: -1,
+        torch.multinomial: lambda input, num_samples, replacement=False, out=None: -1,
+        torch.mv: lambda input, vec, out=None: -1,
+        torch.mvlgamma: lambda input, p: -1,
+        torch.narrow: lambda input, dim, start, length: -1,
+        torch.nan_to_num: lambda input, nan=0.0, posinf=None, neginf=None, out=None: -1,
+        torch.native_batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps: -1,
+        torch._native_batch_norm_legit: lambda input, weight, bias, training, momentum, eps: -1,
+        torch.native_dropout: lambda input, p, train: -1,
+        torch.native_layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1,
+        torch.native_group_norm: lambda input, weight, bias, N, C, HxW, group, eps: -1,
+        torch.native_norm: lambda input, p=2, dim=None, keepdim=False, dtype=None: -1,
+        torch.native_channel_shuffle: lambda input, groups : -1,
+        torch.ne: lambda input, other, out=None: -1,
+        torch.not_equal: lambda input, other, out=None: -1,
+        torch.neg: lambda input, out=None: -1,
+        torch.negative: lambda input, out=None: -1,
+        torch.nextafter: lambda input, other, out=None: -1,
+        torch.nn.functional.adaptive_avg_pool2d: lambda input, output_size: -1,
+        torch.nn.functional.adaptive_avg_pool3d: lambda input, output_size: -1,
+        torch.nn.functional.adaptive_max_pool1d: lambda input, output_size, return_indices=False: -1,
+        torch.nn.functional.adaptive_max_pool1d_with_indices: lambda input, output_size, return_indices=False: -1,
+        torch.nn.functional.adaptive_max_pool2d: lambda input, output_size, return_indices=False: -1,
+        torch.nn.functional.adaptive_max_pool2d_with_indices: lambda input, output_size, return_indices=False: -1,
+        torch.nn.functional.adaptive_max_pool3d: lambda input, output_size, return_indices=False: -1,
+        torch.nn.functional.adaptive_max_pool3d_with_indices: lambda input, output_size, return_indices=False: -1,
+        torch.nn.functional.affine_grid: lambda theta, size, align_corners=None: -1,
+        torch.nn.functional.alpha_dropout: lambda input, p=0.5, training=False, inplace=False: -1,
+        torch.nn.functional.avg_pool2d: (lambda input, kernel_size, stride=None, padding=0, ceil_mode=False,
+                                         count_include_pad=True, divisor_override=None: -1),
+        torch.nn.functional.avg_pool3d: (lambda input, kernel_size, stride=None, padding=0, ceil_mode=False,
+                                         count_include_pad=True, divisor_override=None: -1),
+        torch.nn.functional.batch_norm: (lambda input, running_mean, running_var, weight=None, bias=None, training=False,
+                                         momentum=0.1, eps=1e-05: -1),
+        torch.nn.functional.bilinear: lambda input1, input2, weight, bias=None: -1,
+        torch.nn.functional.binary_cross_entropy: (lambda input, target, weight=None, size_average=None, reduce=None,
+                                                   reduction="mean": -1),
+        torch.nn.functional.binary_cross_entropy_with_logits: (lambda input, target, weight=None, size_average=None,
+                                                               reduce=None, reduction="mean", pos_weight=None: -1),
+        torch.nn.functional.celu: lambda input, alpha=1.0, inplace=False: -1,
+        torch.nn.functional.cosine_embedding_loss: (lambda input1, input2, target, margin=0, size_average=None,
+                                                    reduce=None, reduction='mean': -1),
+        torch.nn.functional.cross_entropy: (lambda input, target, weight=None, size_average=None, ignore_index=-100,
+                                            reduce=None, reduction="mean", label_smoothing=0.0: -1),
+        torch.nn.functional.ctc_loss: (lambda log_probs, targets, input_lengths, target_lengths, blank=0,
+                                       reduction='mean', zero_infinity=False: -1),
+        torch.nn.functional.dropout: lambda input, p=0.5, training=True, inplace=False: -1,
+        torch.nn.functional.dropout1d: lambda input, p=0.5, training=True, inplace=False: -1,
+        torch.nn.functional.dropout2d: lambda input, p=0.5, training=True, inplace=False: -1,
+        torch.nn.functional.dropout3d: lambda input, p=0.5, training=True, inplace=False: -1,
+        torch.nn.functional.elu: lambda input, alpha=1.0, inplace=False: -1,
+        torch.nn.functional.embedding: (lambda input, weight, padding_idx=None, max_norm=None, norm_type=2.0,
+                                        scale_grad_by_freq=False, sparse=False: -1),
+        torch.nn.functional.embedding_bag: (lambda input, weight, offsets=None, max_norm=None, norm_type=2,
+                                            scale_grad_by_freq=False, mode='mean', sparse=False, per_sample_weights=None,
+                                            include_last_offset=False, padding_idx=None: -1),
+        torch.nn.functional.feature_alpha_dropout: lambda input, p=0.5, training=False, inplace=False: -1,
+        torch.nn.functional.fold: lambda input, output_size, kernel_size, dilation=1, padding=0, stride=1: -1,
+        torch.nn.functional.fractional_max_pool2d: (lambda input, kernel_size, output_size=None, output_ratio=None,
+                                                    return_indices=False, _random_samples=None: -1),
+        torch.nn.functional.fractional_max_pool2d_with_indices: (
+            lambda input, kernel_size, output_size=None, output_ratio=None, return_indices=False,
+            _random_samples=None: -1),
+        torch.nn.functional.fractional_max_pool3d: (lambda input, kernel_size, output_size=None, output_ratio=None,
+                                                    return_indices=False, _random_samples=None: -1),
+        torch.nn.functional.fractional_max_pool3d_with_indices: (
+            lambda input, kernel_size, output_size=None, output_ratio=None, return_indices=False,
+            _random_samples=None: -1),
+        torch.nn.functional.gaussian_nll_loss: lambda input, target, var, full=False, eps=1e-06, reduction='mean': -1,
+        torch.nn.functional.gelu: lambda input, approximate='none': -1,
+        torch.nn.functional.glu: lambda input, dim=-1: -1,
+        torch.nn.functional.grid_sample: lambda input, grid, mode='bilinear', padding_mode='zeros', align_corners=None: -1,
+        torch.nn.functional.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05: -1,
+        torch.nn.functional.gumbel_softmax: lambda logits, tau=1, hard=False, eps=1e-10, dim=-1: -1,
+        torch.nn.functional.hardshrink: lambda input, lambd=0.5: -1,
+        torch.nn.functional.hardtanh: lambda input, min_val=-1., max_val=1., inplace=False: -1,
+        torch.nn.functional.hinge_embedding_loss: (lambda input, target, margin=1.0, size_average=None, reduce=None,
+                                                   reduction='mean': -1),
+        torch.nn.functional.instance_norm: (lambda input, running_mean=None, running_var=None, weight=None, bias=None,
+                                            use_input_stats=True, momentum=0.1, eps=1e-05: -1),
+        torch.nn.functional.interpolate: (lambda input, size=None, scale_factor=None, mode='nearest', align_corners=None,
+                                          recompute_scale_factor=None, antialias=False: -1),
+        torch.nn.functional.kl_div: lambda input, target, size_average=None, reduce=None, reduction='mean', log_target=False: -1,
+        torch.nn.functional.l1_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1,
+        torch.nn.functional.layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1,
+        torch.nn.functional.leaky_relu: lambda input, negative_slope=0.01, inplace=False: -1,
+        torch.nn.functional.linear: lambda input, weight, bias=None: -1,
+        torch.nn.functional.local_response_norm: lambda input, size, alpha=0.0001, beta=0.75, k=1.0: -1,
+        torch.nn.functional.log_softmax: lambda input, dim=None, _stacklevel=3, dtype=None: -1,
+        torch.nn.functional.logsigmoid: lambda input: -1,
+        torch.nn.functional.lp_pool1d: lambda input, norm_type, kernel_size, stride=None, ceil_mode=False: -1,
+        torch.nn.functional.lp_pool2d: lambda input, norm_type, kernel_size, stride=None, ceil_mode=False: -1,
+        torch.nn.functional.lp_pool3d: lambda input, norm_type, kernel_size, stride=None, ceil_mode=False: -1,
+        torch.nn.functional.margin_ranking_loss: (lambda input1, input2, target, margin=0, size_average=None,
+                                                  reduce=None, reduction='mean': -1),
+        torch.nn.functional.max_pool1d: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                         ceil_mode=False, return_indices=False: -1),
+        torch.nn.functional.max_pool1d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                                      return_indices=False, ceil_mode=False: -1),
+        torch.nn.functional.max_pool2d: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                         ceil_mode=False, return_indices=False: -1),
+        torch.nn.functional.max_pool2d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                                      return_indices=False, ceil_mode=False: -1),
+        torch.nn.functional.max_pool3d: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                         return_indices=False, ceil_mode=False: -1),
+        torch.nn.functional.max_pool3d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1,
+                                                      return_indices=False, ceil_mode=False: -1),
+        torch.nn.functional.max_unpool1d: lambda input, indices, kernel_size, stride=None, padding=0, output_size=None: -1,
+        torch.nn.functional.max_unpool2d: lambda input, indices, kernel_size, stride=None, padding=0, output_size=None: -1,
+        torch.nn.functional.max_unpool3d: lambda input, indices, kernel_size, stride=None, padding=0, output_size=None: -1,
+        torch.nn.functional.mse_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1,
+        torch.nn.functional.multi_head_attention_forward: (
+            lambda query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v,
+            add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training=True, key_padding_mask=None,
+            need_weights=True, attn_mask=None, use_separate_proj_weight=False, q_proj_weight=None, k_proj_weight=None,
+            v_proj_weight=None, static_k=None, static_v=None, average_attn_weights=None, is_causal=False: -1),
+        torch.nn.functional.multi_margin_loss: (lambda input, target, p=1, margin=1.0, weight=None, size_average=None,
+                                                reduce=None, reduction='mean': -1),
+        torch.nn.functional.multilabel_margin_loss: (lambda input, target, size_average=None, reduce=None,
+                                                     reduction='mean': -1),
+        torch.nn.functional.multilabel_soft_margin_loss: (lambda input, target, weight=None, size_average=None,
+                                                          reduce=None, reduction='mean': -1),
+        torch.nn.functional.nll_loss: (lambda input, target, weight=None, size_average=None, ignore_index=-100,
+                                       reduce=None, reduction='mean': -1),
+        torch.nn.functional.normalize: lambda input, p=2, dim=1, eps=1e-12, out=None: -1,
+        torch.nn.functional.one_hot: lambda tensor, num_classes=-1: -1,
+        torch.nn.functional.pad: lambda input, pad, mode='constant', value=0: -1,
+        torch.nn.functional.pairwise_distance: lambda x1, x2, p=2.0, eps=1e-06, keepdim=False: -1,
+        torch.nn.functional.poisson_nll_loss: (lambda input, target, log_input=True, full=False, size_average=None,
+                                               eps=1e-08, reduce=None, reduction='mean': -1),
+        torch.nn.functional.prelu: lambda input, weight: -1,
+        torch.nn.functional.relu: lambda input, inplace=False: -1,
+        torch.nn.functional.relu6: lambda input, inplace=False: -1,
+        torch.nn.functional.rrelu: lambda input, lower=0.125, upper=0.3333333333333333, training=False, inplace=False: -1,
+        torch.nn.functional.selu: lambda input, inplace=False: -1,
+        torch.nn.functional.silu: lambda input, inplace=False: -1,
+        torch.nn.functional.mish: lambda input, inplace=False: -1,
+        torch.nn.functional.scaled_dot_product_attention: lambda query, key, value, attn_mask=None, dropout_p=0.0: -1,
+        torch.nn.functional.smooth_l1_loss: lambda input, target, size_average=None, reduce=None, reduction='mean', beta=1.: -1,
+        torch.nn.functional.huber_loss: lambda input, target, reduction='mean', delta=1.: -1,
+        torch.nn.functional.soft_margin_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1,
+        torch.nn.functional.softmax: lambda input, dim=None, _stacklevel=3, dtype=None: -1,
+        torch.nn.functional.softmin: lambda input, dim=None, _stacklevel=3, dtype=None: -1,
+        torch.nn.functional.softplus: lambda input, beta=1, threshold=20: -1,
+        torch.nn.functional.softshrink: lambda input, lambd=0.5: -1,
+        torch.nn.functional.softsign: lambda input: -1,
+        torch.nn.functional.tanhshrink: lambda input: -1,
+        torch.nn.functional.threshold: lambda input, threshold, value, inplace=False: -1,
+        torch.nn.functional.triplet_margin_loss: (lambda anchor, positive, negative, margin=1.0, p=2, eps=1e-06,
+                                                  swap=False, size_average=None, reduce=None, reduction='mean': -1),
+        torch.nn.functional.triplet_margin_with_distance_loss: (lambda anchor, positive, negative, *,
+                                                                distance_function=None, margin=1.0,
+                                                                swap=False, reduction='mean': -1),
+        torch.nn.functional.unfold: lambda input, kernel_size, dilation=1, padding=0, stride=1: -1,
+        torch.nn.init.uniform_: lambda tensor, a=0., b=1., generator=None: -1,
+        torch.nn.init.normal_: lambda tensor, mean=0., std=1., generator=None: -1,
+        torch.nn.init.constant_: lambda tensor, val: -1,
+        torch.nn.init.kaiming_uniform_: lambda tensor, a=0, mode='fan_in', nonlinearity='leaky_relu', generator=None: -1,
+        torch.nonzero: lambda input, as_tuple=False: -1,
+        torch.nonzero_static: lambda input, *, size, fill_value=-1: -1,
+        torch.argwhere: lambda input: -1,
+        torch.norm: lambda input, p='fro', dim=None, keepdim=False, out=None, dtype=None: -1,
+        torch.linalg.norm: lambda input, ord=None, dim=None, keepdim=False, out=None, dtype=None: -1,
+        torch.linalg.vector_norm: lambda input, ord=2, dim=None, keepdim=False, out=None, dtype=None: -1,
+        torch.linalg.matrix_norm: lambda input, ord='fro', dim=(-2, -1), keepdim=False, out=None, dtype=None: -1,
+        torch.norm_except_dim: lambda v, pow=2, dim=0: -1,
+        torch.nuclear_norm: lambda input, p='fro', dim=None, keepdim=False, out=None, dtype=None: -1,
+        torch.numel: lambda input: -1,
+        torch.orgqr: lambda input, tau: -1,
+        torch.ormqr: lambda input, input2, input3, left=True, transpose=False: -1,
+        torch.pairwise_distance: lambda x1, x2, p=2.0, eps=1e-06, keepdim=False: -1,
+        torch.permute: lambda self, dim: -1,
+        torch.pca_lowrank: lambda input, q=None, center=True, niter=2: -1,
+        torch.pdist: lambda input, p=2: -1,
+        torch.pinverse: lambda input, rcond=1e-15: -1,
+        torch.linalg.pinv: lambda input, rcond=1e-15, hermitian=False: -1,
+        torch.pixel_shuffle: lambda input, upscale_factor: -1,
+        torch.pixel_unshuffle: lambda input, downscale_factor: -1,
+        torch.poisson: lambda input, generator=None: -1,
+        torch.poisson_nll_loss: lambda input, target, log_input, full, eps, reduction: -1,
+        torch.polygamma: lambda input, n, out=None: -1,
+        torch.positive: lambda input, out=None: -1,
+        torch.prelu: lambda input, weight: -1,
+        torch.ones_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1,
+        torch.pow: lambda input, exponent, out=None: -1,
+        torch.prod: lambda input, dtype=None: -1,
+        torch.put: lambda input, index, source, accumulate=False: -1,
+        torch.q_per_channel_axis: lambda input: -1,
+        torch.q_per_channel_scales: lambda input: -1,
+        torch.q_per_channel_zero_points: lambda input: -1,
+        torch.q_scale: lambda input: -1,
+        torch.q_zero_point: lambda input: -1,
+        torch.qr: lambda input, some=True, out=None: -1,
+        torch.linalg.qr: lambda input, mode='reduced', out=None: -1,
+        torch.quantile: lambda input, q, dim=None, keepdim=False, interpolation='linear', out=None: -1,
+        torch.nanquantile: lambda input, q, dim=None, keepdim=False, interpolation='linear', out=None: -1,
+        torch.quantize_per_channel: lambda input, scales, zero_points, axis, dtype: -1,
+        torch.quantize_per_tensor: lambda input, scale, zero_point, dtype: -1,
+        torch.quantize_per_tensor_dynamic: lambda input, dtype, reduce_range: -1,
+        torch.quantized_batch_norm: lambda input, weight, bias, mean, var, eps, output_scale, output_zero_point: -1,
+        torch.quantized_gru_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih,
+                                   col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1),
+
+        torch.quantized_lstm_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih,
+                                    col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1),
+        torch.quantized_max_pool1d: (lambda input, kernel_size, stride=tuple(), padding=(0,),
+                                     dilation=(1,), ceil_mode=False: -1),
+        torch.quantized_max_pool2d: (lambda input, kernel_size, stride=tuple(), padding=(0, 0),
+                                     dilation=(1, 1), ceil_mode=False: -1),
+        torch.quantized_max_pool3d: (lambda input, kernel_size, stride=tuple(), padding=(0, 0, 0),
+                                     dilation=(1, 1, 1), ceil_mode=False: -1),
+        torch.quantized_rnn_relu_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih,
+                                        col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1),
+        torch.quantized_rnn_tanh_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih,
+                                        col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1),
+        torch.rad2deg: lambda input, out=None: -1,
+        torch.rand_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1,
+        torch.randint_like: lambda input, high, dtype=None, layout=torch.strided, device=None, requires_grad=False: -1,
+        torch.randn_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1,
+        torch.ravel: lambda input: -1,
+        torch.real: lambda input, out=None: -1,
+        torch.vdot: lambda input, other, out=None: -1,
+        torch.linalg.vecdot: lambda input, other, dim=-1, out=None: -1,
+        torch.view_as_real: lambda input: -1,
+        torch.view_as_complex: lambda input: -1,
+        torch.reciprocal: lambda input, out=None: -1,
+        torch.relu: lambda input, inplace=False: -1,
+        torch.remainder: lambda input, other, out=None: -1,
+        torch.renorm: lambda input, p, dim, maxnorm, out=None: -1,
+        torch.repeat_interleave: lambda input, dim=None: -1,
+        torch.reshape: lambda input, shape: -1,
+        torch.rnn_relu: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1,
+        torch.rnn_relu_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1,
+        torch.rnn_tanh: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1,
+        torch.rnn_tanh_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1,
+        torch.roll: lambda input, shifts, dims=None: -1,
+        torch.rot90: lambda input, k=1, dims=(0, 1): -1,
+        torch.round: lambda input, out=None: -1,
+        torch.row_stack: lambda tensors, out=None: -1,  # alias for torch.vstack
+        torch._rowwise_prune: (lambda weight, mask, compressed_indices_dtype: -1),
+        torch.rrelu: lambda input, lower=1. / 8, upper=1. / 3, training=False, inplace=False: -1,
+        torch.rsqrt: lambda input, out=None: -1,
+        torch.rsub: lambda input, other, alpha=1: -1,
+        torch.saddmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1,
+        torch.scatter: lambda input, dim, index, src: -1,
+        torch.scatter_add: lambda input, dim, index, src: -1,
+        torch.scatter_reduce: lambda input, dim, index, src, reduce, include_self=True: -1,
+        torch.searchsorted: lambda sorted_sequence, input, out_int32=False, right=False, out=None: -1,
+        torch._segment_reduce: lambda data, reduce="max", lengths=None, indices=None, offsets=None, axis=0, unsafe=False: -1,
+        torch.select: lambda input, dim, index: -1,
+        torch.select_scatter: lambda input, src, dim, index: -1,
+        torch.slice_inverse: lambda input, src, dim=0, start=None, end=None, step=1: -1,
+        torch.slice_scatter: lambda input, src, dim=0, start=None, end=None, step=1: -1,
+        torch.selu: lambda input, inplace=False: -1,
+        torch.sigmoid: lambda input, out=None: -1,
+        torch.sign: lambda input, out=None: -1,
+        torch.signbit: lambda input, out=None: -1,
+        torch.sgn: lambda input, out=None: -1,
+        torch.sin: lambda input, out=None: -1,
+        torch.sinc: lambda input, out=None: -1,
+        torch.sinh: lambda input, out=None: -1,
+        torch.slogdet: lambda input: -1,
+        torch.linalg.slogdet: lambda input: -1,
+        torch.smm: lambda input, mat2: -1,
+        torch.spmm: lambda input, mat2: -1,
+        torch.softmax: lambda input, dim, dtype=None: -1,
+        torch.linalg.solve: lambda A, B, left=True, out=None: -1,
+        torch.linalg.solve_ex: lambda A, B, left=True, check_errors=False, out=None: -1,
+        torch.sort: lambda input, dim=-1, descending=False, *, stable=False, out=None: -1,
+        torch.split: lambda tensor, split_size_or_sections, dim=0: -1,
+        torch.split_with_sizes: lambda tensor, split_size_or_sections, dim=0: -1,
+        torch.sqrt: lambda input, out=None: -1,
+        torch.square: lambda input, out=None: -1,
+        torch.squeeze: lambda input, dim=None, out=None: -1,
+        torch.sspaddmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1,
+        torch.stack: lambda tensors, dim=0, out=None: -1,
+        torch.std: lambda input, dim=None: -1,
+        torch.std_mean: lambda input, dim=None: -1,
+        torch.stft: (lambda input, n_fft, hop_length=None, win_length=None, window=None, center=True,
+                     pad_mode='reflect', normalized=False, onesided=True, return_complex=None: -1),
+        torch.sub: lambda input, other, out=None: -1,
+        torch.subtract: lambda input, other, out=None: -1,
+        torch.sum: lambda input, dim=None: -1,
+        torch.sym_float: lambda input: -1,
+        torch.sym_int: lambda input: -1,
+        torch.sym_max: lambda a, b: -1,
+        torch.sym_min: lambda a, b: -1,
+        torch.sym_not: lambda input: -1,
+        torch.sym_ite: lambda a, b, c: -1,
+        torch._sym_sqrt: lambda input: -1,
+        torch._sym_cos: lambda input: -1,
+        torch._sym_cosh: lambda input: -1,
+        torch._sym_sin: lambda input: -1,
+        torch._sym_sinh: lambda input: -1,
+        torch._sym_tan: lambda input: -1,
+        torch._sym_tanh: lambda input: -1,
+        torch._sym_asin: lambda input: -1,
+        torch._sym_acos: lambda input: -1,
+        torch._sym_atan: lambda input: -1,
+        torch.nansum: lambda input, dim=None: -1,
+        torch.svd: lambda input, some=True, compute_uv=True, out=None: -1,
+        torch.svd_lowrank: lambda input, q=6, niter=2, M=None: -1,
+        torch.linalg.svd: lambda input, full_matrices=True, out=None: -1,
+        torch.linalg.svdvals: lambda input, out=None: -1,
+        torch.swapaxes: lambda input, dim0, dim1: -1,
+        torch.swapdims: lambda input, axis0, axis1: -1,
+        torch.special.airy_ai: lambda input: -1,
+        torch.special.bessel_j0: lambda input: -1,
+        torch.special.bessel_j1: lambda input: -1,
+        torch.special.bessel_y0: lambda input: -1,
+        torch.special.bessel_y1: lambda input: -1,
+        torch.special.chebyshev_polynomial_t: lambda input, n, out=None: -1,
+        torch.special.chebyshev_polynomial_u: lambda input, n, out=None: -1,
+        torch.special.chebyshev_polynomial_v: lambda input, n, out=None: -1,
+        torch.special.chebyshev_polynomial_w: lambda input, n, out=None: -1,
+        torch.special.digamma: lambda input: -1,
+        torch.special.entr: lambda input: -1,
+        torch.special.erf: lambda input: -1,
+        torch.special.erfc: lambda input: -1,
+        torch.special.erfcx: lambda input: -1,
+        torch.special.erfinv: lambda input: -1,
+        torch.special.exp2: lambda input: -1,
+        torch.special.expit: lambda input: -1,
+        torch.special.expm1: lambda input: -1,
+        torch.special.gammainc: lambda input, other, out=None: -1,
+        torch.special.gammaincc: lambda input, other, out=None: -1,
+        torch.special.gammaln: lambda input: -1,
+        torch.special.hermite_polynomial_h: lambda input, n, out=None: -1,
+        torch.special.hermite_polynomial_he: lambda input, n, out=None: -1,
+        torch.special.i0: lambda input: -1,
+        torch.special.i0e: lambda input: -1,
+        torch.special.i1: lambda input: -1,
+        torch.special.i1e: lambda input: -1,
+        torch.special.laguerre_polynomial_l: lambda input, n, out=None: -1,
+        torch.special.legendre_polynomial_p: lambda input, n, out=None: -1,
+        torch.special.log1p: lambda input: -1,
+        torch.special.log_ndtr: lambda input: -1,
+        torch.special.log_softmax: lambda input, dim, dtype=None: -1,
+        torch.special.logit: lambda input: -1,
+        torch.special.logsumexp: lambda input, dim, keepdim=False, out=None: -1,
+        torch.special.modified_bessel_i0: lambda input: -1,
+        torch.special.modified_bessel_i1: lambda input: -1,
+        torch.special.modified_bessel_k0: lambda input: -1,
+        torch.special.modified_bessel_k1: lambda input: -1,
+        torch.special.multigammaln: lambda input, p: -1,
+        torch.special.ndtr: lambda input: -1,
+        torch.special.ndtri: lambda input: -1,
+        torch.special.polygamma: lambda input, n, out=None: -1,
+        torch.special.psi: lambda input: -1,
+        torch.special.round: lambda input: -1,
+        torch.special.scaled_modified_bessel_k0: lambda input: -1,
+        torch.special.scaled_modified_bessel_k1: lambda input: -1,
+        torch.special.shifted_chebyshev_polynomial_t: lambda input, n, out=None: -1,
+        torch.special.shifted_chebyshev_polynomial_u: lambda input, n, out=None: -1,
+        torch.special.shifted_chebyshev_polynomial_v: lambda input, n, out=None: -1,
+        torch.special.shifted_chebyshev_polynomial_w: lambda input, n, out=None: -1,
+        torch.special.sinc: lambda input: -1,
+        torch.special.softmax: lambda input, dim, dtype=None: -1,
+        torch.special.spherical_bessel_j0: lambda input: -1,
+        torch.special.xlog1py: lambda input, other, out=None: -1,
+        torch.special.xlogy: lambda input, other, out=None: -1,
+        torch.special.zeta: lambda self, other, out=None: -1,
+        torch.t: lambda input: -1,
+        torch.take: lambda input, index: -1,
+        torch.take_along_dim: lambda input, indices, dim=None, out=None: -1,
+        torch.tan: lambda input, out=None: -1,
+        torch.tanh: lambda input, out=None: -1,
+        torch.linalg.tensorinv: lambda a, ind=2: -1,
+        torch.linalg.tensorsolve: lambda a, b, dims=None: -1,
+        torch.tensordot: lambda a, b, dims=2, out=None: -1,
+        torch.tensor_split: lambda input, indices_or_sections, dim=0: -1,
+        torch.threshold: lambda input, threshold, value, inplace=False: -1,
+        torch.tile: lambda input, dims: -1,
+        torch.topk: lambda input, k, dim=-1, descending=False, out=None: -1,
+        torch.trace: lambda input: -1,
+        torch.transpose: lambda input, dim0, dim1: -1,
+        torch.trapz: lambda y, x=None, dim=-1: -1,
+        torch.trapezoid: lambda y, x=None, dim=-1: -1,
+        torch.triangular_solve: lambda input, A, upper=True, transpose=False, unitriangular=False: -1,
+        torch.linalg.solve_triangular: lambda input, B, upper, left=True, unitriangular=False: -1,
+        torch.tril: lambda input, diagonal=0, out=None: -1,
+        torch.triplet_margin_loss: (lambda anchor, positive, negative, margin=1.0, p=2, eps=1e-06, swap=False,
+
+                                    size_average=None, reduce=None, reduction='mean': -1),
+        torch.triu: lambda input, diagonal=0, out=None: -1,
+        torch.true_divide: lambda input, other: -1,
+        torch.trunc: lambda input, out=None: -1,
+        torch.unbind: lambda input, dim=0: -1,
+        torch.unflatten: lambda input, dim, sizes, names: -1,
+        torch.unique: lambda input, sorted=True, return_inverse=False, return_counts=False, dim=None: -1,
+        torch.unique_consecutive: lambda input, return_inverse=False, return_counts=False, dim=None: -1,
+        torch.unravel_index: lambda indices, shape: -1,
+        torch.unsafe_chunk: lambda input, chunks, dim=0: -1,
+        torch.unsafe_split: lambda tensor, split_size_or_sections, dim=0: -1,
+        torch.unsafe_split_with_sizes: lambda tensor, split_size_or_sections, dim=0: -1,
+        torch.unsqueeze: lambda input, dim, out=None: -1,
+        torch.linalg.vander: lambda x, N=None: -1,
+        torch.var: lambda input, dim=None: -1,
+        torch.var_mean: lambda input, dim=None: -1,
+        torch.vsplit: lambda input, indices_or_sections: -1,
+        torch.vstack: lambda tensors, out=None: -1,
+        torch.where: lambda condition, x=None, y=None: -1,
+        torch.zeros_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1,
+        torch._fw_primal_copy: lambda self, level: -1,
+        torch._make_dual_copy: lambda primal, tangent, level: -1,
+        torch.view_as_real_copy: lambda self: -1,
+        torch.view_as_complex_copy: lambda self: -1,
+        torch._conj_copy: lambda self: -1,
+        torch._neg_view_copy: lambda self: -1,
+        torch.as_strided_copy: lambda self, size, stride, storage_offset=None: -1,
+        torch._sparse_broadcast_to_copy: lambda self, size: -1,
+        torch.diagonal_copy: lambda self, offset=0, dim1=0, dim2=1: -1,
+        torch.expand_copy: lambda self, size, *, implicit=False: -1,
+        torch.narrow_copy: lambda self, dim, start, length: -1,
+        torch.permute_copy: lambda self, dims: -1,
+        torch._reshape_alias_copy: lambda self, size, stride: -1,
+        torch.select_copy: lambda self, dim, index: -1,
+        torch.detach_copy: lambda self: -1,
+        torch.slice_copy: lambda self, dim=0, start=None, end=None, step=1: -1,
+        torch.split_copy: lambda self, split_size, dim=0: -1,
+        torch.split_with_sizes_copy: lambda self, split_sizes, dim=0: -1,
+        torch.squeeze_copy: lambda self, dim: -1,
+        torch.t_copy: lambda self: -1,
+        torch.transpose_copy: lambda self, dim0, dim1: -1,
+        torch.unsqueeze_copy: lambda self, dim: -1,
+        torch._indices_copy: lambda self: -1,
+        torch._values_copy: lambda self: -1,
+        torch.indices_copy: lambda self: -1,
+        torch.values_copy: lambda self: -1,
+        torch.crow_indices_copy: lambda self: -1,
+        torch.col_indices_copy: lambda self: -1,
+        torch.ccol_indices_copy: lambda self: -1,
+        torch.row_indices_copy: lambda self: -1,
+        torch.unbind_copy: lambda self, dim=0: -1,
+        torch.view_copy: lambda self, dtype: -1,
+        torch.unfold_copy: lambda self, dimension, size, step: -1,
+        torch.alias_copy: lambda self: -1,
+        Tensor.__floordiv__: lambda self, other: -1,
+        Tensor.__rfloordiv__: lambda self, other: -1,
+        Tensor.__ifloordiv__: lambda self, other: -1,
+        Tensor.__truediv__: lambda self, other: -1,
+        Tensor.__rtruediv__: lambda self, other: -1,
+        Tensor.__itruediv__: lambda self, other: -1,
+        Tensor.__lshift__: lambda self, other: -1,
+        Tensor.__rlshift__: lambda self, other: -1,
+        Tensor.__ilshift__: lambda self, other: -1,
+        Tensor.__rshift__: lambda self, other: -1,
+        Tensor.__rrshift__: lambda self, other: -1,
+        Tensor.__irshift__: lambda self, other: -1,
+        Tensor.__and__: lambda self, other: -1,
+        Tensor.__or__: lambda self, other: -1,
+        Tensor.__xor__: lambda self, other: -1,
+        Tensor.__float__: lambda self: -1,
+        Tensor.__complex__: lambda self: -1,
+        Tensor.__array__: lambda self, dtype: -1,
+        Tensor.__bool__: lambda self: -1,
+        Tensor.__contains__: lambda self, other: -1,
+        Tensor.__neg__: lambda self: -1,
+        Tensor.__invert__: lambda self: -1,
+        Tensor.__mod__: lambda self, other: -1,
+        Tensor.__rmod__: lambda self, other: -1,
+        Tensor.__imod__: lambda self, other: -1,
+        Tensor.__array_wrap__: lambda self, array: -1,
+        Tensor.__getitem__: lambda self, idx: -1,
+        Tensor.__deepcopy__: lambda self, memo: -1,
+        Tensor.__int__: lambda self: -1,
+        Tensor.__long__: lambda self: -1,
+        Tensor.__index__: lambda self: -1,
+        Tensor.__len__: lambda self: -1,
+        Tensor.__format__: lambda self, format_spec: -1,
+        Tensor.__reduce_ex__: lambda self, proto: -1,
+        Tensor.__reversed__: lambda self: -1,
+        Tensor.__repr__: lambda self, *, tensor_contents=None: -1,
+        Tensor.__setitem__: lambda self, k, v: -1,
+        Tensor.__setstate__: lambda self, d: -1,
+        Tensor.T.__get__: lambda self: -1,
+        Tensor.H.__get__: lambda self: -1,
+        Tensor.mT.__get__: lambda self: -1,
+        Tensor.mH.__get__: lambda self: -1,
+        Tensor._backward_hooks.__get__: lambda self: -1,
+        Tensor._post_accumulate_grad_hooks.__get__: lambda self: -1,
+        Tensor._base.__get__: lambda self: -1,
+        Tensor._cdata.__get__: lambda self: -1,
+        Tensor.grad.__get__: lambda self: -1,
+        Tensor._grad.__get__: lambda self: -1,
+        Tensor._grad_fn.__get__: lambda self: -1,
+        Tensor.grad_fn.__get__: lambda self: -1,
+        Tensor._version.__get__: lambda self: -1,
+        Tensor._autocast_to_reduced_precision: lambda self, cuda_enabled, cpu_enabled, cuda_dtype, cpu_dtype: -1,
+        Tensor._autocast_to_full_precision: lambda self, cuda_enabled, cpu_enabled: -1,
+        Tensor.data.__get__: lambda self: -1,
+        Tensor.device.__get__: lambda self: -1,
+        Tensor.dtype.__get__: lambda self: -1,
+        Tensor.is_cuda.__get__: lambda self: -1,
+        Tensor.is_cpu.__get__: lambda self: -1,
+        Tensor.is_xla.__get__: lambda self: -1,
+        Tensor.is_xpu.__get__: lambda self: -1,
+        Tensor.is_ipu.__get__: lambda self: -1,
+        Tensor.is_leaf.__get__: lambda self: -1,
+        Tensor.retains_grad.__get__: lambda self: -1,
+        Tensor.is_meta.__get__: lambda self: -1,
+        Tensor.is_mps.__get__: lambda self: -1,
+        Tensor.is_mtia.__get__: lambda self: -1,
+        Tensor.is_nested.__get__: lambda self: -1,
+        Tensor.is_ort.__get__: lambda self: -1,
+        Tensor.is_mkldnn.__get__: lambda self: -1,
+        Tensor.is_quantized.__get__: lambda self: -1,
+        Tensor.is_sparse.__get__: lambda self: -1,
+        Tensor.is_sparse_csr.__get__: lambda self: -1,
+        Tensor.is_vulkan.__get__: lambda self: -1,
+        Tensor.itemsize.__get__: lambda self: -1,
+        Tensor.layout.__get__: lambda self: -1,
+        Tensor.name.__get__: lambda self: -1,
+        Tensor.names.__get__: lambda self: -1,
+        Tensor.nbytes.__get__: lambda self: -1,
+        Tensor.ndim.__get__: lambda self: -1,
+        Tensor.output_nr.__get__: lambda self: -1,
+        Tensor.requires_grad.__get__: lambda self: -1,
+        Tensor.shape.__get__: lambda self: -1,
+        Tensor.volatile.__get__: lambda self: -1,
+        Tensor.real.__get__: lambda self: -1,
+        Tensor.imag.__get__: lambda self: -1,
+        Tensor.__cuda_array_interface__.__get__: lambda self: -1,
+        Tensor.type: lambda self, dtype=None, non_blocking=False, **kwargs: -1,
+        Tensor._dimI: lambda self: -1,
+        Tensor._dimV: lambda self: -1,
+        Tensor._indices: lambda self: -1,
+        Tensor._is_view: lambda self: -1,
+        Tensor._nnz: lambda self: -1,
+        Tensor.crow_indices: lambda self: -1,
+        Tensor.col_indices: lambda self: -1,
+        Tensor.ccol_indices: lambda self: -1,
+        Tensor.row_indices: lambda self: -1,
+        Tensor._update_names: lambda self, names, inplace: -1,
+        Tensor._values: lambda self: -1,
+        Tensor.adjoint: lambda self: -1,
+        Tensor.align_as: lambda self, other: -1,
+        Tensor.align_to: lambda self, order, ellipsis_idx: -1,
+        Tensor.apply_: lambda self, callable: -1,
+        Tensor.as_strided: lambda self, size, stride: -1,
+        Tensor.as_strided_: lambda self, size, stride: -1,
+        Tensor.backward: lambda self, gradient=None, retain_graph=None, create_graph=False, inputs=None: -1,
+        Tensor.bfloat16: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.bool: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.byte: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.char: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.cauchy_: lambda self, median=0, sigma=1, *, generator=None: -1,
+        Tensor.coalesce: lambda self: -1,
+        Tensor._coalesced_: lambda self, coalesced: -1,
+        Tensor.contiguous: lambda self, memory_format=torch.contiguous_format: -1,
+        Tensor.copy_: lambda self, src, non_blocking=False: -1,
+        Tensor.cpu: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.cuda: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.xpu: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.ipu: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.data_ptr: lambda self: -1,
+        Tensor.dense_dim: lambda self: -1,
+        Tensor.diagonal_scatter: lambda self, src, offset=0, dim1=0, dim2=1: -1,
+        Tensor.dim: lambda self: -1,
+        Tensor.dim_order: lambda self: -1,
+        Tensor.double: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.cdouble: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.element_size: lambda self: -1,
+        Tensor.expand: lambda self, size: -1,
+        Tensor.expand_as: lambda self, other: -1,
+        Tensor.exponential_: lambda self, lambd=1, *, generator=None: -1,
+        Tensor.fill_: lambda self, value: -1,
+        Tensor.fill_diagonal_: lambda self, value: -1,
+        Tensor.float: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.cfloat: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.geometric_: lambda self, p, *, generator=None: -1,
+        Tensor.get_device: lambda self: -1,
+        Tensor.half: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.chalf: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.has_names: lambda self: -1,
+        Tensor.indices: lambda self: -1,
+        Tensor.int: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.is_coalesced: lambda self: -1,
+        Tensor.is_contiguous: lambda self: -1,
+        Tensor.is_inference: lambda self: -1,
+        Tensor.is_pinned: lambda self: -1,
+        Tensor.is_set_to: lambda self, tensor: -1,
+        Tensor.is_shared: lambda self: -1,
+        Tensor.item: lambda self: -1,
+        Tensor.log_normal_: lambda self, mean=1, std=2, *, generator=None: -1,
+        Tensor.log_softmax: lambda self, dim: -1,
+        Tensor.long: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.map_: lambda self, tensor, callable: -1,
+        Tensor.map2_: lambda self, x, y, callable: -1,
+        Tensor.mm: lambda self, mat2: -1,
+        Tensor.module_load: lambda self, other, assign=False: -1,
+        Tensor.narrow_copy: lambda self, dimension, start, length: -1,
+        Tensor.ndimension: lambda self: -1,
+        Tensor.nelement: lambda self: -1,
+        Tensor._nested_tensor_size: lambda self: -1,
+        Tensor._nested_tensor_storage_offsets: lambda self: -1,
+        Tensor._nested_tensor_strides: lambda self: -1,
+        Tensor.normal_: lambda self: -1,
+        Tensor.numpy: lambda self: -1,
+        Tensor.permute: lambda self, dim: -1,
+        Tensor.pin_memory: lambda self: -1,
+        Tensor.put_: lambda self, indices, tensor, accumulate=False: -1,
+        Tensor.qscheme: lambda self: -1,
+        Tensor.random_: lambda self, from_=0, to=None, *, generator=None: -1,
+        Tensor.record_stream: lambda self, stream: -1,
+        Tensor.refine_names: lambda self, names: -1,
+        Tensor.register_hook: lambda self, hook: -1,
+        Tensor.register_post_accumulate_grad_hook: lambda self, hook: -1,
+        Tensor.rename: lambda self, name: -1,
+        Tensor.repeat: lambda self, *size: -1,
+        Tensor.requires_grad_: lambda self, requires_grad=True: -1,
+        Tensor.reshape_as: lambda self, other: -1,
+        Tensor.resize: lambda self, *size: -1,
+        Tensor.resize_: lambda self, size: -1,
+        Tensor.resize_as: lambda self, other: -1,
+        Tensor.resize_as_sparse_: lambda self, other: -1,
+        Tensor.retain_grad: lambda self: -1,
+        Tensor.set_: lambda self, source=None, storage_offset=0, size=None, stride=None: -1,
+        Tensor.select_scatter: lambda self, src, dim, index: -1,
+        Tensor.share_memory_: lambda self: -1,
+        Tensor.short: lambda self, memory_format=torch.preserve_format: -1,
+        Tensor.size: lambda self: -1,
+        Tensor.slice_scatter: lambda self, src, dim=0, start=None, end=None, step=1: -1,
+        Tensor.sparse_dim: lambda self: -1,
+        Tensor.sparse_mask: lambda self, mask: -1,
+        Tensor._sparse_mask_projection: lambda self, mask, accumulate_matches=False: -1,
+        Tensor.sparse_resize_: lambda self, size1, size2, dense_dim: -1,
+        Tensor.sparse_resize_and_clear_: lambda self, size1, size2, dense_dim: -1,
+        Tensor.sspaddmm: lambda self, mat1, mat2, beta=1, alpha=1, out=None: -1,
+        Tensor.storage: lambda self: -1,
+        Tensor.untyped_storage: lambda self: -1,
+        Tensor.storage_offset: lambda self: -1,
+        Tensor.storage_type: lambda self: -1,
+        Tensor.sum_to_size: lambda self, size: -1,
+        Tensor.tile: lambda self, *reps: -1,
+        Tensor.to: lambda self, dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format: -1,
+        Tensor.to_dense: lambda self, dtype=None, *, masked_grad=None: -1,
+        Tensor._to_dense: lambda self, dtype=None, masked_grad=None: -1,
+        Tensor.to_sparse: lambda self: -1,
+        Tensor.tolist: lambda self: -1,
+        Tensor.to_mkldnn: lambda self: -1,
+        Tensor.type_as: lambda self, other: -1,
+        Tensor.unfold: lambda self, dimension, size, step: -1,
+        Tensor.uniform_: lambda self, from_=0, to=1: -1,
+        Tensor.values: lambda self: -1,
+        Tensor.view: lambda self, shape: -1,
+        Tensor.view_as: lambda self, other: -1,
+        Tensor.zero_: lambda self: -1,
+        Tensor.__dlpack__: lambda self, stream=None: -1,
+        Tensor.__dlpack_device__: lambda self: -1,
+        torch.linalg.lstsq: lambda self, b, cond=None, driver=None: -1,
+    }
+
+    ret2 = {}
+    ignored = get_ignored_functions()
+
+    for k, v in ret.items():
+        # Generate methods like __add__ and add_ by default from add
+        names = [
+            k.__name__,  # Default method
+            k.__name__ + "_",  # Inplace variant
+            "__" + k.__name__ + "__",  # Dunder method
+            "__i" + k.__name__ + "__",  # Inplace dunder method
+            "__r" + k.__name__ + "__",  # Reverse dunder method
+        ]
+
+        if k.__name__.startswith("bitwise_"):
+            # bitwise_<op> have dunder methods of the form __<op>__
+            # And so on.
+            subname = k.__name__[len("bitwise_"):]
+            names.extend([
+                "__" + subname + "__",
+                "__i" + subname + "__",
+                "__r" + subname + "__"
+            ])
+
+        for name in names:
+            func = getattr(Tensor, name, None)
+            if callable(func) and func not in ret and func not in ignored:
+                ret2[func] = v
+
+    ret.update(ret2)
+    return ret
+
+def wrap_torch_function(dispatcher: Callable):
+    """Wraps a given function with ``__torch_function__`` -related functionality.
+
+    Parameters
+    ----------
+    dispatcher: Callable
+        A callable that returns an iterable of Tensor-likes passed into the function.
+
+    Note
+    ----
+    This decorator may reduce the performance of your code. Generally, it's enough to express
+    your code as a series of functions that, themselves, support __torch_function__. If you
+    find yourself in the rare situation where this is not the case, e.g. if you're wrapping a
+    low-level library and you also need it to work for Tensor-likes, then this function is available.
+
+    Examples
+    --------
+    >>> def dispatcher(a): # Must have the same signature as func
+    ...     return (a,)
+    >>> @torch.overrides.wrap_torch_function(dispatcher)
+    >>> def func(a): # This will make func dispatchable by __torch_function__
+    ...     return a + 0
+    """
+    def inner(func):
+        @functools.wraps(func)
+        def wrapped(*args, **kwargs):
+            relevant_args = dispatcher(*args, **kwargs)
+            if has_torch_function(relevant_args):
+                return handle_torch_function(wrapped, relevant_args, *args, **kwargs)
+
+            return func(*args, **kwargs)
+
+        return wrapped
+
+    return inner
+
+def _get_overloaded_args(relevant_args: Iterable[Any], get_type_fn: Callable[[Any], Type] = None) -> List[Any]:
+    """Returns a list of arguments on which to call __torch_function__.
+
+    Checks arguments in relevant_args for __torch_function__ implementations,
+    storing references to the arguments and their types in overloaded_args and
+    overloaded_types in order of calling precedence. Only distinct types are
+    considered. If a type is a subclass of another type it will have higher
+    precedence, otherwise the precedence order is the same as the order of
+    arguments in relevant_args, that is, from left-to-right in the argument list.
+
+    The precedence-determining algorithm implemented in this function is
+    described in `NEP-0018`_.
+
+    See torch::append_overloaded_arg for the equivalent function in the C++
+    implementation.
+
+    Parameters
+    ----------
+    relevant_args : iterable of array-like
+        Iterable of array-like arguments to check for __torch_function__
+        methods.
+
+    get_type_fn : callable, optional
+        Function to call on each argument in relevant_args to get its type.
+
+    Returns
+    -------
+    overloaded_args : list
+        Arguments from relevant_args on which to call __torch_function__
+        methods, in the order in which they should be called.
+
+    .. _NEP-0018:
+       https://numpy.org/neps/nep-0018-array-function-protocol.html
+    """
+    if get_type_fn is None:
+        get_type_fn = type
+
+    # If torch function is not enabled, there are no overloaded types
+    if not torch._C._is_torch_function_enabled():
+        return []
+    # Runtime is O(num_arguments * num_unique_types)
+    overloaded_types: Set[Type] = set()
+    overloaded_args: List[Any] = []
+    for arg in relevant_args:
+        arg_type = get_type_fn(arg)
+        # We only collect arguments if they have a unique type, which ensures
+        # reasonable performance even with a long list of possibly overloaded
+        # arguments.
+        #
+        # NB: Important to exclude _disabled_torch_function_impl, otherwise
+        # https://github.com/pytorch/pytorch/issues/64687
+        if (arg_type not in overloaded_types and hasattr(arg_type, '__torch_function__') and
+                arg_type.__torch_function__ != torch._C._disabled_torch_function_impl):
+            # Create lists explicitly for the first type (usually the only one
+            # done) to avoid setting up the iterator for overloaded_args.
+            if overloaded_types:
+                overloaded_types.add(arg_type)
+                # By default, insert argument at the end, but if it is
+                # subclass of another argument, insert it before that argument.
+                # This ensures "subclasses before superclasses".
+                index = len(overloaded_args)
+                for i, old_arg in enumerate(overloaded_args):
+                    if issubclass(arg_type, get_type_fn(old_arg)):
+                        index = i
+                        break
+                overloaded_args.insert(index, arg)
+            else:
+                overloaded_types = {arg_type}
+                overloaded_args = [arg]
+    return overloaded_args
+
+
+def handle_torch_function(
+        public_api: Callable, relevant_args: Iterable[Any], *args, **kwargs) -> Any:
+    """Implement a function with checks for ``__torch_function__`` overrides.
+
+    See torch::autograd::handle_torch_function for the equivalent of this
+    function in the C++ implementation.
+
+    Arguments
+    ---------
+    public_api : function
+        Function exposed by the public torch API originally called like
+        ``public_api(*args, **kwargs)`` on which arguments are now being
+        checked.
+    relevant_args : iterable
+        Iterable of arguments to check for __torch_function__ methods.
+    args : tuple
+        Arbitrary positional arguments originally passed into ``public_api``.
+    kwargs : tuple
+        Arbitrary keyword arguments originally passed into ``public_api``.
+
+    Returns
+    -------
+    object
+        Result from calling ``implementation`` or an ``__torch_function__``
+        method, as appropriate.
+
+    Raises
+    ------
+    TypeError : if no implementation is found.
+
+    Example
+    -------
+    >>> def func(a):
+    ...     if has_torch_function_unary(a):
+    ...         return handle_torch_function(func, (a,), a)
+    ...     return a + 0
+    """
+    # Check for __torch_function__ methods.
+    overloaded_args = _get_overloaded_args(relevant_args)
+    # overloaded_args already have unique types.
+    types = tuple(map(type, overloaded_args))
+
+    # Check for __torch_function__ mode.
+    if _is_torch_function_mode_enabled():
+        # if we're here, the mode must be set to a TorchFunctionStackMode
+        # this unsets it and calls directly into TorchFunctionStackMode's torch function
+        with _pop_mode_temporarily() as mode:
+            result = mode.__torch_function__(public_api, types, args, kwargs)
+        if result is not NotImplemented:
+            return result
+
+    # Call overrides
+    for overloaded_arg in overloaded_args:
+        # This call needs to become a classmethod call in the future.
+        # See https://github.com/pytorch/pytorch/issues/63767
+        torch_func_method = overloaded_arg.__torch_function__
+        if hasattr(torch_func_method, "__self__") and torch_func_method.__self__ is overloaded_arg and \
+                torch_func_method is not torch._C._disabled_torch_function_impl:
+            warnings.warn("Defining your `__torch_function__ as a plain method is deprecated and "
+                          "will be an error in future, please define it as a classmethod.",
+                          DeprecationWarning)
+
+        # Use `public_api` instead of `implementation` so __torch_function__
+        # implementations can do equality/identity comparisons.
+        result = torch_func_method(public_api, types, args, kwargs)
+
+        if result is not NotImplemented:
+            return result
+
+    func_name = f'{public_api.__module__}.{public_api.__name__}'
+    msg = (
+        f"no implementation found for '{func_name}' on types that implement "
+        f'__torch_function__: {[type(arg) for arg in overloaded_args]}'
+    )
+    if _is_torch_function_mode_enabled():
+        msg += f" nor in mode {_get_current_function_mode()}"
+    raise TypeError(msg)
+
+has_torch_function = _add_docstr(
+    _has_torch_function,
+    r"""Check for __torch_function__ implementations in the elements of an iterable
+    or if a __torch_function__ mode is enabled.  Considers exact ``Tensor`` s
+    and ``Parameter`` s non-dispatchable.  Use this to guard a call to
+    :func:`handle_torch_function`; don't use it to test if something
+    is Tensor-like, use :func:`is_tensor_like` instead.
+    Arguments
+    ---------
+    relevant_args : iterable
+        Iterable or arguments to check for __torch_function__ methods.
+    Returns
+    -------
+    bool
+        True if any of the elements of relevant_args have __torch_function__
+        implementations, False otherwise.
+    See Also
+    ________
+    torch.is_tensor_like
+        Checks if something is a Tensor-like, including an exact ``Tensor``.
+    """
+)
+
+# Public name for the single-argument check: binds whatever
+# ``_add_docstr(_has_torch_function_unary, <text>)`` returns. Both names come
+# from outside this section; ``_add_docstr`` is presumably what attaches the
+# text below as the callable's docstring -- confirm at its import site.
+has_torch_function_unary = _add_docstr(
+    _has_torch_function_unary,
+    r"""Special case of `has_torch_function` for single inputs.
+    Instead of:
+      `has_torch_function((t,))`
+    call:
+      `has_torch_function_unary(t)`
+    which skips unnecessary packing and unpacking work.
+    """
+)
+
+# Public name for the variadic check: binds whatever
+# ``_add_docstr(_has_torch_function_variadic, <text>)`` returns. Both names
+# come from outside this section; ``_add_docstr`` is presumably what attaches
+# the text below as the callable's docstring -- confirm at its import site.
+has_torch_function_variadic = _add_docstr(
+    _has_torch_function_variadic,
+    r"""Special case of `has_torch_function` that skips tuple creation.
+
+    This uses the METH_FASTCALL protocol introduced in Python 3.7
+
+    Instead of:
+      `has_torch_function((a, b))`
+    call:
+      `has_torch_function_variadic(a, b)`
+    which skips unnecessary packing and unpacking work.
+    """
+)
+
+@functools.lru_cache(None)  # memoised: with no arguments, the scan below only runs on the first call
+def _get_overridable_functions() -> Tuple[Dict[Any, List[Callable]], Dict[Callable, str]]:
+    overridable_funcs = collections.defaultdict(list)  # first return value: key -> list of overridable callables
+    index = {}  # second return value: callable -> "namespace.name" string
+    tested_namespaces = [  # (display name, namespace object, attribute names to scan)
+        ("torch", torch, torch.__all__),
+        ("torch.functional", torch.functional, torch.functional.__all__),
+        ("torch.nn.functional", torch.nn.functional, dir(torch.nn.functional)),
+        ("torch.nn.init", torch.nn.init, dir(torch.nn.init)),
+        ("torch.Tensor", torch.Tensor, dir(torch.Tensor)),
+        ("torch.linalg", torch.linalg, dir(torch.linalg)),
+        ("torch.fft", torch.fft, dir(torch.fft)),
+        ("torch.special", torch.special, dir(torch.special)),
+    ]
+    for namespace_str, namespace, ns_funcs in tested_namespaces:
+        for func_name in ns_funcs:
+            ignore = False  # when True the name is still indexed but never reported as overridable
+            # ignore private functions or functions that are deleted in torch.__init__
+            if namespace is not torch.Tensor:
+                if func_name.startswith('__'):
+                    continue  # dunder names are skipped entirely (not even indexed)
+                elif func_name.startswith('_'):
+                    ignore = True
+                elif func_name.endswith('_'):
+                    ignore = True
+                elif not func_name[0].islower():
+                    ignore = True
+                elif func_name == 'unique_dim':
+                    continue
+            else:
+                func = getattr(namespace, func_name)
+                if getattr(object, func_name, None) == func:
+                    continue  # attribute is the one found on ``object``; nothing Tensor-specific
+                if func_name == '__weakref__':
+                    continue
+            func = getattr(namespace, func_name)  # (re)bind func; the non-Tensor branch above has not looked it up yet
+            if namespace is torch.Tensor and getattr(object, func_name, None) == func:
+                continue  # same check as in the Tensor branch above
+            # ignore re-exported modules
+            if isinstance(func, types.ModuleType):
+                continue
+            # ignore __future__ imports
+            if isinstance(func, __future__._Feature):
+                continue
+
+            if not callable(func) and hasattr(func, "__get__"):  # non-callable descriptor, e.g. a property
+                index[func.__get__] = f"{namespace_str}.{func_name}.__get__"
+                index[func.__set__] = f"{namespace_str}.{func_name}.__set__"
+                if ignore:
+                    continue
+                if func.__get__ in get_ignored_functions():
+                    msg = ("{}.{} is in the tuple returned by torch._overrides.get_ignored_functions "
+                           "but still has an explicit override")
+                    assert func.__get__ not in get_testing_overrides(), msg.format(namespace, func.__name__)
+                    continue
+                else:
+                    overridable_funcs[func].append(func.__get__)  # NOTE(review): keyed by the descriptor, not by namespace - confirm intended
+                    continue
+
+            if not callable(func):
+                continue
+
+            index[func] = f"{namespace_str}.{func_name}"  # indexed before the ignore check, so ignored names still resolve
+
+            if ignore:
+                continue
+
+            # cannot be overridden by __torch_function__
+            if func in get_ignored_functions():
+                msg = ("{}.{} is in the tuple returned by torch._overrides.get_ignored_functions "
+                       "but still has an explicit override")
+                assert func not in get_testing_overrides(), msg.format(namespace, func.__name__)
+                continue
+            overridable_funcs[namespace].append(func)
+    return overridable_funcs, index
+
+@_disable_user_warnings
+def get_overridable_functions() -> Dict[Any, List[Callable]]:
+    """Return the functions that ``__torch_function__`` is able to override.
+
+    Returns
+    -------
+    Dict[Any, List[Callable]]
+        Mapping from each namespace to the overridable functions found in it.
+    """
+    functions_by_namespace, _name_index = _get_overridable_functions()
+    return functions_by_namespace
+
+@_disable_user_warnings
+def resolve_name(f):
+    """Look up a readable, qualified name for a ``__torch_function__`` target.
+
+    Arguments
+    ---------
+    f : Callable
+        Function whose name should be resolved.
+
+    Returns
+    -------
+    str
+        Name that, if eval'ed, should give back ``f``; ``None`` when ``f``
+        is not present in the index of known functions.
+    """
+    op_types = (torch._ops.OpOverload, torch._ops.OpOverloadPacket)
+    if not isinstance(f, op_types):
+        return _get_overridable_functions()[1].get(f)
+    return str(f)
+
+@functools.lru_cache(None)
+def _get_tensor_methods() -> Set[Callable]:
+    """ Collects the overridable methods on ``torch.Tensor`` into a set (cached) """
+    tensor_funcs = get_overridable_functions()[torch.Tensor]
+    # Materialise the list as a set so callers get fast membership tests.
+    return set(tensor_funcs)
+
+@_disable_user_warnings
+def is_tensor_method_or_property(func: Callable) -> bool:
+    """
+    Tell whether ``func``, as handed to ``__torch_function__``, is a handler
+    for a method or property belonging to ``torch.Tensor``.
+
+    .. note::
+       For properties, their ``__get__`` method must be passed in.
+
+    This is useful because methods/properties sometimes lack a
+    ``__module__`` slot, and because they require the first passed-in
+    argument to be an instance of ``torch.Tensor``.
+
+    Examples
+    --------
+    >>> is_tensor_method_or_property(torch.Tensor.add)
+    True
+    >>> is_tensor_method_or_property(torch.add)
+    False
+    """
+    if func in _get_tensor_methods():
+        return True
+    # Otherwise accept anything named ``__get__`` (how properties are passed in).
+    return func.__name__ == "__get__"
+
+def is_tensor_like(inp):
+    """
+    Report whether ``inp`` is a Tensor-like: an exact ``torch.Tensor``, or
+    any object on which a ``__torch_function__`` attribute can be found.
+
+    Examples
+    --------
+    A subclass of tensor is generally a Tensor-like.
+
+    >>> class SubTensor(torch.Tensor): ...
+    >>> is_tensor_like(SubTensor([0]))
+    True
+
+    Built-in or user types aren't usually Tensor-like.
+
+    >>> is_tensor_like(6)
+    False
+    >>> is_tensor_like(None)
+    False
+    >>> class NotATensor: ...
+    >>> is_tensor_like(NotATensor())
+    False
+
+    But, they can be made Tensor-like by implementing __torch_function__.
+
+    >>> class TensorLike:
+    ...     @classmethod
+    ...     def __torch_function__(cls, func, types, args, kwargs):
+    ...         return -1
+    >>> is_tensor_like(TensorLike())
+    True
+    """
+    if type(inp) is torch.Tensor:
+        return True
+    return hasattr(inp, "__torch_function__")
+
+class TorchFunctionMode:
+    """
+    A ``TorchFunctionMode`` lets you override the meaning of every
+    ``__torch_function__`` overrideable function within a dynamic scope,
+    without having to create a tensor subclass or manually monkey-patch
+    functions in the PyTorch API.  Some common situations where you should
+    use a mode:
+
+        * You want to override the meaning of factory functions, or other
+          functions that do not otherwise take a tensor as an argument
+          (these cannot be overridden with tensor subclasses).
+
+        * You want to override the behavior of all functions without needing
+          to wrap your inputs in tensor subclasses; e.g., if you are just
+          interested in logging intermediate computations.
+
+        * You want to control the order of execution of various tensor
+          subclasses explicitly, rather than implicitly via the return of
+          ``NotImplemented``.
+
+    Independent subclasses of :class:`TorchFunctionMode` compose: modes are
+    pushed onto a stack with ``with MyMode():``.  By default, PyTorch API
+    calls made inside your ``__torch_function__`` implementation are
+    forwarded to the next mode on the mode stack.  If you want to recursively
+    call back into your current ``__torch_function__`` implementation,
+    either invoke ``self.__torch_function__(...)`` explicitly, or use the
+    context manager ``enable_torch_function_mode(self, replace=self.inner)``
+    to make the PyTorch API self-referential (beware of infinite loops in
+    this case!)
+    """
+    inner: "TorchFunctionMode"
+
+    # Force metaclass to generate constructor at the base of the hierarchy
+    def __init__(self):
+        return None
+
+    def __torch_function__(self, func, types, args=(), kwargs=None):
+        raise NotImplementedError()  # subclasses supply the actual handler
+
+    def __enter__(self):
+        _push_mode(self)  # entering the ``with`` block pushes this mode
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        _pop_mode()  # leaving the block pops the top of the stack
+
+    @classmethod
+    def push(cls, *args, **kwargs):
+        # Legacy entry point: only warns and builds the instance; nothing is pushed here.
+        warnings.warn("`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`")
+        return cls(*args, **kwargs)
+
+
+def _get_current_function_mode():
+    depth = _len_torch_function_stack()
+    return None if depth <= 0 else _get_function_stack_at(depth - 1)
+
+
+def _get_current_function_mode_stack():
+    # Snapshot of every stack entry, ordered by stack index (0 first).
+    return list(map(_get_function_stack_at, range(_len_torch_function_stack())))
+
+def _push_mode(mode):  # thin wrapper: hands ``mode`` to _push_on_torch_function_stack
+    _push_on_torch_function_stack(mode)
+
+
+def _pop_mode():
+    # Hand back whatever _pop_torch_function_stack() returns.
+    return _pop_torch_function_stack()
+
+
+@contextlib.contextmanager
+def _pop_mode_temporarily():
+    suspended_mode = _pop_mode()  # taken off the stack for the duration of the block
+    try:
+        yield suspended_mode
+    finally:
+        _push_mode(suspended_mode)  # pushed back even when the block raises
+
+class BaseTorchFunctionMode(TorchFunctionMode):
+    """Mode whose handler simply calls ``func`` with the given arguments."""
+    def __torch_function__(self, func, types, args=(), kwargs=None):
+        call_kwargs = {} if kwargs is None else kwargs
+        return func(*args, **call_kwargs)
+
+
+@contextlib.contextmanager
+def enable_reentrant_dispatch():
+    """Run the ``with`` body inside ``torch._C._RestorePythonTLSSnapshot()``."""
+    # Why a wrapper rather than the plain alias
+    # `enable_reentrant_dispatch = torch._C._RestorePythonTLSSnapshot`:
+    # 1. torch._C._RestorePythonTLSSnapshot is not available yet when this
+    #    file is first imported (probably an import-order issue).
+    # 2. enable_reentrant_dispatch is technically public API; aliasing the
+    #    object would make its __module__ look private.
+    tls_snapshot_guard = torch._C._RestorePythonTLSSnapshot()
+    with tls_snapshot_guard:
+        # Nothing to clean up here: the guard handles entry/exit, and this
+        # function adds no handling of its own for exceptions from the body.
+        yield
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quasirandom.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quasirandom.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c9b949c55651c42895c1a1afb6d9050d41aca2f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quasirandom.py
@@ -0,0 +1,180 @@
+import torch
+from typing import Optional
+
+
+class SobolEngine:
+    r"""
+    The :class:`torch.quasirandom.SobolEngine` is an engine for generating
+    (scrambled) Sobol sequences. Sobol sequences are an example of low
+    discrepancy quasi-random sequences.
+
+    This implementation of an engine for Sobol sequences is capable of
+    sampling sequences up to a maximum dimension of 21201. It uses direction
+    numbers from https://web.maths.unsw.edu.au/~fkuo/sobol/ obtained using the
+    search criterion D(6) up to the dimension 21201. This is the recommended
+    choice by the authors.
+
+    References:
+      - Art B. Owen. Scrambling Sobol and Niederreiter-Xing points.
+        Journal of Complexity, 14(4):466-489, December 1998.
+
+      - I. M. Sobol. The distribution of points in a cube and the accurate
+        evaluation of integrals.
+        Zh. Vychisl. Mat. i Mat. Phys., 7:784-802, 1967.
+
+    Args:
+        dimension (Int): The dimensionality of the sequence to be drawn
+        scramble (bool, optional): Setting this to ``True`` will produce
+                                   scrambled Sobol sequences. Scrambling is
+                                   capable of producing better Sobol
+                                   sequences. Default: ``False``.
+        seed (Int, optional): This is the seed for the scrambling. The seed
+                              of the random number generator is set to this,
+                              if specified. Otherwise, it uses a random seed.
+                              Default: ``None``
+
+    Examples::
+
+        >>> # xdoctest: +SKIP("unseeded random state")
+        >>> soboleng = torch.quasirandom.SobolEngine(dimension=5)
+        >>> soboleng.draw(3)
+        tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
+                [0.5000, 0.5000, 0.5000, 0.5000, 0.5000],
+                [0.7500, 0.2500, 0.2500, 0.2500, 0.7500]])
+    """
+    MAXBIT = 30
+    MAXDIM = 21201
+
+    def __init__(self, dimension, scramble=False, seed=None):
+        if dimension > self.MAXDIM or dimension < 1:
+            raise ValueError("Supported range of dimensionality "
+                             f"for SobolEngine is [1, {self.MAXDIM}]")
+
+        self.seed = seed
+        self.scramble = scramble
+        self.dimension = dimension
+
+        cpu = torch.device("cpu")
+
+        self.sobolstate = torch.zeros(dimension, self.MAXBIT, device=cpu, dtype=torch.long)
+        torch._sobol_engine_initialize_state_(self.sobolstate, self.dimension)
+
+        if not self.scramble:
+            self.shift = torch.zeros(self.dimension, device=cpu, dtype=torch.long)
+        else:
+            self._scramble()
+
+        self.quasi = self.shift.clone(memory_format=torch.contiguous_format)
+        self._first_point = (self.quasi / 2 ** self.MAXBIT).reshape(1, -1)
+        self.num_generated = 0
+
+    def draw(self, n: int = 1, out: Optional[torch.Tensor] = None,
+             dtype: torch.dtype = torch.float32) -> torch.Tensor:
+        r"""
+        Function to draw a sequence of :attr:`n` points from a Sobol sequence.
+        Note that the samples are dependent on the previous samples. The size
+        of the result is :math:`(n, dimension)`.
+
+        Args:
+            n (Int, optional): The length of sequence of points to draw.
+                               Default: 1
+            out (Tensor, optional): The output tensor
+            dtype (:class:`torch.dtype`, optional): the desired data type of the
+                                                    returned tensor.
+                                                    Default: ``torch.float32``
+        """
+        if self.num_generated == 0:
+            if n == 1:
+                result = self._first_point.to(dtype)
+            else:
+                result, self.quasi = torch._sobol_engine_draw(
+                    self.quasi, n - 1, self.sobolstate, self.dimension, self.num_generated, dtype=dtype)
+                # Cast the cached first point too; otherwise cat() type-promotes and may ignore `dtype`.
+                result = torch.cat((self._first_point.to(dtype), result), dim=-2)
+        else:
+            result, self.quasi = torch._sobol_engine_draw(
+                self.quasi, n, self.sobolstate, self.dimension, self.num_generated - 1, dtype=dtype,
+            )
+
+        self.num_generated += n
+
+        if out is not None:
+            out.resize_as_(result).copy_(result)
+            return out
+
+        return result
+
+    def draw_base2(self, m: int, out: Optional[torch.Tensor] = None,
+                   dtype: torch.dtype = torch.float32) -> torch.Tensor:
+        r"""
+        Function to draw a sequence of :attr:`2**m` points from a Sobol sequence.
+        Note that the samples are dependent on the previous samples. The size
+        of the result is :math:`(2**m, dimension)`.
+
+        Args:
+            m (Int): The (base2) exponent of the number of points to draw.
+            out (Tensor, optional): The output tensor
+            dtype (:class:`torch.dtype`, optional): the desired data type of the
+                                                    returned tensor.
+                                                    Default: ``torch.float32``
+        """
+        n = 2 ** m
+        total_n = self.num_generated + n
+        if not (total_n & (total_n - 1) == 0):
+            raise ValueError("The balance properties of Sobol' points require "
+                             f"n to be a power of 2. {self.num_generated} points have been "
+                             f"previously generated, then: n={self.num_generated}+2**{m}={total_n}. "
+                             "If you still want to do this, please use "
+                             "'SobolEngine.draw()' instead."
+                             )
+        return self.draw(n=n, out=out, dtype=dtype)
+
+    def reset(self):
+        r"""
+        Function to reset the ``SobolEngine`` to base state.
+        """
+        self.quasi.copy_(self.shift)
+        self.num_generated = 0
+        return self
+
+    def fast_forward(self, n):
+        r"""
+        Function to fast-forward the state of the ``SobolEngine`` by
+        :attr:`n` steps. This is equivalent to drawing :attr:`n` samples
+        without using the samples.
+
+        Args:
+            n (Int): The number of steps to fast-forward by.
+        """
+        if self.num_generated == 0:
+            torch._sobol_engine_ff_(self.quasi, n - 1, self.sobolstate, self.dimension, self.num_generated)
+        else:
+            torch._sobol_engine_ff_(self.quasi, n, self.sobolstate, self.dimension, self.num_generated - 1)
+        self.num_generated += n
+        return self
+
+    def _scramble(self):
+        g: Optional[torch.Generator] = None
+        if self.seed is not None:
+            g = torch.Generator()
+            g.manual_seed(self.seed)
+
+        cpu = torch.device("cpu")
+
+        # Generate shift vector
+        shift_ints = torch.randint(2, (self.dimension, self.MAXBIT), device=cpu, generator=g)
+        self.shift = torch.mv(shift_ints, torch.pow(2, torch.arange(0, self.MAXBIT, device=cpu)))
+
+        # Generate lower triangular matrices (stacked across dimensions)
+        ltm_dims = (self.dimension, self.MAXBIT, self.MAXBIT)
+        ltm = torch.randint(2, ltm_dims, device=cpu, generator=g).tril()
+
+        torch._sobol_engine_scramble_(self.sobolstate, ltm, self.dimension)
+
+    def __repr__(self):
+        fmt_string = [f'dimension={self.dimension}']
+        if self.scramble:
+            fmt_string += ['scramble=True']
+        if self.seed is not None:
+            fmt_string += [f'seed={self.seed}']
+        return self.__class__.__name__ + '(' + ', '.join(fmt_string) + ')'
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/random.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..75c48dab7213bf86ff7d9443d7af369e4932de68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/random.py
@@ -0,0 +1,175 @@
+import contextlib
+from typing import Generator
+import warnings
+
+from torch._C import default_generator
+import torch
+
+
+def set_rng_state(new_state: torch.Tensor) -> None:
+    r"""Sets the random number generator state.
+
+    .. note:: This function only works for CPU. For CUDA, please use
+              torch.manual_seed(seed), which works for both CPU and CUDA.
+
+    Args:
+        new_state (torch.ByteTensor): The desired state
+    """
+    default_generator.set_state(new_state)  # applied to the module-level ``default_generator`` only
+
+
+def get_rng_state() -> torch.Tensor:
+    r"""Returns the state of ``default_generator`` as a `torch.ByteTensor`."""
+    return default_generator.get_state()
+
+
+def manual_seed(seed) -> torch._C.Generator:
+    r"""Sets the seed for generating random numbers. Returns a
+    `torch.Generator` object.
+
+    Args:
+        seed (int): The desired seed. Value must be within the inclusive range
+            `[-0x8000_0000_0000_0000, 0xffff_ffff_ffff_ffff]`. Otherwise, a RuntimeError
+            is raised. Negative inputs are remapped to positive values with the formula
+            `0xffff_ffff_ffff_ffff + seed`.
+    """
+    seed = int(seed)  # coerce once so every backend below receives a plain int
+    import torch.cuda  # function-scope import - NOTE(review): presumably avoids an import cycle; confirm
+
+    if not torch.cuda._is_in_bad_fork():  # CUDA seeding is skipped when the helper reports a bad fork
+        torch.cuda.manual_seed_all(seed)
+
+    import torch.mps
+    if not torch.mps._is_in_bad_fork():
+        torch.mps.manual_seed(seed)
+
+    import torch.xpu
+    if not torch.xpu._is_in_bad_fork():
+        torch.xpu.manual_seed_all(seed)
+
+    _seed_custom_device(seed)  # custom (privateuse1) backend, if one is registered on ``torch``
+
+    return default_generator.manual_seed(seed)  # the default generator is seeded last and its result returned
+
+
+def seed() -> int:
+    r"""Sets the seed for generating random numbers to a non-deterministic
+    random number. Returns a 64 bit number used to seed the RNG.
+    """
+    seed = default_generator.seed()  # local shadows the function name; this value is reused for every backend below
+    import torch.cuda  # function-scope import - NOTE(review): presumably avoids an import cycle; confirm
+
+    if not torch.cuda._is_in_bad_fork():  # CUDA seeding is skipped when the helper reports a bad fork
+        torch.cuda.manual_seed_all(seed)
+
+    import torch.mps
+    if not torch.mps._is_in_bad_fork():
+        torch.mps.manual_seed(seed)
+
+    import torch.xpu
+    if not torch.xpu._is_in_bad_fork():
+        torch.xpu.manual_seed_all(seed)
+
+    _seed_custom_device(seed)  # custom (privateuse1) backend, if one is registered on ``torch``
+
+    return seed
+
+
+def _seed_custom_device(seed) -> None:
+    r"""Seeds the random number generators of the custom device, if any.
+
+    Args:
+        seed (int): The desired seed.
+
+    See [Note: support the custom device with privateuse1]
+    """
+    seed = int(seed)
+    backend = torch._C._get_privateuse1_backend_name()
+    if not hasattr(torch, backend):
+        return
+    device_mod = getattr(torch, backend)
+    bad_fork_attr, seed_all_attr = "_is_in_bad_fork", "manual_seed_all"
+    if hasattr(device_mod, bad_fork_attr) and hasattr(device_mod, seed_all_attr):
+        if not getattr(device_mod, bad_fork_attr)():
+            getattr(device_mod, seed_all_attr)(seed)
+        return
+    message = (f"Set seed for `{backend}` device does not take effect, please add API's "
+               f"`{bad_fork_attr}` and `{seed_all_attr}` to `{backend}` device module.")
+    warnings.warn(message, UserWarning, stacklevel=3)
+
+
+def initial_seed() -> int:
+    r"""Returns the initial seed for generating random numbers as a
+    Python `int`, as reported by ``default_generator.initial_seed()``.
+    """
+    return default_generator.initial_seed()
+
+
+_fork_rng_warned_already = False  # module-wide latch: the many-devices warning is emitted at most once
+
+
+@contextlib.contextmanager
+def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices", device_type="cuda") -> Generator:
+    """
+    Forks the RNG, so that when you return, the RNG is reset
+    to the state that it was previously in.
+
+    Args:
+        devices (iterable of Device IDs): devices for which to fork
+            the RNG. CPU RNG state is always forked. By default, :meth:`fork_rng` operates
+            on all devices, but will emit a warning if your machine has a lot
+            of devices, since this function will run very slowly in that case.
+            If you explicitly specify devices, this warning will be suppressed
+        enabled (bool): if ``False``, the RNG is not forked.  This is a convenience
+            argument for easily disabling the context manager without having
+            to delete it and unindent your Python code under it.
+        device_type (str): device type str, default is `cuda`. As for custom device,
+            see details in [Note: support the custom device with privateuse1]
+    """
+
+    device_type = torch.device(device_type).type
+    device_mod = getattr(torch, device_type, None)
+    if device_mod is None:
+        raise RuntimeError(f"torch has no module of `{device_type}`, you should register " +
+                           "a module by `torch._register_device_module`.")
+    global _fork_rng_warned_already
+
+    # Internal arguments:
+    #   _caller: the function which called fork_rng, which the user used
+    #   _devices_kw: the devices keyword of _caller
+
+    if not enabled:
+        yield
+        return
+
+    if devices is None:
+        num_devices = device_mod.device_count()
+        if num_devices > 1 and not _fork_rng_warned_already:
+            message = (f"{device_type.upper()} reports that you have {num_devices} available devices, and "
+                       f"you have used {_caller} without explicitly specifying which devices are being used. "
+                       f"For safety, we initialize *every* {device_type.upper()} device by default, which can "
+                       f"be quite slow if you have a lot of {device_type.upper()}s. If you know that you are only"
+                       f" making use of a few {device_type.upper()} devices, set the environment variable "
+                       f"{device_type.upper()}_VISIBLE_DEVICES or the '{_devices_kw}' keyword argument of {_caller} "
+                       "with the set of devices you are actually using. For example, if you are using CPU only, "
+                       f"set {device_type.upper()}_VISIBLE_DEVICES= or devices=[]; if you are using device 0 only, "
+                       f"set {device_type.upper()}_VISIBLE_DEVICES=0 or devices=[0].  To initialize all devices "
+                       f"and suppress this warning, set the '{_devices_kw}' keyword argument to "
+                       f"`range(torch.{device_type}.device_count())`.")
+            warnings.warn(message)
+            _fork_rng_warned_already = True
+        devices = list(range(num_devices))
+    else:
+        # Protect against user passing us a generator; we need to traverse this
+        # multiple times but a generator will be exhausted upon first traversal
+        devices = list(devices)
+
+    cpu_rng_state = torch.get_rng_state()
+    device_rng_states = [device_mod.get_rng_state(device) for device in devices]
+
+    try:
+        yield
+    finally:
+        torch.set_rng_state(cpu_rng_state)
+        for device, device_rng_state in zip(devices, device_rng_states):
+            device_mod.set_rng_state(device_rng_state, device)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aed7d735af2cc66c96224a149bdfbcc3cb86ca52
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b53b6b9d49d55864964b3395b16dde18bce16e76fdd3f1fb4a0b26048e29fd0
+size 123690
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton-2.3.0.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton-2.3.0.dist-info/METADATA
new file mode 100644
index 0000000000000000000000000000000000000000..3e4e2809775aad6f54e7c9ea2a4cb9b51fd78b27
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton-2.3.0.dist-info/METADATA
@@ -0,0 +1,35 @@
+Metadata-Version: 2.1
+Name: triton
+Version: 2.3.0
+Summary: A language and compiler for custom Deep Learning operations
+Home-page: https://github.com/openai/triton/
+Author: Philippe Tillet
+Author-email: phil@openai.com
+Keywords: Compiler,Deep Learning
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Build Tools
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: filelock
+Provides-Extra: build
+Requires-Dist: cmake >=3.20 ; extra == 'build'
+Requires-Dist: lit ; extra == 'build'
+Provides-Extra: tests
+Requires-Dist: autopep8 ; extra == 'tests'
+Requires-Dist: flake8 ; extra == 'tests'
+Requires-Dist: isort ; extra == 'tests'
+Requires-Dist: numpy ; extra == 'tests'
+Requires-Dist: pytest ; extra == 'tests'
+Requires-Dist: scipy >=1.7.1 ; extra == 'tests'
+Requires-Dist: torch ; extra == 'tests'
+Provides-Extra: tutorials
+Requires-Dist: matplotlib ; extra == 'tutorials'
+Requires-Dist: pandas ; extra == 'tutorials'
+Requires-Dist: tabulate ; extra == 'tutorials'
+Requires-Dist: torch ; extra == 'tutorials'
+