diff --git a/.gitattributes b/.gitattributes index 2b716bbd9280bea303a16943d1aad9687eefda20..9255f7e32eab613442ed1ba133efa16f7356cac9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -51,3 +51,14 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Plex/Trans tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ExprNodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/uts46data.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/rich/__pycache__/_emoji_codes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..0460cbae69bb4331de217febe2102ddc98276943 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Scanning.cpython-311-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35bbd7708e61d6b2d4704c7139018d3eae67bca303d9fa03228b50845f6fffe6 +size 340320 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de5990d5f865148dca19d1380505714f00d679ad --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Code.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03d22fd7cc8b4e378f65e07858c4720dcc03e0fa3553c776863e4969826cfd4 +size 145746 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b2569700f7d23448c43dd5ef7e1aa08aca06537 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c423f97f1ac36f06a8a2c6ff723696608c3e094001049a85ad421706ae558dea +size 522167 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60261f39b9770d7458b77b9729df9f271f7e8ee7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/ParseTreeTransforms.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3f1d1811e8f1f97f96bc002bc8705a4adb7a26f43def577bf24b25263f4b32 +size 213081 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72f3b8d343a587b579984deb2ebf7487e5af6cea --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/PyrexTypes.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465b72a0af764658a62bbb1d50e50b9a762ba16ddb1a6be0dd5b3b1f15c8a205 +size 254554 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b21fa6a35fad1e2aa4d277778fcc8dbdfa0643da --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/typing_extensions.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f505b823a26bd0da98ceb5e93ba4f79513f56cebf4f8cb1c8ed579dcdabaac32 +size 129942 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71ade6609aaccad8856ecff8899d967fe3ee91ee Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_unix.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_unix.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9cdf7e8ffe81efb2449ef9e3e232c6b16844915 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_unix.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..efaf5f8ceafaeab082f59ade8a16b3d560198153 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/RECORD @@ -0,0 +1,104 @@ +fsspec-2024.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +fsspec-2024.2.0.dist-info/LICENSE,sha256=LcNUls5TpzB5FcAIqESq1T53K0mzTN0ARFBnaRQH7JQ,1513 +fsspec-2024.2.0.dist-info/METADATA,sha256=uwzW1Braxnd_QGVI8W6J0KHi5KTiTJEm8YzSUdG-_Dc,6786 +fsspec-2024.2.0.dist-info/RECORD,, +fsspec-2024.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92 +fsspec-2024.2.0.dist-info/top_level.txt,sha256=blt2pDrQDwN3Gklcw13CSPLQRd6aaOgJ8AxqrW395MI,7 +fsspec/__init__.py,sha256=2kT62GfFK-AjgS-LgwSsCo_VA2IePvsyv8Ash5oiaFA,1982 +fsspec/__pycache__/__init__.cpython-311.pyc,, +fsspec/__pycache__/_version.cpython-311.pyc,, +fsspec/__pycache__/archive.cpython-311.pyc,, +fsspec/__pycache__/asyn.cpython-311.pyc,, +fsspec/__pycache__/caching.cpython-311.pyc,, +fsspec/__pycache__/callbacks.cpython-311.pyc,, +fsspec/__pycache__/compression.cpython-311.pyc,, +fsspec/__pycache__/config.cpython-311.pyc,, +fsspec/__pycache__/conftest.cpython-311.pyc,, +fsspec/__pycache__/core.cpython-311.pyc,, +fsspec/__pycache__/dircache.cpython-311.pyc,, +fsspec/__pycache__/exceptions.cpython-311.pyc,, +fsspec/__pycache__/fuse.cpython-311.pyc,, +fsspec/__pycache__/generic.cpython-311.pyc,, +fsspec/__pycache__/gui.cpython-311.pyc,, +fsspec/__pycache__/mapping.cpython-311.pyc,, +fsspec/__pycache__/parquet.cpython-311.pyc,, +fsspec/__pycache__/registry.cpython-311.pyc,, +fsspec/__pycache__/spec.cpython-311.pyc,, +fsspec/__pycache__/transaction.cpython-311.pyc,, +fsspec/__pycache__/utils.cpython-311.pyc,, +fsspec/_version.py,sha256=onTKKWe4fXkBjQxbTwM82SUT0H3x4U17IYrciFAryaU,500 +fsspec/archive.py,sha256=S__DzfZj-urAN3tp2W6jJ6YDiXG1fAl7FjvWUN73qIE,2386 +fsspec/asyn.py,sha256=kJ45sFFya2lZsmu2v8CVc8ZPRs8AccEzAy6Jot2ylkU,36157 +fsspec/caching.py,sha256=N45pzJdD4w5FOX_sxGvHWirggPNB66JTGP1HH6fpSck,28781 +fsspec/callbacks.py,sha256=BDIwLzK6rr_0V5ch557fSzsivCElpdqhXr5dZ9Te-EE,9210 +fsspec/compression.py,sha256=Yyd8FXw2rwWRtVoRVah_yguv-J7BUcBo4yDu6Qt52a0,4859 +fsspec/config.py,sha256=LF4Zmu1vhJW7Je9Q-cwkRc3xP7Rhyy7Xnwj26Z6sv2g,4279 +fsspec/conftest.py,sha256=fVfx-NLrH_OZS1TIpYNoPzM7efEcMoL62reHOdYeFCA,1245 +fsspec/core.py,sha256=0yCj1Z5MhbSDIQiqFs49VORl9QaGwV6hp9bXdkIoPIo,22363 +fsspec/dircache.py,sha256=YzogWJrhEastHU7vWz-cJiJ7sdtLXFXhEpInGKd4EcM,2717 +fsspec/exceptions.py,sha256=xcS7LiRrQ748kvOB9mrUR14kpjNztrHgEkZWi9M-VaI,330 +fsspec/fuse.py,sha256=66amOa6wdIbS0DMhhfAPUoOB37HPorfXD1izV0prmTY,10145 +fsspec/generic.py,sha256=NuNaP66OaphwMbuLHRFBLda78TD81isa9O4ozJqbUv0,13455 +fsspec/gui.py,sha256=XKoXZpUhRE7jOhRCJH4-jRbKhVu56aS8h9tecvPD3nc,13932 +fsspec/implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +fsspec/implementations/__pycache__/__init__.cpython-311.pyc,, +fsspec/implementations/__pycache__/arrow.cpython-311.pyc,, +fsspec/implementations/__pycache__/cache_mapper.cpython-311.pyc,, +fsspec/implementations/__pycache__/cache_metadata.cpython-311.pyc,, +fsspec/implementations/__pycache__/cached.cpython-311.pyc,, +fsspec/implementations/__pycache__/dask.cpython-311.pyc,, +fsspec/implementations/__pycache__/data.cpython-311.pyc,, +fsspec/implementations/__pycache__/dbfs.cpython-311.pyc,, +fsspec/implementations/__pycache__/dirfs.cpython-311.pyc,, +fsspec/implementations/__pycache__/ftp.cpython-311.pyc,, +fsspec/implementations/__pycache__/git.cpython-311.pyc,, +fsspec/implementations/__pycache__/github.cpython-311.pyc,, +fsspec/implementations/__pycache__/http.cpython-311.pyc,, +fsspec/implementations/__pycache__/jupyter.cpython-311.pyc,, +fsspec/implementations/__pycache__/libarchive.cpython-311.pyc,, +fsspec/implementations/__pycache__/local.cpython-311.pyc,, +fsspec/implementations/__pycache__/memory.cpython-311.pyc,, +fsspec/implementations/__pycache__/reference.cpython-311.pyc,, +fsspec/implementations/__pycache__/sftp.cpython-311.pyc,, +fsspec/implementations/__pycache__/smb.cpython-311.pyc,, +fsspec/implementations/__pycache__/tar.cpython-311.pyc,, +fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc,, +fsspec/implementations/__pycache__/zip.cpython-311.pyc,, +fsspec/implementations/arrow.py,sha256=_7TLuV6ZzNlpmUU_v6ud56u2wadzsKmY5qugPBxgMEs,8649 +fsspec/implementations/cache_mapper.py,sha256=iHgBA6gjzDJ7_mBboHFzpLTf55HP3UEwUOZ43xyUK4M,2429 +fsspec/implementations/cache_metadata.py,sha256=ZvyA7Y3KK-5Ct4E5pELzD6mH_5T03XqaKVT96qYDADU,8576 +fsspec/implementations/cached.py,sha256=LbbPbeUup07O0y7gXD_atFgajWM9p1vlDKu_BOyLfbo,30943 +fsspec/implementations/dask.py,sha256=CXZbJzIVOhKV8ILcxuy3bTvcacCueAbyQxmvAkbPkrk,4466 +fsspec/implementations/data.py,sha256=Oti0dKzyeadnVIedo3s8CADoh9bNM-96_6viTEYr4lo,1245 +fsspec/implementations/dbfs.py,sha256=cix9OYUveuSOx5UO5uRUwNUkYqjzyY0fkKnca1kTgZ0,15014 +fsspec/implementations/dirfs.py,sha256=inDIRSDPhI1_ud1MMBFrpZQ11VIAMJ_dZQtbE4V08Ng,11384 +fsspec/implementations/ftp.py,sha256=rp6cTog8xqjDPlKdSLKcsyP7K593_ByMabxGbNSEpTo,11655 +fsspec/implementations/git.py,sha256=vKGI-Vd5q4H2RrvhebkPc9NwlfkZ980OUGhebeCw-M0,4034 +fsspec/implementations/github.py,sha256=0kIiKkeAaROuHgdWBHVQFrzJ2ZfoDgymCehL_kJXHYA,7565 +fsspec/implementations/http.py,sha256=PkhfgUV3-T7fG2Jf-NLX9doH52snV5Wmw91uVA9k74M,29454 +fsspec/implementations/jupyter.py,sha256=B2uj7OEm7yIk-vRSsO37_ND0t0EBvn4B-Su43ibN4Pg,3811 +fsspec/implementations/libarchive.py,sha256=5_I2DiLXwQ1JC8x-K7jXu-tBwhO9dj7tFLnb0bTnVMQ,7102 +fsspec/implementations/local.py,sha256=nxiRKg9FAQHTQss9-ET8ZzDXPGhSOktgkxrg0ffMs2I,13454 +fsspec/implementations/memory.py,sha256=2iU--pOV2KCTrS-d5K8VKSygh9MPk2D7NZ_C8lMMEIw,9701 +fsspec/implementations/reference.py,sha256=0iGu8mscaQ3a5iTlRNByytQ3_-1Bj8__ARqVwyy4q2M,43871 +fsspec/implementations/sftp.py,sha256=fMY9XZcmpjszQ2tCqO_TPaJesaeD_Dv7ptYzgUPGoO0,5631 +fsspec/implementations/smb.py,sha256=k3RtzW97lJtYuw_QpP1rJRFnUBmSsw9twFjUCex0a5U,10591 +fsspec/implementations/tar.py,sha256=dam78Tp_CozybNqCY2JYgGBS3Uc9FuJUAT9oB0lolOs,4111 +fsspec/implementations/webhdfs.py,sha256=wqVfno7z0TY1HepaIvKTUUcl_bi5NkV6qWsST8t_s7Y,16745 +fsspec/implementations/zip.py,sha256=JDX-3HOI15qUl6VTBsNPuDp5RVN6s2n3Bywd4mMu0T0,4347 +fsspec/mapping.py,sha256=WFEXRWxujQwfzzkRP5tpdIE0265okAtlP97qFZGvV1k,8165 +fsspec/parquet.py,sha256=qVxDhwc960SGOt5etcYAJxCr-7HQKP01687KpDR02Gw,19463 +fsspec/registry.py,sha256=-dl7sh2tsfhMA2uxz5KQDsPFehQTgMJIbVjNq6QLoKU,11145 +fsspec/spec.py,sha256=3t96RgizRN_slIuHXnuR0bXjVUfBS1TfuDrEua4oQvE,66277 +fsspec/tests/abstract/__init__.py,sha256=i1wcFixV6QhOwdoB24c8oXjzobISNqiKVz9kl2DvAY8,10028 +fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc,, +fsspec/tests/abstract/__pycache__/common.cpython-311.pyc,, +fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc,, +fsspec/tests/abstract/__pycache__/get.cpython-311.pyc,, +fsspec/tests/abstract/__pycache__/put.cpython-311.pyc,, +fsspec/tests/abstract/common.py,sha256=1GQwNo5AONzAnzZj0fWgn8NJPLXALehbsuGxS3FzWVU,4973 +fsspec/tests/abstract/copy.py,sha256=gU5-d97U3RSde35Vp4RxPY4rWwL744HiSrJ8IBOp9-8,19967 +fsspec/tests/abstract/get.py,sha256=vNR4HztvTR7Cj56AMo7_tx7TeYz1Jgr_2Wb8Lv-UiBY,20755 +fsspec/tests/abstract/put.py,sha256=7aih17OKB_IZZh1Mkq1eBDIjobhtMQmI8x-Pw-S_aZk,21201 +fsspec/transaction.py,sha256=jeexB-H6Aw_gN6Z7hoKKe6v8zizITq39-gyTgpipIKE,2251 +fsspec/utils.py,sha256=_VX_0VwDtoAFSjMYrxvJvnPNX9FMoHO5BlFHXJ0bHFI,23053 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..98c0d20b7a64f4f998d7913e1d38a05dba20916c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.42.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/top_level.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..968fea66e533ba30593c7fbfe750c36fae2f3cfe --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec-2024.2.0.dist-info/top_level.txt @@ -0,0 +1 @@ +fsspec diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..a2be08170e5dea4db678c7a8f3dccb7d37e8f332 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d0c8228a395e1b7975c5d22cd5fe655e5a7b7024723a69164e0c9045aee847d +size 324168 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1411fcaefbf746364892ac2d72f6109b630472b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc6ec603b289fea3017e8bb0c8eb537328f368d775f0aee16f2837595da3258b +size 110499 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/__pycache__/ctx_mp.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/__pycache__/ctx_mp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..598fa60a0df31c902712c040cef5ecec8db1f727 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/__pycache__/ctx_mp.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1573114afc4fbce73f2ba9d2ddc99882c00027c0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__init__.py @@ -0,0 +1,77 @@ +from .libmpf import (prec_to_dps, dps_to_prec, repr_dps, + round_down, round_up, round_floor, round_ceiling, round_nearest, + to_pickable, from_pickable, ComplexResult, + fzero, fnzero, fone, fnone, ftwo, ften, fhalf, fnan, finf, fninf, + math_float_inf, round_int, normalize, normalize1, + from_man_exp, from_int, to_man_exp, to_int, mpf_ceil, mpf_floor, + mpf_nint, mpf_frac, + from_float, from_npfloat, from_Decimal, to_float, from_rational, to_rational, to_fixed, + mpf_rand, mpf_eq, mpf_hash, mpf_cmp, mpf_lt, mpf_le, mpf_gt, mpf_ge, + mpf_pos, mpf_neg, mpf_abs, mpf_sign, mpf_add, mpf_sub, mpf_sum, + mpf_mul, mpf_mul_int, mpf_shift, mpf_frexp, + mpf_div, mpf_rdiv_int, mpf_mod, mpf_pow_int, + mpf_perturb, + to_digits_exp, to_str, str_to_man_exp, from_str, from_bstr, to_bstr, + mpf_sqrt, mpf_hypot) + +from .libmpc import (mpc_one, mpc_zero, mpc_two, mpc_half, + mpc_is_inf, mpc_is_infnan, mpc_to_str, mpc_to_complex, mpc_hash, + mpc_conjugate, mpc_is_nonzero, mpc_add, mpc_add_mpf, + mpc_sub, mpc_sub_mpf, mpc_pos, mpc_neg, mpc_shift, mpc_abs, + mpc_arg, mpc_floor, mpc_ceil, mpc_nint, mpc_frac, mpc_mul, mpc_square, + mpc_mul_mpf, mpc_mul_imag_mpf, mpc_mul_int, + mpc_div, mpc_div_mpf, mpc_reciprocal, mpc_mpf_div, + complex_int_pow, mpc_pow, mpc_pow_mpf, mpc_pow_int, + mpc_sqrt, mpc_nthroot, mpc_cbrt, mpc_exp, mpc_log, mpc_cos, mpc_sin, + mpc_tan, mpc_cos_pi, mpc_sin_pi, mpc_cosh, mpc_sinh, mpc_tanh, + mpc_atan, mpc_acos, mpc_asin, mpc_asinh, mpc_acosh, mpc_atanh, + mpc_fibonacci, mpf_expj, mpf_expjpi, mpc_expj, mpc_expjpi, + mpc_cos_sin, mpc_cos_sin_pi) + +from .libelefun import (ln2_fixed, mpf_ln2, ln10_fixed, mpf_ln10, + pi_fixed, mpf_pi, e_fixed, mpf_e, phi_fixed, mpf_phi, + degree_fixed, mpf_degree, + mpf_pow, mpf_nthroot, mpf_cbrt, log_int_fixed, agm_fixed, + mpf_log, mpf_log_hypot, mpf_exp, mpf_cos_sin, mpf_cos, mpf_sin, mpf_tan, + mpf_cos_sin_pi, mpf_cos_pi, mpf_sin_pi, mpf_cosh_sinh, + mpf_cosh, mpf_sinh, mpf_tanh, mpf_atan, mpf_atan2, mpf_asin, + mpf_acos, mpf_asinh, mpf_acosh, mpf_atanh, mpf_fibonacci) + +from .libhyper import (NoConvergence, make_hyp_summator, + mpf_erf, mpf_erfc, mpf_ei, mpc_ei, mpf_e1, mpc_e1, mpf_expint, + mpf_ci_si, mpf_ci, mpf_si, mpc_ci, mpc_si, mpf_besseljn, + mpc_besseljn, mpf_agm, mpf_agm1, mpc_agm, mpc_agm1, + mpf_ellipk, mpc_ellipk, mpf_ellipe, mpc_ellipe) + +from .gammazeta import (catalan_fixed, mpf_catalan, + khinchin_fixed, mpf_khinchin, glaisher_fixed, mpf_glaisher, + apery_fixed, mpf_apery, euler_fixed, mpf_euler, mertens_fixed, + mpf_mertens, twinprime_fixed, mpf_twinprime, + mpf_bernoulli, bernfrac, mpf_gamma_int, + mpf_factorial, mpc_factorial, mpf_gamma, mpc_gamma, + mpf_loggamma, mpc_loggamma, mpf_rgamma, mpc_rgamma, + mpf_harmonic, mpc_harmonic, mpf_psi0, mpc_psi0, + mpf_psi, mpc_psi, mpf_zeta_int, mpf_zeta, mpc_zeta, + mpf_altzeta, mpc_altzeta, mpf_zetasum, mpc_zetasum) + +from .libmpi import (mpi_str, + mpi_from_str, mpi_to_str, + mpi_eq, mpi_ne, + mpi_lt, mpi_le, mpi_gt, mpi_ge, + mpi_add, mpi_sub, mpi_delta, mpi_mid, + mpi_pos, mpi_neg, mpi_abs, mpi_mul, mpi_div, mpi_exp, + mpi_log, mpi_sqrt, mpi_pow_int, mpi_pow, mpi_cos_sin, + mpi_cos, mpi_sin, mpi_tan, mpi_cot, + mpi_atan, mpi_atan2, + mpci_pos, mpci_neg, mpci_add, mpci_sub, mpci_mul, mpci_div, mpci_pow, + mpci_abs, mpci_pow, mpci_exp, mpci_log, mpci_cos, mpci_sin, + mpi_gamma, mpci_gamma, mpi_loggamma, mpci_loggamma, + mpi_rgamma, mpci_rgamma, mpi_factorial, mpci_factorial) + +from .libintmath import (trailing, bitcount, numeral, bin_to_radix, + isqrt, isqrt_small, isqrt_fast, sqrt_fixed, sqrtrem, ifib, ifac, + list_primes, isprime, moebius, gcd, eulernum, stirling1, stirling2) + +from .backend import (gmpy, sage, BACKEND, STRICT, MPZ, MPZ_TYPE, + MPZ_ZERO, MPZ_ONE, MPZ_TWO, MPZ_THREE, MPZ_FIVE, int_types, + HASH_MODULUS, HASH_BITS) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libelefun.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libelefun.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b237a35594f26f29abf244b8aa8bad49ae13dc2 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libelefun.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libhyper.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libhyper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0f53d6b0c7006eb4a5fcf8f1f7cb6b5b5c214f8 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libhyper.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libmpc.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libmpc.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..904ba5a435ee661e1f2ecc6f769d3d80131f1bb1 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/__pycache__/libmpc.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libelefun.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libelefun.py new file mode 100644 index 0000000000000000000000000000000000000000..3de2e5aaef02296ed03dec3df3021b56823f3728 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libelefun.py @@ -0,0 +1,1428 @@ +""" +This module implements computation of elementary transcendental +functions (powers, logarithms, trigonometric and hyperbolic +functions, inverse trigonometric and hyperbolic) for real +floating-point numbers. + +For complex and interval implementations of the same functions, +see libmpc and libmpi. + +""" + +import math +from bisect import bisect + +from .backend import xrange +from .backend import MPZ, MPZ_ZERO, MPZ_ONE, MPZ_TWO, MPZ_FIVE, BACKEND + +from .libmpf import ( + round_floor, round_ceiling, round_down, round_up, + round_nearest, round_fast, + ComplexResult, + bitcount, bctable, lshift, rshift, giant_steps, sqrt_fixed, + from_int, to_int, from_man_exp, to_fixed, to_float, from_float, + from_rational, normalize, + fzero, fone, fnone, fhalf, finf, fninf, fnan, + mpf_cmp, mpf_sign, mpf_abs, + mpf_pos, mpf_neg, mpf_add, mpf_sub, mpf_mul, mpf_div, mpf_shift, + mpf_rdiv_int, mpf_pow_int, mpf_sqrt, + reciprocal_rnd, negative_rnd, mpf_perturb, + isqrt_fast +) + +from .libintmath import ifib + + +#------------------------------------------------------------------------------- +# Tuning parameters +#------------------------------------------------------------------------------- + +# Cutoff for computing exp from cosh+sinh. This reduces the +# number of terms by half, but also requires a square root which +# is expensive with the pure-Python square root code. +if BACKEND == 'python': + EXP_COSH_CUTOFF = 600 +else: + EXP_COSH_CUTOFF = 400 +# Cutoff for using more than 2 series +EXP_SERIES_U_CUTOFF = 1500 + +# Also basically determined by sqrt +if BACKEND == 'python': + COS_SIN_CACHE_PREC = 400 +else: + COS_SIN_CACHE_PREC = 200 +COS_SIN_CACHE_STEP = 8 +cos_sin_cache = {} + +# Number of integer logarithms to cache (for zeta sums) +MAX_LOG_INT_CACHE = 2000 +log_int_cache = {} + +LOG_TAYLOR_PREC = 2500 # Use Taylor series with caching up to this prec +LOG_TAYLOR_SHIFT = 9 # Cache log values in steps of size 2^-N +log_taylor_cache = {} +# prec/size ratio of x for fastest convergence in AGM formula +LOG_AGM_MAG_PREC_RATIO = 20 + +ATAN_TAYLOR_PREC = 3000 # Same as for log +ATAN_TAYLOR_SHIFT = 7 # steps of size 2^-N +atan_taylor_cache = {} + + +# ~= next power of two + 20 +cache_prec_steps = [22,22] +for k in xrange(1, bitcount(LOG_TAYLOR_PREC)+1): + cache_prec_steps += [min(2**k,LOG_TAYLOR_PREC)+20] * 2**(k-1) + + +#----------------------------------------------------------------------------# +# # +# Elementary mathematical constants # +# # +#----------------------------------------------------------------------------# + +def constant_memo(f): + """ + Decorator for caching computed values of mathematical + constants. This decorator should be applied to a + function taking a single argument prec as input and + returning a fixed-point value with the given precision. + """ + f.memo_prec = -1 + f.memo_val = None + def g(prec, **kwargs): + memo_prec = f.memo_prec + if prec <= memo_prec: + return f.memo_val >> (memo_prec-prec) + newprec = int(prec*1.05+10) + f.memo_val = f(newprec, **kwargs) + f.memo_prec = newprec + return f.memo_val >> (newprec-prec) + g.__name__ = f.__name__ + g.__doc__ = f.__doc__ + return g + +def def_mpf_constant(fixed): + """ + Create a function that computes the mpf value for a mathematical + constant, given a function that computes the fixed-point value. + + Assumptions: the constant is positive and has magnitude ~= 1; + the fixed-point function rounds to floor. + """ + def f(prec, rnd=round_fast): + wp = prec + 20 + v = fixed(wp) + if rnd in (round_up, round_ceiling): + v += 1 + return normalize(0, v, -wp, bitcount(v), prec, rnd) + f.__doc__ = fixed.__doc__ + return f + +def bsp_acot(q, a, b, hyperbolic): + if b - a == 1: + a1 = MPZ(2*a + 3) + if hyperbolic or a&1: + return MPZ_ONE, a1 * q**2, a1 + else: + return -MPZ_ONE, a1 * q**2, a1 + m = (a+b)//2 + p1, q1, r1 = bsp_acot(q, a, m, hyperbolic) + p2, q2, r2 = bsp_acot(q, m, b, hyperbolic) + return q2*p1 + r1*p2, q1*q2, r1*r2 + +# the acoth(x) series converges like the geometric series for x^2 +# N = ceil(p*log(2)/(2*log(x))) +def acot_fixed(a, prec, hyperbolic): + """ + Compute acot(a) or acoth(a) for an integer a with binary splitting; see + http://numbers.computation.free.fr/Constants/Algorithms/splitting.html + """ + N = int(0.35 * prec/math.log(a) + 20) + p, q, r = bsp_acot(a, 0,N, hyperbolic) + return ((p+q)<> extraprec) + +# Logarithms of integers are needed for various computations involving +# logarithms, powers, radix conversion, etc + +@constant_memo +def ln2_fixed(prec): + """ + Computes ln(2). This is done with a hyperbolic Machin-type formula, + with binary splitting at high precision. + """ + return machin([(18, 26), (-2, 4801), (8, 8749)], prec, True) + +@constant_memo +def ln10_fixed(prec): + """ + Computes ln(10). This is done with a hyperbolic Machin-type formula. + """ + return machin([(46, 31), (34, 49), (20, 161)], prec, True) + + +r""" +For computation of pi, we use the Chudnovsky series: + + oo + ___ k + 1 \ (-1) (6 k)! (A + B k) + ----- = ) ----------------------- + 12 pi /___ 3 3k+3/2 + (3 k)! (k!) C + k = 0 + +where A, B, and C are certain integer constants. This series adds roughly +14 digits per term. Note that C^(3/2) can be extracted so that the +series contains only rational terms. This makes binary splitting very +efficient. + +The recurrence formulas for the binary splitting were taken from +ftp://ftp.gmplib.org/pub/src/gmp-chudnovsky.c + +Previously, Machin's formula was used at low precision and the AGM iteration +was used at high precision. However, the Chudnovsky series is essentially as +fast as the Machin formula at low precision and in practice about 3x faster +than the AGM at high precision (despite theoretically having a worse +asymptotic complexity), so there is no reason not to use it in all cases. + +""" + +# Constants in Chudnovsky's series +CHUD_A = MPZ(13591409) +CHUD_B = MPZ(545140134) +CHUD_C = MPZ(640320) +CHUD_D = MPZ(12) + +def bs_chudnovsky(a, b, level, verbose): + """ + Computes the sum from a to b of the series in the Chudnovsky + formula. Returns g, p, q where p/q is the sum as an exact + fraction and g is a temporary value used to save work + for recursive calls. + """ + if b-a == 1: + g = MPZ((6*b-5)*(2*b-1)*(6*b-1)) + p = b**3 * CHUD_C**3 // 24 + q = (-1)**b * g * (CHUD_A+CHUD_B*b) + else: + if verbose and level < 4: + print(" binary splitting", a, b) + mid = (a+b)//2 + g1, p1, q1 = bs_chudnovsky(a, mid, level+1, verbose) + g2, p2, q2 = bs_chudnovsky(mid, b, level+1, verbose) + p = p1*p2 + g = g1*g2 + q = q1*p2 + q2*g1 + return g, p, q + +@constant_memo +def pi_fixed(prec, verbose=False, verbose_base=None): + """ + Compute floor(pi * 2**prec) as a big integer. + + This is done using Chudnovsky's series (see comments in + libelefun.py for details). + """ + # The Chudnovsky series gives 14.18 digits per term + N = int(prec/3.3219280948/14.181647462 + 2) + if verbose: + print("binary splitting with N =", N) + g, p, q = bs_chudnovsky(0, N, 0, verbose) + sqrtC = isqrt_fast(CHUD_C<<(2*prec)) + v = p*CHUD_C*sqrtC//((q+CHUD_A*p)*CHUD_D) + return v + +def degree_fixed(prec): + return pi_fixed(prec)//180 + +def bspe(a, b): + """ + Sum series for exp(1)-1 between a, b, returning the result + as an exact fraction (p, q). + """ + if b-a == 1: + return MPZ_ONE, MPZ(b) + m = (a+b)//2 + p1, q1 = bspe(a, m) + p2, q2 = bspe(m, b) + return p1*q2+p2, q1*q2 + +@constant_memo +def e_fixed(prec): + """ + Computes exp(1). This is done using the ordinary Taylor series for + exp, with binary splitting. For a description of the algorithm, + see: + + http://numbers.computation.free.fr/Constants/ + Algorithms/splitting.html + """ + # Slight overestimate of N needed for 1/N! < 2**(-prec) + # This could be tightened for large N. + N = int(1.1*prec/math.log(prec) + 20) + p, q = bspe(0,N) + return ((p+q)<> 11 + +mpf_phi = def_mpf_constant(phi_fixed) +mpf_pi = def_mpf_constant(pi_fixed) +mpf_e = def_mpf_constant(e_fixed) +mpf_degree = def_mpf_constant(degree_fixed) +mpf_ln2 = def_mpf_constant(ln2_fixed) +mpf_ln10 = def_mpf_constant(ln10_fixed) + + +@constant_memo +def ln_sqrt2pi_fixed(prec): + wp = prec + 10 + # ln(sqrt(2*pi)) = ln(2*pi)/2 + return to_fixed(mpf_log(mpf_shift(mpf_pi(wp), 1), wp), prec-1) + +@constant_memo +def sqrtpi_fixed(prec): + return sqrt_fixed(pi_fixed(prec), prec) + +mpf_sqrtpi = def_mpf_constant(sqrtpi_fixed) +mpf_ln_sqrt2pi = def_mpf_constant(ln_sqrt2pi_fixed) + + +#----------------------------------------------------------------------------# +# # +# Powers # +# # +#----------------------------------------------------------------------------# + +def mpf_pow(s, t, prec, rnd=round_fast): + """ + Compute s**t. Raises ComplexResult if s is negative and t is + fractional. + """ + ssign, sman, sexp, sbc = s + tsign, tman, texp, tbc = t + if ssign and texp < 0: + raise ComplexResult("negative number raised to a fractional power") + if texp >= 0: + return mpf_pow_int(s, (-1)**tsign * (tman<> pbc)] + if pbc > workprec: + pm = pm >> (pbc-workprec) + pe += pbc - workprec + pbc = workprec + n -= 1 + if not n: + break + y = y*y + exp = exp+exp + bc = bc + bc - 2 + bc = bc + bctable[int(y >> bc)] + if bc > workprec: + y = y >> (bc-workprec) + exp += bc - workprec + bc = workprec + n = n // 2 + return pm, pe + +# froot(s, n, prec, rnd) computes the real n-th root of a +# positive mpf tuple s. +# To compute the root we start from a 50-bit estimate for r +# generated with ordinary floating-point arithmetic, and then refine +# the value to full accuracy using the iteration + +# 1 / y \ +# r = --- | (n-1) * r + ---------- | +# n+1 n \ n r_n**(n-1) / + +# which is simply Newton's method applied to the equation r**n = y. +# With giant_steps(start, prec+extra) = [p0,...,pm, prec+extra] +# and y = man * 2**-shift one has +# (man * 2**exp)**(1/n) = +# y**(1/n) * 2**(start-prec/n) * 2**(p0-start) * ... * 2**(prec+extra-pm) * +# 2**((exp+shift-(n-1)*prec)/n -extra)) +# The last factor is accounted for in the last line of froot. + +def nthroot_fixed(y, n, prec, exp1): + start = 50 + try: + y1 = rshift(y, prec - n*start) + r = MPZ(int(y1**(1.0/n))) + except OverflowError: + y1 = from_int(y1, start) + fn = from_int(n) + fn = mpf_rdiv_int(1, fn, start) + r = mpf_pow(y1, fn, start) + r = to_int(r) + extra = 10 + extra1 = n + prevp = start + for p in giant_steps(start, prec+extra): + pm, pe = int_pow_fixed(r, n-1, prevp) + r2 = rshift(pm, (n-1)*prevp - p - pe - extra1) + B = lshift(y, 2*p-prec+extra1)//r2 + r = (B + (n-1) * lshift(r, p-prevp))//n + prevp = p + return r + +def mpf_nthroot(s, n, prec, rnd=round_fast): + """nth-root of a positive number + + Use the Newton method when faster, otherwise use x**(1/n) + """ + sign, man, exp, bc = s + if sign: + raise ComplexResult("nth root of a negative number") + if not man: + if s == fnan: + return fnan + if s == fzero: + if n > 0: + return fzero + if n == 0: + return fone + return finf + # Infinity + if not n: + return fnan + if n < 0: + return fzero + return finf + flag_inverse = False + if n < 2: + if n == 0: + return fone + if n == 1: + return mpf_pos(s, prec, rnd) + if n == -1: + return mpf_div(fone, s, prec, rnd) + # n < 0 + rnd = reciprocal_rnd[rnd] + flag_inverse = True + extra_inverse = 5 + prec += extra_inverse + n = -n + if n > 20 and (n >= 20000 or prec < int(233 + 28.3 * n**0.62)): + prec2 = prec + 10 + fn = from_int(n) + nth = mpf_rdiv_int(1, fn, prec2) + r = mpf_pow(s, nth, prec2, rnd) + s = normalize(r[0], r[1], r[2], r[3], prec, rnd) + if flag_inverse: + return mpf_div(fone, s, prec-extra_inverse, rnd) + else: + return s + # Convert to a fixed-point number with prec2 bits. + prec2 = prec + 2*n - (prec%n) + # a few tests indicate that + # for 10 < n < 10**4 a bit more precision is needed + if n > 10: + prec2 += prec2//10 + prec2 = prec2 - prec2%n + # Mantissa may have more bits than we need. Trim it down. + shift = bc - prec2 + # Adjust exponents to make prec2 and exp+shift multiples of n. + sign1 = 0 + es = exp+shift + if es < 0: + sign1 = 1 + es = -es + if sign1: + shift += es%n + else: + shift -= es%n + man = rshift(man, shift) + extra = 10 + exp1 = ((exp+shift-(n-1)*prec2)//n) - extra + rnd_shift = 0 + if flag_inverse: + if rnd == 'u' or rnd == 'c': + rnd_shift = 1 + else: + if rnd == 'd' or rnd == 'f': + rnd_shift = 1 + man = nthroot_fixed(man+rnd_shift, n, prec2, exp1) + s = from_man_exp(man, exp1, prec, rnd) + if flag_inverse: + return mpf_div(fone, s, prec-extra_inverse, rnd) + else: + return s + +def mpf_cbrt(s, prec, rnd=round_fast): + """cubic root of a positive number""" + return mpf_nthroot(s, 3, prec, rnd) + +#----------------------------------------------------------------------------# +# # +# Logarithms # +# # +#----------------------------------------------------------------------------# + + +def log_int_fixed(n, prec, ln2=None): + """ + Fast computation of log(n), caching the value for small n, + intended for zeta sums. + """ + if n in log_int_cache: + value, vprec = log_int_cache[n] + if vprec >= prec: + return value >> (vprec - prec) + wp = prec + 10 + if wp <= LOG_TAYLOR_SHIFT: + if ln2 is None: + ln2 = ln2_fixed(wp) + r = bitcount(n) + x = n << (wp-r) + v = log_taylor_cached(x, wp) + r*ln2 + else: + v = to_fixed(mpf_log(from_int(n), wp+5), wp) + if n < MAX_LOG_INT_CACHE: + log_int_cache[n] = (v, wp) + return v >> (wp-prec) + +def agm_fixed(a, b, prec): + """ + Fixed-point computation of agm(a,b), assuming + a, b both close to unit magnitude. + """ + i = 0 + while 1: + anew = (a+b)>>1 + if i > 4 and abs(a-anew) < 8: + return a + b = isqrt_fast(a*b) + a = anew + i += 1 + return a + +def log_agm(x, prec): + """ + Fixed-point computation of -log(x) = log(1/x), suitable + for large precision. It is required that 0 < x < 1. The + algorithm used is the Sasaki-Kanada formula + + -log(x) = pi/agm(theta2(x)^2,theta3(x)^2). [1] + + For faster convergence in the theta functions, x should + be chosen closer to 0. + + Guard bits must be added by the caller. + + HYPOTHESIS: if x = 2^(-n), n bits need to be added to + account for the truncation to a fixed-point number, + and this is the only significant cancellation error. + + The number of bits lost to roundoff is small and can be + considered constant. + + [1] Richard P. Brent, "Fast Algorithms for High-Precision + Computation of Elementary Functions (extended abstract)", + http://wwwmaths.anu.edu.au/~brent/pd/RNC7-Brent.pdf + + """ + x2 = (x*x) >> prec + # Compute jtheta2(x)**2 + s = a = b = x2 + while a: + b = (b*x2) >> prec + a = (a*b) >> prec + s += a + s += (MPZ_ONE<>(prec-2) + s = (s*isqrt_fast(x<>prec + # Compute jtheta3(x)**2 + t = a = b = x + while a: + b = (b*x2) >> prec + a = (a*b) >> prec + t += a + t = (MPZ_ONE<>prec + # Final formula + p = agm_fixed(s, t, prec) + return (pi_fixed(prec) << prec) // p + +def log_taylor(x, prec, r=0): + """ + Fixed-point calculation of log(x). It is assumed that x is close + enough to 1 for the Taylor series to converge quickly. Convergence + can be improved by specifying r > 0 to compute + log(x^(1/2^r))*2^r, at the cost of performing r square roots. + + The caller must provide sufficient guard bits. + """ + for i in xrange(r): + x = isqrt_fast(x<> prec + v4 = (v2*v2) >> prec + s0 = v + s1 = v//3 + v = (v*v4) >> prec + k = 5 + while v: + s0 += v // k + k += 2 + s1 += v // k + v = (v*v4) >> prec + k += 2 + s1 = (s1*v2) >> prec + s = (s0+s1) << (1+r) + if sign: + return -s + return s + +def log_taylor_cached(x, prec): + """ + Fixed-point computation of log(x), assuming x in (0.5, 2) + and prec <= LOG_TAYLOR_PREC. + """ + n = x >> (prec-LOG_TAYLOR_SHIFT) + cached_prec = cache_prec_steps[prec] + dprec = cached_prec - prec + if (n, cached_prec) in log_taylor_cache: + a, log_a = log_taylor_cache[n, cached_prec] + else: + a = n << (cached_prec - LOG_TAYLOR_SHIFT) + log_a = log_taylor(a, cached_prec, 8) + log_taylor_cache[n, cached_prec] = (a, log_a) + a >>= dprec + log_a >>= dprec + u = ((x - a) << prec) // a + v = (u << prec) // ((MPZ_TWO << prec) + u) + v2 = (v*v) >> prec + v4 = (v2*v2) >> prec + s0 = v + s1 = v//3 + v = (v*v4) >> prec + k = 5 + while v: + s0 += v//k + k += 2 + s1 += v//k + v = (v*v4) >> prec + k += 2 + s1 = (s1*v2) >> prec + s = (s0+s1) << 1 + return log_a + s + +def mpf_log(x, prec, rnd=round_fast): + """ + Compute the natural logarithm of the mpf value x. If x is negative, + ComplexResult is raised. + """ + sign, man, exp, bc = x + #------------------------------------------------------------------ + # Handle special values + if not man: + if x == fzero: return fninf + if x == finf: return finf + if x == fnan: return fnan + if sign: + raise ComplexResult("logarithm of a negative number") + wp = prec + 20 + #------------------------------------------------------------------ + # Handle log(2^n) = log(n)*2. + # Here we catch the only possible exact value, log(1) = 0 + if man == 1: + if not exp: + return fzero + return from_man_exp(exp*ln2_fixed(wp), -wp, prec, rnd) + mag = exp+bc + abs_mag = abs(mag) + #------------------------------------------------------------------ + # Handle x = 1+eps, where log(x) ~ x. We need to check for + # cancellation when moving to fixed-point math and compensate + # by increasing the precision. Note that abs_mag in (0, 1) <=> + # 0.5 < x < 2 and x != 1 + if abs_mag <= 1: + # Calculate t = x-1 to measure distance from 1 in bits + tsign = 1-abs_mag + if tsign: + tman = (MPZ_ONE< wp: + t = normalize(tsign, tman, abs_mag-bc, tbc, tbc, 'n') + return mpf_perturb(t, tsign, prec, rnd) + else: + wp += cancellation + # TODO: if close enough to 1, we could use Taylor series + # even in the AGM precision range, since the Taylor series + # converges rapidly + #------------------------------------------------------------------ + # Another special case: + # n*log(2) is a good enough approximation + if abs_mag > 10000: + if bitcount(abs_mag) > wp: + return from_man_exp(exp*ln2_fixed(wp), -wp, prec, rnd) + #------------------------------------------------------------------ + # General case. + # Perform argument reduction using log(x) = log(x*2^n) - n*log(2): + # If we are in the Taylor precision range, choose magnitude 0 or 1. + # If we are in the AGM precision range, choose magnitude -m for + # some large m; benchmarking on one machine showed m = prec/20 to be + # optimal between 1000 and 100,000 digits. + if wp <= LOG_TAYLOR_PREC: + m = log_taylor_cached(lshift(man, wp-bc), wp) + if mag: + m += mag*ln2_fixed(wp) + else: + optimal_mag = -wp//LOG_AGM_MAG_PREC_RATIO + n = optimal_mag - mag + x = mpf_shift(x, n) + wp += (-optimal_mag) + m = -log_agm(to_fixed(x, wp), wp) + m -= n*ln2_fixed(wp) + return from_man_exp(m, -wp, prec, rnd) + +def mpf_log_hypot(a, b, prec, rnd): + """ + Computes log(sqrt(a^2+b^2)) accurately. + """ + # If either a or b is inf/nan/0, assume it to be a + if not b[1]: + a, b = b, a + # a is inf/nan/0 + if not a[1]: + # both are inf/nan/0 + if not b[1]: + if a == b == fzero: + return fninf + if fnan in (a, b): + return fnan + # at least one term is (+/- inf)^2 + return finf + # only a is inf/nan/0 + if a == fzero: + # log(sqrt(0+b^2)) = log(|b|) + return mpf_log(mpf_abs(b), prec, rnd) + if a == fnan: + return fnan + return finf + # Exact + a2 = mpf_mul(a,a) + b2 = mpf_mul(b,b) + extra = 20 + # Not exact + h2 = mpf_add(a2, b2, prec+extra) + cancelled = mpf_add(h2, fnone, 10) + mag_cancelled = cancelled[2]+cancelled[3] + # Just redo the sum exactly if necessary (could be smarter + # and avoid memory allocation when a or b is precisely 1 + # and the other is tiny...) + if cancelled == fzero or mag_cancelled < -extra//2: + h2 = mpf_add(a2, b2, prec+extra-min(a2[2],b2[2])) + return mpf_shift(mpf_log(h2, prec, rnd), -1) + + +#---------------------------------------------------------------------- +# Inverse tangent +# + +def atan_newton(x, prec): + if prec >= 100: + r = math.atan(int((x>>(prec-53)))/2.0**53) + else: + r = math.atan(int(x)/2.0**prec) + prevp = 50 + r = MPZ(int(r * 2.0**53) >> (53-prevp)) + extra_p = 50 + for wp in giant_steps(prevp, prec): + wp += extra_p + r = r << (wp-prevp) + cos, sin = cos_sin_fixed(r, wp) + tan = (sin << wp) // cos + a = ((tan-rshift(x, prec-wp)) << wp) // ((MPZ_ONE<>wp)) + r = r - a + prevp = wp + return rshift(r, prevp-prec) + +def atan_taylor_get_cached(n, prec): + # Taylor series with caching wins up to huge precisions + # To avoid unnecessary precomputation at low precision, we + # do it in steps + # Round to next power of 2 + prec2 = (1<<(bitcount(prec-1))) + 20 + dprec = prec2 - prec + if (n, prec2) in atan_taylor_cache: + a, atan_a = atan_taylor_cache[n, prec2] + else: + a = n << (prec2 - ATAN_TAYLOR_SHIFT) + atan_a = atan_newton(a, prec2) + atan_taylor_cache[n, prec2] = (a, atan_a) + return (a >> dprec), (atan_a >> dprec) + +def atan_taylor(x, prec): + n = (x >> (prec-ATAN_TAYLOR_SHIFT)) + a, atan_a = atan_taylor_get_cached(n, prec) + d = x - a + s0 = v = (d << prec) // ((a**2 >> prec) + (a*d >> prec) + (MPZ_ONE << prec)) + v2 = (v**2 >> prec) + v4 = (v2 * v2) >> prec + s1 = v//3 + v = (v * v4) >> prec + k = 5 + while v: + s0 += v // k + k += 2 + s1 += v // k + v = (v * v4) >> prec + k += 2 + s1 = (s1 * v2) >> prec + s = s0 - s1 + return atan_a + s + +def atan_inf(sign, prec, rnd): + if not sign: + return mpf_shift(mpf_pi(prec, rnd), -1) + return mpf_neg(mpf_shift(mpf_pi(prec, negative_rnd[rnd]), -1)) + +def mpf_atan(x, prec, rnd=round_fast): + sign, man, exp, bc = x + if not man: + if x == fzero: return fzero + if x == finf: return atan_inf(0, prec, rnd) + if x == fninf: return atan_inf(1, prec, rnd) + return fnan + mag = exp + bc + # Essentially infinity + if mag > prec+20: + return atan_inf(sign, prec, rnd) + # Essentially ~ x + if -mag > prec+20: + return mpf_perturb(x, 1-sign, prec, rnd) + wp = prec + 30 + abs(mag) + # For large x, use atan(x) = pi/2 - atan(1/x) + if mag >= 2: + x = mpf_rdiv_int(1, x, wp) + reciprocal = True + else: + reciprocal = False + t = to_fixed(x, wp) + if sign: + t = -t + if wp < ATAN_TAYLOR_PREC: + a = atan_taylor(t, wp) + else: + a = atan_newton(t, wp) + if reciprocal: + a = ((pi_fixed(wp)>>1)+1) - a + if sign: + a = -a + return from_man_exp(a, -wp, prec, rnd) + +# TODO: cleanup the special cases +def mpf_atan2(y, x, prec, rnd=round_fast): + xsign, xman, xexp, xbc = x + ysign, yman, yexp, ybc = y + if not yman: + if y == fzero and x != fnan: + if mpf_sign(x) >= 0: + return fzero + return mpf_pi(prec, rnd) + if y in (finf, fninf): + if x in (finf, fninf): + return fnan + # pi/2 + if y == finf: + return mpf_shift(mpf_pi(prec, rnd), -1) + # -pi/2 + return mpf_neg(mpf_shift(mpf_pi(prec, negative_rnd[rnd]), -1)) + return fnan + if ysign: + return mpf_neg(mpf_atan2(mpf_neg(y), x, prec, negative_rnd[rnd])) + if not xman: + if x == fnan: + return fnan + if x == finf: + return fzero + if x == fninf: + return mpf_pi(prec, rnd) + if y == fzero: + return fzero + return mpf_shift(mpf_pi(prec, rnd), -1) + tquo = mpf_atan(mpf_div(y, x, prec+4), prec+4) + if xsign: + return mpf_add(mpf_pi(prec+4), tquo, prec, rnd) + else: + return mpf_pos(tquo, prec, rnd) + +def mpf_asin(x, prec, rnd=round_fast): + sign, man, exp, bc = x + if bc+exp > 0 and x not in (fone, fnone): + raise ComplexResult("asin(x) is real only for -1 <= x <= 1") + # asin(x) = 2*atan(x/(1+sqrt(1-x**2))) + wp = prec + 15 + a = mpf_mul(x, x) + b = mpf_add(fone, mpf_sqrt(mpf_sub(fone, a, wp), wp), wp) + c = mpf_div(x, b, wp) + return mpf_shift(mpf_atan(c, prec, rnd), 1) + +def mpf_acos(x, prec, rnd=round_fast): + # acos(x) = 2*atan(sqrt(1-x**2)/(1+x)) + sign, man, exp, bc = x + if bc + exp > 0: + if x not in (fone, fnone): + raise ComplexResult("acos(x) is real only for -1 <= x <= 1") + if x == fnone: + return mpf_pi(prec, rnd) + wp = prec + 15 + a = mpf_mul(x, x) + b = mpf_sqrt(mpf_sub(fone, a, wp), wp) + c = mpf_div(b, mpf_add(fone, x, wp), wp) + return mpf_shift(mpf_atan(c, prec, rnd), 1) + +def mpf_asinh(x, prec, rnd=round_fast): + wp = prec + 20 + sign, man, exp, bc = x + mag = exp+bc + if mag < -8: + if mag < -wp: + return mpf_perturb(x, 1-sign, prec, rnd) + wp += (-mag) + # asinh(x) = log(x+sqrt(x**2+1)) + # use reflection symmetry to avoid cancellation + q = mpf_sqrt(mpf_add(mpf_mul(x, x), fone, wp), wp) + q = mpf_add(mpf_abs(x), q, wp) + if sign: + return mpf_neg(mpf_log(q, prec, negative_rnd[rnd])) + else: + return mpf_log(q, prec, rnd) + +def mpf_acosh(x, prec, rnd=round_fast): + # acosh(x) = log(x+sqrt(x**2-1)) + wp = prec + 15 + if mpf_cmp(x, fone) == -1: + raise ComplexResult("acosh(x) is real only for x >= 1") + q = mpf_sqrt(mpf_add(mpf_mul(x,x), fnone, wp), wp) + return mpf_log(mpf_add(x, q, wp), prec, rnd) + +def mpf_atanh(x, prec, rnd=round_fast): + # atanh(x) = log((1+x)/(1-x))/2 + sign, man, exp, bc = x + if (not man) and exp: + if x in (fzero, fnan): + return x + raise ComplexResult("atanh(x) is real only for -1 <= x <= 1") + mag = bc + exp + if mag > 0: + if mag == 1 and man == 1: + return [finf, fninf][sign] + raise ComplexResult("atanh(x) is real only for -1 <= x <= 1") + wp = prec + 15 + if mag < -8: + if mag < -wp: + return mpf_perturb(x, sign, prec, rnd) + wp += (-mag) + a = mpf_add(x, fone, wp) + b = mpf_sub(fone, x, wp) + return mpf_shift(mpf_log(mpf_div(a, b, wp), prec, rnd), -1) + +def mpf_fibonacci(x, prec, rnd=round_fast): + sign, man, exp, bc = x + if not man: + if x == fninf: + return fnan + return x + # F(2^n) ~= 2^(2^n) + size = abs(exp+bc) + if exp >= 0: + # Exact + if size < 10 or size <= bitcount(prec): + return from_int(ifib(to_int(x)), prec, rnd) + # Use the modified Binet formula + wp = prec + size + 20 + a = mpf_phi(wp) + b = mpf_add(mpf_shift(a, 1), fnone, wp) + u = mpf_pow(a, x, wp) + v = mpf_cos_pi(x, wp) + v = mpf_div(v, u, wp) + u = mpf_sub(u, v, wp) + u = mpf_div(u, b, prec, rnd) + return u + + +#------------------------------------------------------------------------------- +# Exponential-type functions +#------------------------------------------------------------------------------- + +def exponential_series(x, prec, type=0): + """ + Taylor series for cosh/sinh or cos/sin. + + type = 0 -- returns exp(x) (slightly faster than cosh+sinh) + type = 1 -- returns (cosh(x), sinh(x)) + type = 2 -- returns (cos(x), sin(x)) + """ + if x < 0: + x = -x + sign = 1 + else: + sign = 0 + r = int(0.5*prec**0.5) + xmag = bitcount(x) - prec + r = max(0, xmag + r) + extra = 10 + 2*max(r,-xmag) + wp = prec + extra + x <<= (extra - r) + one = MPZ_ONE << wp + alt = (type == 2) + if prec < EXP_SERIES_U_CUTOFF: + x2 = a = (x*x) >> wp + x4 = (x2*x2) >> wp + s0 = s1 = MPZ_ZERO + k = 2 + while a: + a //= (k-1)*k; s0 += a; k += 2 + a //= (k-1)*k; s1 += a; k += 2 + a = (a*x4) >> wp + s1 = (x2*s1) >> wp + if alt: + c = s1 - s0 + one + else: + c = s1 + s0 + one + else: + u = int(0.3*prec**0.35) + x2 = a = (x*x) >> wp + xpowers = [one, x2] + for i in xrange(1, u): + xpowers.append((xpowers[-1]*x2)>>wp) + sums = [MPZ_ZERO] * u + k = 2 + while a: + for i in xrange(u): + a //= (k-1)*k + if alt and k & 2: sums[i] -= a + else: sums[i] += a + k += 2 + a = (a*xpowers[-1]) >> wp + for i in xrange(1, u): + sums[i] = (sums[i]*xpowers[i]) >> wp + c = sum(sums) + one + if type == 0: + s = isqrt_fast(c*c - (one<> wp + return v >> extra + else: + # Repeatedly apply the double-angle formula + # cosh(2*x) = 2*cosh(x)^2 - 1 + # cos(2*x) = 2*cos(x)^2 - 1 + pshift = wp-1 + for i in xrange(r): + c = ((c*c) >> pshift) - one + # With the abs, this is the same for sinh and sin + s = isqrt_fast(abs((one<>extra), (s>>extra) + +def exp_basecase(x, prec): + """ + Compute exp(x) as a fixed-point number. Works for any x, + but for speed should have |x| < 1. For an arbitrary number, + use exp(x) = exp(x-m*log(2)) * 2^m where m = floor(x/log(2)). + """ + if prec > EXP_COSH_CUTOFF: + return exponential_series(x, prec, 0) + r = int(prec**0.5) + prec += r + s0 = s1 = (MPZ_ONE << prec) + k = 2 + a = x2 = (x*x) >> prec + while a: + a //= k; s0 += a; k += 1 + a //= k; s1 += a; k += 1 + a = (a*x2) >> prec + s1 = (s1*x) >> prec + s = s0 + s1 + u = r + while r: + s = (s*s) >> prec + r -= 1 + return s >> u + +def exp_expneg_basecase(x, prec): + """ + Computation of exp(x), exp(-x) + """ + if prec > EXP_COSH_CUTOFF: + cosh, sinh = exponential_series(x, prec, 1) + return cosh+sinh, cosh-sinh + a = exp_basecase(x, prec) + b = (MPZ_ONE << (prec+prec)) // a + return a, b + +def cos_sin_basecase(x, prec): + """ + Compute cos(x), sin(x) as fixed-point numbers, assuming x + in [0, pi/2). For an arbitrary number, use x' = x - m*(pi/2) + where m = floor(x/(pi/2)) along with quarter-period symmetries. + """ + if prec > COS_SIN_CACHE_PREC: + return exponential_series(x, prec, 2) + precs = prec - COS_SIN_CACHE_STEP + t = x >> precs + n = int(t) + if n not in cos_sin_cache: + w = t<<(10+COS_SIN_CACHE_PREC-COS_SIN_CACHE_STEP) + cos_t, sin_t = exponential_series(w, 10+COS_SIN_CACHE_PREC, 2) + cos_sin_cache[n] = (cos_t>>10), (sin_t>>10) + cos_t, sin_t = cos_sin_cache[n] + offset = COS_SIN_CACHE_PREC - prec + cos_t >>= offset + sin_t >>= offset + x -= t << precs + cos = MPZ_ONE << prec + sin = x + k = 2 + a = -((x*x) >> prec) + while a: + a //= k; cos += a; k += 1; a = (a*x) >> prec + a //= k; sin += a; k += 1; a = -((a*x) >> prec) + return ((cos*cos_t-sin*sin_t) >> prec), ((sin*cos_t+cos*sin_t) >> prec) + +def mpf_exp(x, prec, rnd=round_fast): + sign, man, exp, bc = x + if man: + mag = bc + exp + wp = prec + 14 + if sign: + man = -man + # TODO: the best cutoff depends on both x and the precision. + if prec > 600 and exp >= 0: + # Need about log2(exp(n)) ~= 1.45*mag extra precision + e = mpf_e(wp+int(1.45*mag)) + return mpf_pow_int(e, man<= 2 + if mag > 1: + # For large arguments: exp(2^mag*(1+eps)) = + # exp(2^mag)*exp(2^mag*eps) = exp(2^mag)*(1 + 2^mag*eps + ...) + # so about mag extra bits is required. + wpmod = wp + mag + offset = exp + wpmod + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + lg2 = ln2_fixed(wpmod) + n, t = divmod(t, lg2) + n = int(n) + t >>= mag + else: + offset = exp + wp + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + n = 0 + man = exp_basecase(t, wp) + return from_man_exp(man, n-wp, prec, rnd) + if not exp: + return fone + if x == fninf: + return fzero + return x + + +def mpf_cosh_sinh(x, prec, rnd=round_fast, tanh=0): + """Simultaneously compute (cosh(x), sinh(x)) for real x""" + sign, man, exp, bc = x + if (not man) and exp: + if tanh: + if x == finf: return fone + if x == fninf: return fnone + return fnan + if x == finf: return (finf, finf) + if x == fninf: return (finf, fninf) + return fnan, fnan + mag = exp+bc + wp = prec+14 + if mag < -4: + # Extremely close to 0, sinh(x) ~= x and cosh(x) ~= 1 + if mag < -wp: + if tanh: + return mpf_perturb(x, 1-sign, prec, rnd) + cosh = mpf_perturb(fone, 0, prec, rnd) + sinh = mpf_perturb(x, sign, prec, rnd) + return cosh, sinh + # Fix for cancellation when computing sinh + wp += (-mag) + # Does exp(-2*x) vanish? + if mag > 10: + if 3*(1<<(mag-1)) > wp: + # XXX: rounding + if tanh: + return mpf_perturb([fone,fnone][sign], 1-sign, prec, rnd) + c = s = mpf_shift(mpf_exp(mpf_abs(x), prec, rnd), -1) + if sign: + s = mpf_neg(s) + return c, s + # |x| > 1 + if mag > 1: + wpmod = wp + mag + offset = exp + wpmod + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + lg2 = ln2_fixed(wpmod) + n, t = divmod(t, lg2) + n = int(n) + t >>= mag + else: + offset = exp + wp + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + n = 0 + a, b = exp_expneg_basecase(t, wp) + # TODO: optimize division precision + cosh = a + (b>>(2*n)) + sinh = a - (b>>(2*n)) + if sign: + sinh = -sinh + if tanh: + man = (sinh << wp) // cosh + return from_man_exp(man, -wp, prec, rnd) + else: + cosh = from_man_exp(cosh, n-wp-1, prec, rnd) + sinh = from_man_exp(sinh, n-wp-1, prec, rnd) + return cosh, sinh + + +def mod_pi2(man, exp, mag, wp): + # Reduce to standard interval + if mag > 0: + i = 0 + while 1: + cancellation_prec = 20 << i + wpmod = wp + mag + cancellation_prec + pi2 = pi_fixed(wpmod-1) + pi4 = pi2 >> 1 + offset = wpmod + exp + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + n, y = divmod(t, pi2) + if y > pi4: + small = pi2 - y + else: + small = y + if small >> (wp+mag-10): + n = int(n) + t = y >> mag + wp = wpmod - mag + break + i += 1 + else: + wp += (-mag) + offset = exp + wp + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + n = 0 + return t, n, wp + + +def mpf_cos_sin(x, prec, rnd=round_fast, which=0, pi=False): + """ + which: + 0 -- return cos(x), sin(x) + 1 -- return cos(x) + 2 -- return sin(x) + 3 -- return tan(x) + + if pi=True, compute for pi*x + """ + sign, man, exp, bc = x + if not man: + if exp: + c, s = fnan, fnan + else: + c, s = fone, fzero + if which == 0: return c, s + if which == 1: return c + if which == 2: return s + if which == 3: return s + + mag = bc + exp + wp = prec + 10 + + # Extremely small? + if mag < 0: + if mag < -wp: + if pi: + x = mpf_mul(x, mpf_pi(wp)) + c = mpf_perturb(fone, 1, prec, rnd) + s = mpf_perturb(x, 1-sign, prec, rnd) + if which == 0: return c, s + if which == 1: return c + if which == 2: return s + if which == 3: return mpf_perturb(x, sign, prec, rnd) + if pi: + if exp >= -1: + if exp == -1: + c = fzero + s = (fone, fnone)[bool(man & 2) ^ sign] + elif exp == 0: + c, s = (fnone, fzero) + else: + c, s = (fone, fzero) + if which == 0: return c, s + if which == 1: return c + if which == 2: return s + if which == 3: return mpf_div(s, c, prec, rnd) + # Subtract nearest half-integer (= mod by pi/2) + n = ((man >> (-exp-2)) + 1) >> 1 + man = man - (n << (-exp-1)) + mag2 = bitcount(man) + exp + wp = prec + 10 - mag2 + offset = exp + wp + if offset >= 0: + t = man << offset + else: + t = man >> (-offset) + t = (t*pi_fixed(wp)) >> wp + else: + t, n, wp = mod_pi2(man, exp, mag, wp) + c, s = cos_sin_basecase(t, wp) + m = n & 3 + if m == 1: c, s = -s, c + elif m == 2: c, s = -c, -s + elif m == 3: c, s = s, -c + if sign: + s = -s + if which == 0: + c = from_man_exp(c, -wp, prec, rnd) + s = from_man_exp(s, -wp, prec, rnd) + return c, s + if which == 1: + return from_man_exp(c, -wp, prec, rnd) + if which == 2: + return from_man_exp(s, -wp, prec, rnd) + if which == 3: + return from_rational(s, c, prec, rnd) + +def mpf_cos(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 1) +def mpf_sin(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 2) +def mpf_tan(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 3) +def mpf_cos_sin_pi(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 0, 1) +def mpf_cos_pi(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 1, 1) +def mpf_sin_pi(x, prec, rnd=round_fast): return mpf_cos_sin(x, prec, rnd, 2, 1) +def mpf_cosh(x, prec, rnd=round_fast): return mpf_cosh_sinh(x, prec, rnd)[0] +def mpf_sinh(x, prec, rnd=round_fast): return mpf_cosh_sinh(x, prec, rnd)[1] +def mpf_tanh(x, prec, rnd=round_fast): return mpf_cosh_sinh(x, prec, rnd, tanh=1) + + +# Low-overhead fixed-point versions + +def cos_sin_fixed(x, prec, pi2=None): + if pi2 is None: + pi2 = pi_fixed(prec-1) + n, t = divmod(x, pi2) + n = int(n) + c, s = cos_sin_basecase(t, prec) + m = n & 3 + if m == 0: return c, s + if m == 1: return -s, c + if m == 2: return -c, -s + if m == 3: return s, -c + +def exp_fixed(x, prec, ln2=None): + if ln2 is None: + ln2 = ln2_fixed(prec) + n, t = divmod(x, ln2) + n = int(n) + v = exp_basecase(t, prec) + if n >= 0: + return v << n + else: + return v >> (-n) + + +if BACKEND == 'sage': + try: + import sage.libs.mpmath.ext_libmp as _lbmp + mpf_sqrt = _lbmp.mpf_sqrt + mpf_exp = _lbmp.mpf_exp + mpf_log = _lbmp.mpf_log + mpf_cos = _lbmp.mpf_cos + mpf_sin = _lbmp.mpf_sin + mpf_pow = _lbmp.mpf_pow + exp_fixed = _lbmp.exp_fixed + cos_sin_fixed = _lbmp.cos_sin_fixed + log_int_fixed = _lbmp.log_int_fixed + except (ImportError, AttributeError): + print("Warning: Sage imports in libelefun failed") diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libhyper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libhyper.py new file mode 100644 index 0000000000000000000000000000000000000000..04f52d59710be77819066aea5c1cf4b0883f72d7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libhyper.py @@ -0,0 +1,1150 @@ +""" +This module implements computation of hypergeometric and related +functions. In particular, it provides code for generic summation +of hypergeometric series. Optimized versions for various special +cases are also provided. +""" + +import operator +import math + +from .backend import MPZ_ZERO, MPZ_ONE, BACKEND, xrange, exec_ + +from .libintmath import gcd + +from .libmpf import (\ + ComplexResult, round_fast, round_nearest, + negative_rnd, bitcount, to_fixed, from_man_exp, from_int, to_int, + from_rational, + fzero, fone, fnone, ftwo, finf, fninf, fnan, + mpf_sign, mpf_add, mpf_abs, mpf_pos, + mpf_cmp, mpf_lt, mpf_le, mpf_gt, mpf_min_max, + mpf_perturb, mpf_neg, mpf_shift, mpf_sub, mpf_mul, mpf_div, + sqrt_fixed, mpf_sqrt, mpf_rdiv_int, mpf_pow_int, + to_rational, +) + +from .libelefun import (\ + mpf_pi, mpf_exp, mpf_log, pi_fixed, mpf_cos_sin, mpf_cos, mpf_sin, + mpf_sqrt, agm_fixed, +) + +from .libmpc import (\ + mpc_one, mpc_sub, mpc_mul_mpf, mpc_mul, mpc_neg, complex_int_pow, + mpc_div, mpc_add_mpf, mpc_sub_mpf, + mpc_log, mpc_add, mpc_pos, mpc_shift, + mpc_is_infnan, mpc_zero, mpc_sqrt, mpc_abs, + mpc_mpf_div, mpc_square, mpc_exp +) + +from .libintmath import ifac +from .gammazeta import mpf_gamma_int, mpf_euler, euler_fixed + +class NoConvergence(Exception): + pass + + +#-----------------------------------------------------------------------# +# # +# Generic hypergeometric series # +# # +#-----------------------------------------------------------------------# + +""" +TODO: + +1. proper mpq parsing +2. imaginary z special-cased (also: rational, integer?) +3. more clever handling of series that don't converge because of stupid + upwards rounding +4. checking for cancellation + +""" + +def make_hyp_summator(key): + """ + Returns a function that sums a generalized hypergeometric series, + for given parameter types (integer, rational, real, complex). + + """ + p, q, param_types, ztype = key + + pstring = "".join(param_types) + fname = "hypsum_%i_%i_%s_%s_%s" % (p, q, pstring[:p], pstring[p:], ztype) + #print "generating hypsum", fname + + have_complex_param = 'C' in param_types + have_complex_arg = ztype == 'C' + have_complex = have_complex_param or have_complex_arg + + source = [] + add = source.append + + aint = [] + arat = [] + bint = [] + brat = [] + areal = [] + breal = [] + acomplex = [] + bcomplex = [] + + #add("wp = prec + 40") + add("MAX = kwargs.get('maxterms', wp*100)") + add("HIGH = MPZ_ONE<= 0:") + add(" ZRE = xm << offset") + add("else:") + add(" ZRE = xm >> (-offset)") + if have_complex_arg: + add("offset = ye + wp") + add("if offset >= 0:") + add(" ZIM = ym << offset") + add("else:") + add(" ZIM = ym >> (-offset)") + + for i, flag in enumerate(param_types): + W = ["A", "B"][i >= p] + if flag == 'Z': + ([aint,bint][i >= p]).append(i) + add("%sINT_%i = coeffs[%i]" % (W, i, i)) + elif flag == 'Q': + ([arat,brat][i >= p]).append(i) + add("%sP_%i, %sQ_%i = coeffs[%i]._mpq_" % (W, i, W, i, i)) + elif flag == 'R': + ([areal,breal][i >= p]).append(i) + add("xsign, xm, xe, xbc = coeffs[%i]._mpf_" % i) + add("if xsign: xm = -xm") + add("offset = xe + wp") + add("if offset >= 0:") + add(" %sREAL_%i = xm << offset" % (W, i)) + add("else:") + add(" %sREAL_%i = xm >> (-offset)" % (W, i)) + elif flag == 'C': + ([acomplex,bcomplex][i >= p]).append(i) + add("__re, __im = coeffs[%i]._mpc_" % i) + add("xsign, xm, xe, xbc = __re") + add("if xsign: xm = -xm") + add("ysign, ym, ye, ybc = __im") + add("if ysign: ym = -ym") + + add("offset = xe + wp") + add("if offset >= 0:") + add(" %sCRE_%i = xm << offset" % (W, i)) + add("else:") + add(" %sCRE_%i = xm >> (-offset)" % (W, i)) + add("offset = ye + wp") + add("if offset >= 0:") + add(" %sCIM_%i = ym << offset" % (W, i)) + add("else:") + add(" %sCIM_%i = ym >> (-offset)" % (W, i)) + else: + raise ValueError + + l_areal = len(areal) + l_breal = len(breal) + cancellable_real = min(l_areal, l_breal) + noncancellable_real_num = areal[cancellable_real:] + noncancellable_real_den = breal[cancellable_real:] + + # LOOP + add("for n in xrange(1,10**8):") + + add(" if n in magnitude_check:") + add(" p_mag = bitcount(abs(PRE))") + if have_complex: + add(" p_mag = max(p_mag, bitcount(abs(PIM)))") + add(" magnitude_check[n] = wp-p_mag") + + # Real factors + multiplier = " * ".join(["AINT_#".replace("#", str(i)) for i in aint] + \ + ["AP_#".replace("#", str(i)) for i in arat] + \ + ["BQ_#".replace("#", str(i)) for i in brat]) + + divisor = " * ".join(["BINT_#".replace("#", str(i)) for i in bint] + \ + ["BP_#".replace("#", str(i)) for i in brat] + \ + ["AQ_#".replace("#", str(i)) for i in arat] + ["n"]) + + if multiplier: + add(" mul = " + multiplier) + add(" div = " + divisor) + + # Check for singular terms + add(" if not div:") + if multiplier: + add(" if not mul:") + add(" break") + add(" raise ZeroDivisionError") + + # Update product + if have_complex: + + # TODO: when there are several real parameters and just a few complex + # (maybe just the complex argument), we only need to do about + # half as many ops if we accumulate the real factor in a single real variable + for k in range(cancellable_real): add(" PRE = PRE * AREAL_%i // BREAL_%i" % (areal[k], breal[k])) + for i in noncancellable_real_num: add(" PRE = (PRE * AREAL_#) >> wp".replace("#", str(i))) + for i in noncancellable_real_den: add(" PRE = (PRE << wp) // BREAL_#".replace("#", str(i))) + for k in range(cancellable_real): add(" PIM = PIM * AREAL_%i // BREAL_%i" % (areal[k], breal[k])) + for i in noncancellable_real_num: add(" PIM = (PIM * AREAL_#) >> wp".replace("#", str(i))) + for i in noncancellable_real_den: add(" PIM = (PIM << wp) // BREAL_#".replace("#", str(i))) + + if multiplier: + if have_complex_arg: + add(" PRE, PIM = (mul*(PRE*ZRE-PIM*ZIM))//div, (mul*(PIM*ZRE+PRE*ZIM))//div") + add(" PRE >>= wp") + add(" PIM >>= wp") + else: + add(" PRE = ((mul * PRE * ZRE) >> wp) // div") + add(" PIM = ((mul * PIM * ZRE) >> wp) // div") + else: + if have_complex_arg: + add(" PRE, PIM = (PRE*ZRE-PIM*ZIM)//div, (PIM*ZRE+PRE*ZIM)//div") + add(" PRE >>= wp") + add(" PIM >>= wp") + else: + add(" PRE = ((PRE * ZRE) >> wp) // div") + add(" PIM = ((PIM * ZRE) >> wp) // div") + + for i in acomplex: + add(" PRE, PIM = PRE*ACRE_#-PIM*ACIM_#, PIM*ACRE_#+PRE*ACIM_#".replace("#", str(i))) + add(" PRE >>= wp") + add(" PIM >>= wp") + + for i in bcomplex: + add(" mag = BCRE_#*BCRE_#+BCIM_#*BCIM_#".replace("#", str(i))) + add(" re = PRE*BCRE_# + PIM*BCIM_#".replace("#", str(i))) + add(" im = PIM*BCRE_# - PRE*BCIM_#".replace("#", str(i))) + add(" PRE = (re << wp) // mag".replace("#", str(i))) + add(" PIM = (im << wp) // mag".replace("#", str(i))) + + else: + for k in range(cancellable_real): add(" PRE = PRE * AREAL_%i // BREAL_%i" % (areal[k], breal[k])) + for i in noncancellable_real_num: add(" PRE = (PRE * AREAL_#) >> wp".replace("#", str(i))) + for i in noncancellable_real_den: add(" PRE = (PRE << wp) // BREAL_#".replace("#", str(i))) + if multiplier: + add(" PRE = ((PRE * mul * ZRE) >> wp) // div") + else: + add(" PRE = ((PRE * ZRE) >> wp) // div") + + # Add product to sum + if have_complex: + add(" SRE += PRE") + add(" SIM += PIM") + add(" if (HIGH > PRE > LOW) and (HIGH > PIM > LOW):") + add(" break") + else: + add(" SRE += PRE") + add(" if HIGH > PRE > LOW:") + add(" break") + + #add(" from mpmath import nprint, log, ldexp") + #add(" nprint([n, log(abs(PRE),2), ldexp(PRE,-wp)])") + + add(" if n > MAX:") + add(" raise NoConvergence('Hypergeometric series converges too slowly. Try increasing maxterms.')") + + # +1 all parameters for next loop + for i in aint: add(" AINT_# += 1".replace("#", str(i))) + for i in bint: add(" BINT_# += 1".replace("#", str(i))) + for i in arat: add(" AP_# += AQ_#".replace("#", str(i))) + for i in brat: add(" BP_# += BQ_#".replace("#", str(i))) + for i in areal: add(" AREAL_# += one".replace("#", str(i))) + for i in breal: add(" BREAL_# += one".replace("#", str(i))) + for i in acomplex: add(" ACRE_# += one".replace("#", str(i))) + for i in bcomplex: add(" BCRE_# += one".replace("#", str(i))) + + if have_complex: + add("a = from_man_exp(SRE, -wp, prec, 'n')") + add("b = from_man_exp(SIM, -wp, prec, 'n')") + + add("if SRE:") + add(" if SIM:") + add(" magn = max(a[2]+a[3], b[2]+b[3])") + add(" else:") + add(" magn = a[2]+a[3]") + add("elif SIM:") + add(" magn = b[2]+b[3]") + add("else:") + add(" magn = -wp+1") + + add("return (a, b), True, magn") + else: + add("a = from_man_exp(SRE, -wp, prec, 'n')") + + add("if SRE:") + add(" magn = a[2]+a[3]") + add("else:") + add(" magn = -wp+1") + + add("return a, False, magn") + + source = "\n".join((" " + line) for line in source) + source = ("def %s(coeffs, z, prec, wp, epsshift, magnitude_check, **kwargs):\n" % fname) + source + + namespace = {} + + exec_(source, globals(), namespace) + + #print source + return source, namespace[fname] + + +if BACKEND == 'sage': + + def make_hyp_summator(key): + """ + Returns a function that sums a generalized hypergeometric series, + for given parameter types (integer, rational, real, complex). + """ + from sage.libs.mpmath.ext_main import hypsum_internal + p, q, param_types, ztype = key + def _hypsum(coeffs, z, prec, wp, epsshift, magnitude_check, **kwargs): + return hypsum_internal(p, q, param_types, ztype, coeffs, z, + prec, wp, epsshift, magnitude_check, kwargs) + + return "(none)", _hypsum + + +#-----------------------------------------------------------------------# +# # +# Error functions # +# # +#-----------------------------------------------------------------------# + +# TODO: mpf_erf should call mpf_erfc when appropriate (currently +# only the converse delegation is implemented) + +def mpf_erf(x, prec, rnd=round_fast): + sign, man, exp, bc = x + if not man: + if x == fzero: return fzero + if x == finf: return fone + if x== fninf: return fnone + return fnan + size = exp + bc + lg = math.log + # The approximation erf(x) = 1 is accurate to > x^2 * log(e,2) bits + if size > 3 and 2*(size-1) + 0.528766 > lg(prec,2): + if sign: + return mpf_perturb(fnone, 0, prec, rnd) + else: + return mpf_perturb(fone, 1, prec, rnd) + # erf(x) ~ 2*x/sqrt(pi) close to 0 + if size < -prec: + # 2*x + x = mpf_shift(x,1) + c = mpf_sqrt(mpf_pi(prec+20), prec+20) + # TODO: interval rounding + return mpf_div(x, c, prec, rnd) + wp = prec + abs(size) + 25 + # Taylor series for erf, fixed-point summation + t = abs(to_fixed(x, wp)) + t2 = (t*t) >> wp + s, term, k = t, 12345, 1 + while term: + t = ((t * t2) >> wp) // k + term = t // (2*k+1) + if k & 1: + s -= term + else: + s += term + k += 1 + s = (s << (wp+1)) // sqrt_fixed(pi_fixed(wp), wp) + if sign: + s = -s + return from_man_exp(s, -wp, prec, rnd) + +# If possible, we use the asymptotic series for erfc. +# This is an alternating divergent asymptotic series, so +# the error is at most equal to the first omitted term. +# Here we check if the smallest term is small enough +# for a given x and precision +def erfc_check_series(x, prec): + n = to_int(x) + if n**2 * 1.44 > prec: + return True + return False + +def mpf_erfc(x, prec, rnd=round_fast): + sign, man, exp, bc = x + if not man: + if x == fzero: return fone + if x == finf: return fzero + if x == fninf: return ftwo + return fnan + wp = prec + 20 + mag = bc+exp + # Preserve full accuracy when exponent grows huge + wp += max(0, 2*mag) + regular_erf = sign or mag < 2 + if regular_erf or not erfc_check_series(x, wp): + if regular_erf: + return mpf_sub(fone, mpf_erf(x, prec+10, negative_rnd[rnd]), prec, rnd) + # 1-erf(x) ~ exp(-x^2), increase prec to deal with cancellation + n = to_int(x)+1 + return mpf_sub(fone, mpf_erf(x, prec + int(n**2*1.44) + 10), prec, rnd) + s = term = MPZ_ONE << wp + term_prev = 0 + t = (2 * to_fixed(x, wp) ** 2) >> wp + k = 1 + while 1: + term = ((term * (2*k - 1)) << wp) // t + if k > 4 and term > term_prev or not term: + break + if k & 1: + s -= term + else: + s += term + term_prev = term + #print k, to_str(from_man_exp(term, -wp, 50), 10) + k += 1 + s = (s << wp) // sqrt_fixed(pi_fixed(wp), wp) + s = from_man_exp(s, -wp, wp) + z = mpf_exp(mpf_neg(mpf_mul(x,x,wp),wp),wp) + y = mpf_div(mpf_mul(z, s, wp), x, prec, rnd) + return y + + +#-----------------------------------------------------------------------# +# # +# Exponential integrals # +# # +#-----------------------------------------------------------------------# + +def ei_taylor(x, prec): + s = t = x + k = 2 + while t: + t = ((t*x) >> prec) // k + s += t // k + k += 1 + return s + +def complex_ei_taylor(zre, zim, prec): + _abs = abs + sre = tre = zre + sim = tim = zim + k = 2 + while _abs(tre) + _abs(tim) > 5: + tre, tim = ((tre*zre-tim*zim)//k)>>prec, ((tre*zim+tim*zre)//k)>>prec + sre += tre // k + sim += tim // k + k += 1 + return sre, sim + +def ei_asymptotic(x, prec): + one = MPZ_ONE << prec + x = t = ((one << prec) // x) + s = one + x + k = 2 + while t: + t = (k*t*x) >> prec + s += t + k += 1 + return s + +def complex_ei_asymptotic(zre, zim, prec): + _abs = abs + one = MPZ_ONE << prec + M = (zim*zim + zre*zre) >> prec + # 1 / z + xre = tre = (zre << prec) // M + xim = tim = ((-zim) << prec) // M + sre = one + xre + sim = xim + k = 2 + while _abs(tre) + _abs(tim) > 1000: + #print tre, tim + tre, tim = ((tre*xre-tim*xim)*k)>>prec, ((tre*xim+tim*xre)*k)>>prec + sre += tre + sim += tim + k += 1 + if k > prec: + raise NoConvergence + return sre, sim + +def mpf_ei(x, prec, rnd=round_fast, e1=False): + if e1: + x = mpf_neg(x) + sign, man, exp, bc = x + if e1 and not sign: + if x == fzero: + return finf + raise ComplexResult("E1(x) for x < 0") + if man: + xabs = 0, man, exp, bc + xmag = exp+bc + wp = prec + 20 + can_use_asymp = xmag > wp + if not can_use_asymp: + if exp >= 0: + xabsint = man << exp + else: + xabsint = man >> (-exp) + can_use_asymp = xabsint > int(wp*0.693) + 10 + if can_use_asymp: + if xmag > wp: + v = fone + else: + v = from_man_exp(ei_asymptotic(to_fixed(x, wp), wp), -wp) + v = mpf_mul(v, mpf_exp(x, wp), wp) + v = mpf_div(v, x, prec, rnd) + else: + wp += 2*int(to_int(xabs)) + u = to_fixed(x, wp) + v = ei_taylor(u, wp) + euler_fixed(wp) + t1 = from_man_exp(v,-wp) + t2 = mpf_log(xabs,wp) + v = mpf_add(t1, t2, prec, rnd) + else: + if x == fzero: v = fninf + elif x == finf: v = finf + elif x == fninf: v = fzero + else: v = fnan + if e1: + v = mpf_neg(v) + return v + +def mpc_ei(z, prec, rnd=round_fast, e1=False): + if e1: + z = mpc_neg(z) + a, b = z + asign, aman, aexp, abc = a + bsign, bman, bexp, bbc = b + if b == fzero: + if e1: + x = mpf_neg(mpf_ei(a, prec, rnd)) + if not asign: + y = mpf_neg(mpf_pi(prec, rnd)) + else: + y = fzero + return x, y + else: + return mpf_ei(a, prec, rnd), fzero + if a != fzero: + if not aman or not bman: + return (fnan, fnan) + wp = prec + 40 + amag = aexp+abc + bmag = bexp+bbc + zmag = max(amag, bmag) + can_use_asymp = zmag > wp + if not can_use_asymp: + zabsint = abs(to_int(a)) + abs(to_int(b)) + can_use_asymp = zabsint > int(wp*0.693) + 20 + try: + if can_use_asymp: + if zmag > wp: + v = fone, fzero + else: + zre = to_fixed(a, wp) + zim = to_fixed(b, wp) + vre, vim = complex_ei_asymptotic(zre, zim, wp) + v = from_man_exp(vre, -wp), from_man_exp(vim, -wp) + v = mpc_mul(v, mpc_exp(z, wp), wp) + v = mpc_div(v, z, wp) + if e1: + v = mpc_neg(v, prec, rnd) + else: + x, y = v + if bsign: + v = mpf_pos(x, prec, rnd), mpf_sub(y, mpf_pi(wp), prec, rnd) + else: + v = mpf_pos(x, prec, rnd), mpf_add(y, mpf_pi(wp), prec, rnd) + return v + except NoConvergence: + pass + #wp += 2*max(0,zmag) + wp += 2*int(to_int(mpc_abs(z, 5))) + zre = to_fixed(a, wp) + zim = to_fixed(b, wp) + vre, vim = complex_ei_taylor(zre, zim, wp) + vre += euler_fixed(wp) + v = from_man_exp(vre,-wp), from_man_exp(vim,-wp) + if e1: + u = mpc_log(mpc_neg(z),wp) + else: + u = mpc_log(z,wp) + v = mpc_add(v, u, prec, rnd) + if e1: + v = mpc_neg(v) + return v + +def mpf_e1(x, prec, rnd=round_fast): + return mpf_ei(x, prec, rnd, True) + +def mpc_e1(x, prec, rnd=round_fast): + return mpc_ei(x, prec, rnd, True) + +def mpf_expint(n, x, prec, rnd=round_fast, gamma=False): + """ + E_n(x), n an integer, x real + + With gamma=True, computes Gamma(n,x) (upper incomplete gamma function) + + Returns (real, None) if real, otherwise (real, imag) + The imaginary part is an optional branch cut term + + """ + sign, man, exp, bc = x + if not man: + if gamma: + if x == fzero: + # Actually gamma function pole + if n <= 0: + return finf, None + return mpf_gamma_int(n, prec, rnd), None + if x == finf: + return fzero, None + # TODO: could return finite imaginary value at -inf + return fnan, fnan + else: + if x == fzero: + if n > 1: + return from_rational(1, n-1, prec, rnd), None + else: + return finf, None + if x == finf: + return fzero, None + return fnan, fnan + n_orig = n + if gamma: + n = 1-n + wp = prec + 20 + xmag = exp + bc + # Beware of near-poles + if xmag < -10: + raise NotImplementedError + nmag = bitcount(abs(n)) + have_imag = n > 0 and sign + negx = mpf_neg(x) + # Skip series if direct convergence + if n == 0 or 2*nmag - xmag < -wp: + if gamma: + v = mpf_exp(negx, wp) + re = mpf_mul(v, mpf_pow_int(x, n_orig-1, wp), prec, rnd) + else: + v = mpf_exp(negx, wp) + re = mpf_div(v, x, prec, rnd) + else: + # Finite number of terms, or... + can_use_asymptotic_series = -3*wp < n <= 0 + # ...large enough? + if not can_use_asymptotic_series: + xi = abs(to_int(x)) + m = min(max(1, xi-n), 2*wp) + siz = -n*nmag + (m+n)*bitcount(abs(m+n)) - m*xmag - (144*m//100) + tol = -wp-10 + can_use_asymptotic_series = siz < tol + if can_use_asymptotic_series: + r = ((-MPZ_ONE) << (wp+wp)) // to_fixed(x, wp) + m = n + t = r*m + s = MPZ_ONE << wp + while m and t: + s += t + m += 1 + t = (m*r*t) >> wp + v = mpf_exp(negx, wp) + if gamma: + # ~ exp(-x) * x^(n-1) * (1 + ...) + v = mpf_mul(v, mpf_pow_int(x, n_orig-1, wp), wp) + else: + # ~ exp(-x)/x * (1 + ...) + v = mpf_div(v, x, wp) + re = mpf_mul(v, from_man_exp(s, -wp), prec, rnd) + elif n == 1: + re = mpf_neg(mpf_ei(negx, prec, rnd)) + elif n > 0 and n < 3*wp: + T1 = mpf_neg(mpf_ei(negx, wp)) + if gamma: + if n_orig & 1: + T1 = mpf_neg(T1) + else: + T1 = mpf_mul(T1, mpf_pow_int(negx, n-1, wp), wp) + r = t = to_fixed(x, wp) + facs = [1] * (n-1) + for k in range(1,n-1): + facs[k] = facs[k-1] * k + facs = facs[::-1] + s = facs[0] << wp + for k in range(1, n-1): + if k & 1: + s -= facs[k] * t + else: + s += facs[k] * t + t = (t*r) >> wp + T2 = from_man_exp(s, -wp, wp) + T2 = mpf_mul(T2, mpf_exp(negx, wp)) + if gamma: + T2 = mpf_mul(T2, mpf_pow_int(x, n_orig, wp), wp) + R = mpf_add(T1, T2) + re = mpf_div(R, from_int(ifac(n-1)), prec, rnd) + else: + raise NotImplementedError + if have_imag: + M = from_int(-ifac(n-1)) + if gamma: + im = mpf_div(mpf_pi(wp), M, prec, rnd) + if n_orig & 1: + im = mpf_neg(im) + else: + im = mpf_div(mpf_mul(mpf_pi(wp), mpf_pow_int(negx, n_orig-1, wp), wp), M, prec, rnd) + return re, im + else: + return re, None + +def mpf_ci_si_taylor(x, wp, which=0): + """ + 0 - Ci(x) - (euler+log(x)) + 1 - Si(x) + """ + x = to_fixed(x, wp) + x2 = -(x*x) >> wp + if which == 0: + s, t, k = 0, (MPZ_ONE<>wp + s += t//k + k += 2 + return from_man_exp(s, -wp) + +def mpc_ci_si_taylor(re, im, wp, which=0): + # The following code is only designed for small arguments, + # and not too small arguments (for relative accuracy) + if re[1]: + mag = re[2]+re[3] + elif im[1]: + mag = im[2]+im[3] + if im[1]: + mag = max(mag, im[2]+im[3]) + if mag > 2 or mag < -wp: + raise NotImplementedError + wp += (2-mag) + zre = to_fixed(re, wp) + zim = to_fixed(im, wp) + z2re = (zim*zim-zre*zre)>>wp + z2im = (-2*zre*zim)>>wp + tre = zre + tim = zim + one = MPZ_ONE< 2: + f = k*(k-1) + tre, tim = ((tre*z2re-tim*z2im)//f)>>wp, ((tre*z2im+tim*z2re)//f)>>wp + sre += tre//k + sim += tim//k + k += 2 + return from_man_exp(sre, -wp), from_man_exp(sim, -wp) + +def mpf_ci_si(x, prec, rnd=round_fast, which=2): + """ + Calculation of Ci(x), Si(x) for real x. + + which = 0 -- returns (Ci(x), -) + which = 1 -- returns (Si(x), -) + which = 2 -- returns (Ci(x), Si(x)) + + Note: if x < 0, Ci(x) needs an additional imaginary term, pi*i. + """ + wp = prec + 20 + sign, man, exp, bc = x + ci, si = None, None + if not man: + if x == fzero: + return (fninf, fzero) + if x == fnan: + return (x, x) + ci = fzero + if which != 0: + if x == finf: + si = mpf_shift(mpf_pi(prec, rnd), -1) + if x == fninf: + si = mpf_neg(mpf_shift(mpf_pi(prec, negative_rnd[rnd]), -1)) + return (ci, si) + # For small x: Ci(x) ~ euler + log(x), Si(x) ~ x + mag = exp+bc + if mag < -wp: + if which != 0: + si = mpf_perturb(x, 1-sign, prec, rnd) + if which != 1: + y = mpf_euler(wp) + xabs = mpf_abs(x) + ci = mpf_add(y, mpf_log(xabs, wp), prec, rnd) + return ci, si + # For huge x: Ci(x) ~ sin(x)/x, Si(x) ~ pi/2 + elif mag > wp: + if which != 0: + if sign: + si = mpf_neg(mpf_pi(prec, negative_rnd[rnd])) + else: + si = mpf_pi(prec, rnd) + si = mpf_shift(si, -1) + if which != 1: + ci = mpf_div(mpf_sin(x, wp), x, prec, rnd) + return ci, si + else: + wp += abs(mag) + # Use an asymptotic series? The smallest value of n!/x^n + # occurs for n ~ x, where the magnitude is ~ exp(-x). + asymptotic = mag-1 > math.log(wp, 2) + # Case 1: convergent series near 0 + if not asymptotic: + if which != 0: + si = mpf_pos(mpf_ci_si_taylor(x, wp, 1), prec, rnd) + if which != 1: + ci = mpf_ci_si_taylor(x, wp, 0) + ci = mpf_add(ci, mpf_euler(wp), wp) + ci = mpf_add(ci, mpf_log(mpf_abs(x), wp), prec, rnd) + return ci, si + x = mpf_abs(x) + # Case 2: asymptotic series for x >> 1 + xf = to_fixed(x, wp) + xr = (MPZ_ONE<<(2*wp)) // xf # 1/x + s1 = (MPZ_ONE << wp) + s2 = xr + t = xr + k = 2 + while t: + t = -t + t = (t*xr*k)>>wp + k += 1 + s1 += t + t = (t*xr*k)>>wp + k += 1 + s2 += t + s1 = from_man_exp(s1, -wp) + s2 = from_man_exp(s2, -wp) + s1 = mpf_div(s1, x, wp) + s2 = mpf_div(s2, x, wp) + cos, sin = mpf_cos_sin(x, wp) + # Ci(x) = sin(x)*s1-cos(x)*s2 + # Si(x) = pi/2-cos(x)*s1-sin(x)*s2 + if which != 0: + si = mpf_add(mpf_mul(cos, s1), mpf_mul(sin, s2), wp) + si = mpf_sub(mpf_shift(mpf_pi(wp), -1), si, wp) + if sign: + si = mpf_neg(si) + si = mpf_pos(si, prec, rnd) + if which != 1: + ci = mpf_sub(mpf_mul(sin, s1), mpf_mul(cos, s2), prec, rnd) + return ci, si + +def mpf_ci(x, prec, rnd=round_fast): + if mpf_sign(x) < 0: + raise ComplexResult + return mpf_ci_si(x, prec, rnd, 0)[0] + +def mpf_si(x, prec, rnd=round_fast): + return mpf_ci_si(x, prec, rnd, 1)[1] + +def mpc_ci(z, prec, rnd=round_fast): + re, im = z + if im == fzero: + ci = mpf_ci_si(re, prec, rnd, 0)[0] + if mpf_sign(re) < 0: + return (ci, mpf_pi(prec, rnd)) + return (ci, fzero) + wp = prec + 20 + cre, cim = mpc_ci_si_taylor(re, im, wp, 0) + cre = mpf_add(cre, mpf_euler(wp), wp) + ci = mpc_add((cre, cim), mpc_log(z, wp), prec, rnd) + return ci + +def mpc_si(z, prec, rnd=round_fast): + re, im = z + if im == fzero: + return (mpf_ci_si(re, prec, rnd, 1)[1], fzero) + wp = prec + 20 + z = mpc_ci_si_taylor(re, im, wp, 1) + return mpc_pos(z, prec, rnd) + + +#-----------------------------------------------------------------------# +# # +# Bessel functions # +# # +#-----------------------------------------------------------------------# + +# A Bessel function of the first kind of integer order, J_n(x), is +# given by the power series + +# oo +# ___ k 2 k + n +# \ (-1) / x \ +# J_n(x) = ) ----------- | - | +# /___ k! (k + n)! \ 2 / +# k = 0 + +# Simplifying the quotient between two successive terms gives the +# ratio x^2 / (-4*k*(k+n)). Hence, we only need one full-precision +# multiplication and one division by a small integer per term. +# The complex version is very similar, the only difference being +# that the multiplication is actually 4 multiplies. + +# In the general case, we have +# J_v(x) = (x/2)**v / v! * 0F1(v+1, (-1/4)*z**2) + +# TODO: for extremely large x, we could use an asymptotic +# trigonometric approximation. + +# TODO: recompute at higher precision if the fixed-point mantissa +# is very small + +def mpf_besseljn(n, x, prec, rounding=round_fast): + prec += 50 + negate = n < 0 and n & 1 + mag = x[2]+x[3] + n = abs(n) + wp = prec + 20 + n*bitcount(n) + if mag < 0: + wp -= n * mag + x = to_fixed(x, wp) + x2 = (x**2) >> wp + if not n: + s = t = MPZ_ONE << wp + else: + s = t = (x**n // ifac(n)) >> ((n-1)*wp + n) + k = 1 + while t: + t = ((t * x2) // (-4*k*(k+n))) >> wp + s += t + k += 1 + if negate: + s = -s + return from_man_exp(s, -wp, prec, rounding) + +def mpc_besseljn(n, z, prec, rounding=round_fast): + negate = n < 0 and n & 1 + n = abs(n) + origprec = prec + zre, zim = z + mag = max(zre[2]+zre[3], zim[2]+zim[3]) + prec += 20 + n*bitcount(n) + abs(mag) + if mag < 0: + prec -= n * mag + zre = to_fixed(zre, prec) + zim = to_fixed(zim, prec) + z2re = (zre**2 - zim**2) >> prec + z2im = (zre*zim) >> (prec-1) + if not n: + sre = tre = MPZ_ONE << prec + sim = tim = MPZ_ZERO + else: + re, im = complex_int_pow(zre, zim, n) + sre = tre = (re // ifac(n)) >> ((n-1)*prec + n) + sim = tim = (im // ifac(n)) >> ((n-1)*prec + n) + k = 1 + while abs(tre) + abs(tim) > 3: + p = -4*k*(k+n) + tre, tim = tre*z2re - tim*z2im, tim*z2re + tre*z2im + tre = (tre // p) >> prec + tim = (tim // p) >> prec + sre += tre + sim += tim + k += 1 + if negate: + sre = -sre + sim = -sim + re = from_man_exp(sre, -prec, origprec, rounding) + im = from_man_exp(sim, -prec, origprec, rounding) + return (re, im) + +def mpf_agm(a, b, prec, rnd=round_fast): + """ + Computes the arithmetic-geometric mean agm(a,b) for + nonnegative mpf values a, b. + """ + asign, aman, aexp, abc = a + bsign, bman, bexp, bbc = b + if asign or bsign: + raise ComplexResult("agm of a negative number") + # Handle inf, nan or zero in either operand + if not (aman and bman): + if a == fnan or b == fnan: + return fnan + if a == finf: + if b == fzero: + return fnan + return finf + if b == finf: + if a == fzero: + return fnan + return finf + # agm(0,x) = agm(x,0) = 0 + return fzero + wp = prec + 20 + amag = aexp+abc + bmag = bexp+bbc + mag_delta = amag - bmag + # Reduce to roughly the same magnitude using floating-point AGM + abs_mag_delta = abs(mag_delta) + if abs_mag_delta > 10: + while abs_mag_delta > 10: + a, b = mpf_shift(mpf_add(a,b,wp),-1), \ + mpf_sqrt(mpf_mul(a,b,wp),wp) + abs_mag_delta //= 2 + asign, aman, aexp, abc = a + bsign, bman, bexp, bbc = b + amag = aexp+abc + bmag = bexp+bbc + mag_delta = amag - bmag + #print to_float(a), to_float(b) + # Use agm(a,b) = agm(x*a,x*b)/x to obtain a, b ~= 1 + min_mag = min(amag,bmag) + max_mag = max(amag,bmag) + n = 0 + # If too small, we lose precision when going to fixed-point + if min_mag < -8: + n = -min_mag + # If too large, we waste time using fixed-point with large numbers + elif max_mag > 20: + n = -max_mag + if n: + a = mpf_shift(a, n) + b = mpf_shift(b, n) + #print to_float(a), to_float(b) + af = to_fixed(a, wp) + bf = to_fixed(b, wp) + g = agm_fixed(af, bf, wp) + return from_man_exp(g, -wp-n, prec, rnd) + +def mpf_agm1(a, prec, rnd=round_fast): + """ + Computes the arithmetic-geometric mean agm(1,a) for a nonnegative + mpf value a. + """ + return mpf_agm(fone, a, prec, rnd) + +def mpc_agm(a, b, prec, rnd=round_fast): + """ + Complex AGM. + + TODO: + * check that convergence works as intended + * optimize + * select a nonarbitrary branch + """ + if mpc_is_infnan(a) or mpc_is_infnan(b): + return fnan, fnan + if mpc_zero in (a, b): + return fzero, fzero + if mpc_neg(a) == b: + return fzero, fzero + wp = prec+20 + eps = mpf_shift(fone, -wp+10) + while 1: + a1 = mpc_shift(mpc_add(a, b, wp), -1) + b1 = mpc_sqrt(mpc_mul(a, b, wp), wp) + a, b = a1, b1 + size = mpf_min_max([mpc_abs(a,10), mpc_abs(b,10)])[1] + err = mpc_abs(mpc_sub(a, b, 10), 10) + if size == fzero or mpf_lt(err, mpf_mul(eps, size)): + return a + +def mpc_agm1(a, prec, rnd=round_fast): + return mpc_agm(mpc_one, a, prec, rnd) + +def mpf_ellipk(x, prec, rnd=round_fast): + if not x[1]: + if x == fzero: + return mpf_shift(mpf_pi(prec, rnd), -1) + if x == fninf: + return fzero + if x == fnan: + return x + if x == fone: + return finf + # TODO: for |x| << 1/2, one could use fall back to + # pi/2 * hyp2f1_rat((1,2),(1,2),(1,1), x) + wp = prec + 15 + # Use K(x) = pi/2/agm(1,a) where a = sqrt(1-x) + # The sqrt raises ComplexResult if x > 0 + a = mpf_sqrt(mpf_sub(fone, x, wp), wp) + v = mpf_agm1(a, wp) + r = mpf_div(mpf_pi(wp), v, prec, rnd) + return mpf_shift(r, -1) + +def mpc_ellipk(z, prec, rnd=round_fast): + re, im = z + if im == fzero: + if re == finf: + return mpc_zero + if mpf_le(re, fone): + return mpf_ellipk(re, prec, rnd), fzero + wp = prec + 15 + a = mpc_sqrt(mpc_sub(mpc_one, z, wp), wp) + v = mpc_agm1(a, wp) + r = mpc_mpf_div(mpf_pi(wp), v, prec, rnd) + return mpc_shift(r, -1) + +def mpf_ellipe(x, prec, rnd=round_fast): + # http://functions.wolfram.com/EllipticIntegrals/ + # EllipticK/20/01/0001/ + # E = (1-m)*(K'(m)*2*m + K(m)) + sign, man, exp, bc = x + if not man: + if x == fzero: + return mpf_shift(mpf_pi(prec, rnd), -1) + if x == fninf: + return finf + if x == fnan: + return x + if x == finf: + raise ComplexResult + if x == fone: + return fone + wp = prec+20 + mag = exp+bc + if mag < -wp: + return mpf_shift(mpf_pi(prec, rnd), -1) + # Compute a finite difference for K' + p = max(mag, 0) - wp + h = mpf_shift(fone, p) + K = mpf_ellipk(x, 2*wp) + Kh = mpf_ellipk(mpf_sub(x, h), 2*wp) + Kdiff = mpf_shift(mpf_sub(K, Kh), -p) + t = mpf_sub(fone, x) + b = mpf_mul(Kdiff, mpf_shift(x,1), wp) + return mpf_mul(t, mpf_add(K, b), prec, rnd) + +def mpc_ellipe(z, prec, rnd=round_fast): + re, im = z + if im == fzero: + if re == finf: + return (fzero, finf) + if mpf_le(re, fone): + return mpf_ellipe(re, prec, rnd), fzero + wp = prec + 15 + mag = mpc_abs(z, 1) + p = max(mag[2]+mag[3], 0) - wp + h = mpf_shift(fone, p) + K = mpc_ellipk(z, 2*wp) + Kh = mpc_ellipk(mpc_add_mpf(z, h, 2*wp), 2*wp) + Kdiff = mpc_shift(mpc_sub(Kh, K, wp), -p) + t = mpc_sub(mpc_one, z, wp) + b = mpc_mul(Kdiff, mpc_shift(z,1), wp) + return mpc_mul(t, mpc_add(K, b, wp), prec, rnd) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libintmath.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libintmath.py new file mode 100644 index 0000000000000000000000000000000000000000..7880546e135639208d136488408b102ad41682a2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libintmath.py @@ -0,0 +1,584 @@ +""" +Utility functions for integer math. + +TODO: rename, cleanup, perhaps move the gmpy wrapper code +here from settings.py + +""" + +import math +from bisect import bisect + +from .backend import xrange +from .backend import BACKEND, gmpy, sage, sage_utils, MPZ, MPZ_ONE, MPZ_ZERO + +small_trailing = [0] * 256 +for j in range(1,8): + small_trailing[1<>> giant_steps(50,1000) + [66, 128, 253, 502, 1000] + >>> giant_steps(50,1000,4) + [65, 252, 1000] + + """ + L = [target] + while L[-1] > start*n: + L = L + [L[-1]//n + 2] + return L[::-1] + +def rshift(x, n): + """For an integer x, calculate x >> n with the fastest (floor) + rounding. Unlike the plain Python expression (x >> n), n is + allowed to be negative, in which case a left shift is performed.""" + if n >= 0: return x >> n + else: return x << (-n) + +def lshift(x, n): + """For an integer x, calculate x << n. Unlike the plain Python + expression (x << n), n is allowed to be negative, in which case a + right shift with default (floor) rounding is performed.""" + if n >= 0: return x << n + else: return x >> (-n) + +if BACKEND == 'sage': + import operator + rshift = operator.rshift + lshift = operator.lshift + +def python_trailing(n): + """Count the number of trailing zero bits in abs(n).""" + if not n: + return 0 + low_byte = n & 0xff + if low_byte: + return small_trailing[low_byte] + t = 8 + n >>= 8 + while not n & 0xff: + n >>= 8 + t += 8 + return t + small_trailing[n & 0xff] + +if BACKEND == 'gmpy': + if gmpy.version() >= '2': + def gmpy_trailing(n): + """Count the number of trailing zero bits in abs(n) using gmpy.""" + if n: return MPZ(n).bit_scan1() + else: return 0 + else: + def gmpy_trailing(n): + """Count the number of trailing zero bits in abs(n) using gmpy.""" + if n: return MPZ(n).scan1() + else: return 0 + +# Small powers of 2 +powers = [1<<_ for _ in range(300)] + +def python_bitcount(n): + """Calculate bit size of the nonnegative integer n.""" + bc = bisect(powers, n) + if bc != 300: + return bc + bc = int(math.log(n, 2)) - 4 + return bc + bctable[n>>bc] + +def gmpy_bitcount(n): + """Calculate bit size of the nonnegative integer n.""" + if n: return MPZ(n).numdigits(2) + else: return 0 + +#def sage_bitcount(n): +# if n: return MPZ(n).nbits() +# else: return 0 + +def sage_trailing(n): + return MPZ(n).trailing_zero_bits() + +if BACKEND == 'gmpy': + bitcount = gmpy_bitcount + trailing = gmpy_trailing +elif BACKEND == 'sage': + sage_bitcount = sage_utils.bitcount + bitcount = sage_bitcount + trailing = sage_trailing +else: + bitcount = python_bitcount + trailing = python_trailing + +if BACKEND == 'gmpy' and 'bit_length' in dir(gmpy): + bitcount = gmpy.bit_length + +# Used to avoid slow function calls as far as possible +trailtable = [trailing(n) for n in range(256)] +bctable = [bitcount(n) for n in range(1024)] + +# TODO: speed up for bases 2, 4, 8, 16, ... + +def bin_to_radix(x, xbits, base, bdigits): + """Changes radix of a fixed-point number; i.e., converts + x * 2**xbits to floor(x * 10**bdigits).""" + return x * (MPZ(base)**bdigits) >> xbits + +stddigits = '0123456789abcdefghijklmnopqrstuvwxyz' + +def small_numeral(n, base=10, digits=stddigits): + """Return the string numeral of a positive integer in an arbitrary + base. Most efficient for small input.""" + if base == 10: + return str(n) + digs = [] + while n: + n, digit = divmod(n, base) + digs.append(digits[digit]) + return "".join(digs[::-1]) + +def numeral_python(n, base=10, size=0, digits=stddigits): + """Represent the integer n as a string of digits in the given base. + Recursive division is used to make this function about 3x faster + than Python's str() for converting integers to decimal strings. + + The 'size' parameters specifies the number of digits in n; this + number is only used to determine splitting points and need not be + exact.""" + if n <= 0: + if not n: + return "0" + return "-" + numeral(-n, base, size, digits) + # Fast enough to do directly + if size < 250: + return small_numeral(n, base, digits) + # Divide in half + half = (size // 2) + (size & 1) + A, B = divmod(n, base**half) + ad = numeral(A, base, half, digits) + bd = numeral(B, base, half, digits).rjust(half, "0") + return ad + bd + +def numeral_gmpy(n, base=10, size=0, digits=stddigits): + """Represent the integer n as a string of digits in the given base. + Recursive division is used to make this function about 3x faster + than Python's str() for converting integers to decimal strings. + + The 'size' parameters specifies the number of digits in n; this + number is only used to determine splitting points and need not be + exact.""" + if n < 0: + return "-" + numeral(-n, base, size, digits) + # gmpy.digits() may cause a segmentation fault when trying to convert + # extremely large values to a string. The size limit may need to be + # adjusted on some platforms, but 1500000 works on Windows and Linux. + if size < 1500000: + return gmpy.digits(n, base) + # Divide in half + half = (size // 2) + (size & 1) + A, B = divmod(n, MPZ(base)**half) + ad = numeral(A, base, half, digits) + bd = numeral(B, base, half, digits).rjust(half, "0") + return ad + bd + +if BACKEND == "gmpy": + numeral = numeral_gmpy +else: + numeral = numeral_python + +_1_800 = 1<<800 +_1_600 = 1<<600 +_1_400 = 1<<400 +_1_200 = 1<<200 +_1_100 = 1<<100 +_1_50 = 1<<50 + +def isqrt_small_python(x): + """ + Correctly (floor) rounded integer square root, using + division. Fast up to ~200 digits. + """ + if not x: + return x + if x < _1_800: + # Exact with IEEE double precision arithmetic + if x < _1_50: + return int(x**0.5) + # Initial estimate can be any integer >= the true root; round up + r = int(x**0.5 * 1.00000000000001) + 1 + else: + bc = bitcount(x) + n = bc//2 + r = int((x>>(2*n-100))**0.5+2)<<(n-50) # +2 is to round up + # The following iteration now precisely computes floor(sqrt(x)) + # See e.g. Crandall & Pomerance, "Prime Numbers: A Computational + # Perspective" + while 1: + y = (r+x//r)>>1 + if y >= r: + return r + r = y + +def isqrt_fast_python(x): + """ + Fast approximate integer square root, computed using division-free + Newton iteration for large x. For random integers the result is almost + always correct (floor(sqrt(x))), but is 1 ulp too small with a roughly + 0.1% probability. If x is very close to an exact square, the answer is + 1 ulp wrong with high probability. + + With 0 guard bits, the largest error over a set of 10^5 random + inputs of size 1-10^5 bits was 3 ulp. The use of 10 guard bits + almost certainly guarantees a max 1 ulp error. + """ + # Use direct division-based iteration if sqrt(x) < 2^400 + # Assume floating-point square root accurate to within 1 ulp, then: + # 0 Newton iterations good to 52 bits + # 1 Newton iterations good to 104 bits + # 2 Newton iterations good to 208 bits + # 3 Newton iterations good to 416 bits + if x < _1_800: + y = int(x**0.5) + if x >= _1_100: + y = (y + x//y) >> 1 + if x >= _1_200: + y = (y + x//y) >> 1 + if x >= _1_400: + y = (y + x//y) >> 1 + return y + bc = bitcount(x) + guard_bits = 10 + x <<= 2*guard_bits + bc += 2*guard_bits + bc += (bc&1) + hbc = bc//2 + startprec = min(50, hbc) + # Newton iteration for 1/sqrt(x), with floating-point starting value + r = int(2.0**(2*startprec) * (x >> (bc-2*startprec)) ** -0.5) + pp = startprec + for p in giant_steps(startprec, hbc): + # r**2, scaled from real size 2**(-bc) to 2**p + r2 = (r*r) >> (2*pp - p) + # x*r**2, scaled from real size ~1.0 to 2**p + xr2 = ((x >> (bc-p)) * r2) >> p + # New value of r, scaled from real size 2**(-bc/2) to 2**p + r = (r * ((3<> (pp+1) + pp = p + # (1/sqrt(x))*x = sqrt(x) + return (r*(x>>hbc)) >> (p+guard_bits) + +def sqrtrem_python(x): + """Correctly rounded integer (floor) square root with remainder.""" + # to check cutoff: + # plot(lambda x: timing(isqrt, 2**int(x)), [0,2000]) + if x < _1_600: + y = isqrt_small_python(x) + return y, x - y*y + y = isqrt_fast_python(x) + 1 + rem = x - y*y + # Correct remainder + while rem < 0: + y -= 1 + rem += (1+2*y) + else: + if rem: + while rem > 2*(1+y): + y += 1 + rem -= (1+2*y) + return y, rem + +def isqrt_python(x): + """Integer square root with correct (floor) rounding.""" + return sqrtrem_python(x)[0] + +def sqrt_fixed(x, prec): + return isqrt_fast(x<= '2': + isqrt_small = isqrt_fast = isqrt = gmpy.isqrt + sqrtrem = gmpy.isqrt_rem + else: + isqrt_small = isqrt_fast = isqrt = gmpy.sqrt + sqrtrem = gmpy.sqrtrem +elif BACKEND == 'sage': + isqrt_small = isqrt_fast = isqrt = \ + getattr(sage_utils, "isqrt", lambda n: MPZ(n).isqrt()) + sqrtrem = lambda n: MPZ(n).sqrtrem() +else: + isqrt_small = isqrt_small_python + isqrt_fast = isqrt_fast_python + isqrt = isqrt_python + sqrtrem = sqrtrem_python + + +def ifib(n, _cache={}): + """Computes the nth Fibonacci number as an integer, for + integer n.""" + if n < 0: + return (-1)**(-n+1) * ifib(-n) + if n in _cache: + return _cache[n] + m = n + # Use Dijkstra's logarithmic algorithm + # The following implementation is basically equivalent to + # http://en.literateprograms.org/Fibonacci_numbers_(Scheme) + a, b, p, q = MPZ_ONE, MPZ_ZERO, MPZ_ZERO, MPZ_ONE + while n: + if n & 1: + aq = a*q + a, b = b*q+aq+a*p, b*p+aq + n -= 1 + else: + qq = q*q + p, q = p*p+qq, qq+2*p*q + n >>= 1 + if m < 250: + _cache[m] = b + return b + +MAX_FACTORIAL_CACHE = 1000 + +def ifac(n, memo={0:1, 1:1}): + """Return n factorial (for integers n >= 0 only).""" + f = memo.get(n) + if f: + return f + k = len(memo) + p = memo[k-1] + MAX = MAX_FACTORIAL_CACHE + while k <= n: + p *= k + if k <= MAX: + memo[k] = p + k += 1 + return p + +def ifac2(n, memo_pair=[{0:1}, {1:1}]): + """Return n!! (double factorial), integers n >= 0 only.""" + memo = memo_pair[n&1] + f = memo.get(n) + if f: + return f + k = max(memo) + p = memo[k] + MAX = MAX_FACTORIAL_CACHE + while k < n: + k += 2 + p *= k + if k <= MAX: + memo[k] = p + return p + +if BACKEND == 'gmpy': + ifac = gmpy.fac +elif BACKEND == 'sage': + ifac = lambda n: int(sage.factorial(n)) + ifib = sage.fibonacci + +def list_primes(n): + n = n + 1 + sieve = list(xrange(n)) + sieve[:2] = [0, 0] + for i in xrange(2, int(n**0.5)+1): + if sieve[i]: + for j in xrange(i**2, n, i): + sieve[j] = 0 + return [p for p in sieve if p] + +if BACKEND == 'sage': + # Note: it is *VERY* important for performance that we convert + # the list to Python ints. + def list_primes(n): + return [int(_) for _ in sage.primes(n+1)] + +small_odd_primes = (3,5,7,11,13,17,19,23,29,31,37,41,43,47) +small_odd_primes_set = set(small_odd_primes) + +def isprime(n): + """ + Determines whether n is a prime number. A probabilistic test is + performed if n is very large. No special trick is used for detecting + perfect powers. + + >>> sum(list_primes(100000)) + 454396537 + >>> sum(n*isprime(n) for n in range(100000)) + 454396537 + + """ + n = int(n) + if not n & 1: + return n == 2 + if n < 50: + return n in small_odd_primes_set + for p in small_odd_primes: + if not n % p: + return False + m = n-1 + s = trailing(m) + d = m >> s + def test(a): + x = pow(a,d,n) + if x == 1 or x == m: + return True + for r in xrange(1,s): + x = x**2 % n + if x == m: + return True + return False + # See http://primes.utm.edu/prove/prove2_3.html + if n < 1373653: + witnesses = [2,3] + elif n < 341550071728321: + witnesses = [2,3,5,7,11,13,17] + else: + witnesses = small_odd_primes + for a in witnesses: + if not test(a): + return False + return True + +def moebius(n): + """ + Evaluates the Moebius function which is `mu(n) = (-1)^k` if `n` + is a product of `k` distinct primes and `mu(n) = 0` otherwise. + + TODO: speed up using factorization + """ + n = abs(int(n)) + if n < 2: + return n + factors = [] + for p in xrange(2, n+1): + if not (n % p): + if not (n % p**2): + return 0 + if not sum(p % f for f in factors): + factors.append(p) + return (-1)**len(factors) + +def gcd(*args): + a = 0 + for b in args: + if a: + while b: + a, b = b, a % b + else: + a = b + return a + + +# Comment by Juan Arias de Reyna: +# +# I learn this method to compute EulerE[2n] from van de Lune. +# +# We apply the formula EulerE[2n] = (-1)^n 2**(-2n) sum_{j=0}^n a(2n,2j+1) +# +# where the numbers a(n,j) vanish for j > n+1 or j <= -1 and satisfies +# +# a(0,-1) = a(0,0) = 0; a(0,1)= 1; a(0,2) = a(0,3) = 0 +# +# a(n,j) = a(n-1,j) when n+j is even +# a(n,j) = (j-1) a(n-1,j-1) + (j+1) a(n-1,j+1) when n+j is odd +# +# +# But we can use only one array unidimensional a(j) since to compute +# a(n,j) we only need to know a(n-1,k) where k and j are of different parity +# and we have not to conserve the used values. +# +# We cached up the values of Euler numbers to sufficiently high order. +# +# Important Observation: If we pretend to use the numbers +# EulerE[1], EulerE[2], ... , EulerE[n] +# it is convenient to compute first EulerE[n], since the algorithm +# computes first all +# the previous ones, and keeps them in the CACHE + +MAX_EULER_CACHE = 500 + +def eulernum(m, _cache={0:MPZ_ONE}): + r""" + Computes the Euler numbers `E(n)`, which can be defined as + coefficients of the Taylor expansion of `1/cosh x`: + + .. math :: + + \frac{1}{\cosh x} = \sum_{n=0}^\infty \frac{E_n}{n!} x^n + + Example:: + + >>> [int(eulernum(n)) for n in range(11)] + [1, 0, -1, 0, 5, 0, -61, 0, 1385, 0, -50521] + >>> [int(eulernum(n)) for n in range(11)] # test cache + [1, 0, -1, 0, 5, 0, -61, 0, 1385, 0, -50521] + + """ + # for odd m > 1, the Euler numbers are zero + if m & 1: + return MPZ_ZERO + f = _cache.get(m) + if f: + return f + MAX = MAX_EULER_CACHE + n = m + a = [MPZ(_) for _ in [0,0,1,0,0,0]] + for n in range(1, m+1): + for j in range(n+1, -1, -2): + a[j+1] = (j-1)*a[j] + (j+1)*a[j+2] + a.append(0) + suma = 0 + for k in range(n+1, -1, -2): + suma += a[k+1] + if n <= MAX: + _cache[n] = ((-1)**(n//2))*(suma // 2**n) + if n == m: + return ((-1)**(n//2))*suma // 2**n + +def stirling1(n, k): + """ + Stirling number of the first kind. + """ + if n < 0 or k < 0: + raise ValueError + if k >= n: + return MPZ(n == k) + if k < 1: + return MPZ_ZERO + L = [MPZ_ZERO] * (k+1) + L[1] = MPZ_ONE + for m in xrange(2, n+1): + for j in xrange(min(k, m), 0, -1): + L[j] = (m-1) * L[j] + L[j-1] + return (-1)**(n+k) * L[k] + +def stirling2(n, k): + """ + Stirling number of the second kind. + """ + if n < 0 or k < 0: + raise ValueError + if k >= n: + return MPZ(n == k) + if k <= 1: + return MPZ(k == 1) + s = MPZ_ZERO + t = MPZ_ONE + for j in xrange(k+1): + if (k + j) & 1: + s -= t * MPZ(j)**n + else: + s += t * MPZ(j)**n + t = t * (k - j) // (j + 1) + return s // ifac(k) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libmpc.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libmpc.py new file mode 100644 index 0000000000000000000000000000000000000000..cc22d0e73674676c8a9249ebc2d48da7f3be8b0d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/libmp/libmpc.py @@ -0,0 +1,835 @@ +""" +Low-level functions for complex arithmetic. +""" + +import sys + +from .backend import MPZ, MPZ_ZERO, MPZ_ONE, MPZ_TWO, BACKEND + +from .libmpf import (\ + round_floor, round_ceiling, round_down, round_up, + round_nearest, round_fast, bitcount, + bctable, normalize, normalize1, reciprocal_rnd, rshift, lshift, giant_steps, + negative_rnd, + to_str, to_fixed, from_man_exp, from_float, to_float, from_int, to_int, + fzero, fone, ftwo, fhalf, finf, fninf, fnan, fnone, + mpf_abs, mpf_pos, mpf_neg, mpf_add, mpf_sub, mpf_mul, + mpf_div, mpf_mul_int, mpf_shift, mpf_sqrt, mpf_hypot, + mpf_rdiv_int, mpf_floor, mpf_ceil, mpf_nint, mpf_frac, + mpf_sign, mpf_hash, + ComplexResult +) + +from .libelefun import (\ + mpf_pi, mpf_exp, mpf_log, mpf_cos_sin, mpf_cosh_sinh, mpf_tan, mpf_pow_int, + mpf_log_hypot, + mpf_cos_sin_pi, mpf_phi, + mpf_cos, mpf_sin, mpf_cos_pi, mpf_sin_pi, + mpf_atan, mpf_atan2, mpf_cosh, mpf_sinh, mpf_tanh, + mpf_asin, mpf_acos, mpf_acosh, mpf_nthroot, mpf_fibonacci +) + +# An mpc value is a (real, imag) tuple +mpc_one = fone, fzero +mpc_zero = fzero, fzero +mpc_two = ftwo, fzero +mpc_half = (fhalf, fzero) + +_infs = (finf, fninf) +_infs_nan = (finf, fninf, fnan) + +def mpc_is_inf(z): + """Check if either real or imaginary part is infinite""" + re, im = z + if re in _infs: return True + if im in _infs: return True + return False + +def mpc_is_infnan(z): + """Check if either real or imaginary part is infinite or nan""" + re, im = z + if re in _infs_nan: return True + if im in _infs_nan: return True + return False + +def mpc_to_str(z, dps, **kwargs): + re, im = z + rs = to_str(re, dps) + if im[0]: + return rs + " - " + to_str(mpf_neg(im), dps, **kwargs) + "j" + else: + return rs + " + " + to_str(im, dps, **kwargs) + "j" + +def mpc_to_complex(z, strict=False, rnd=round_fast): + re, im = z + return complex(to_float(re, strict, rnd), to_float(im, strict, rnd)) + +def mpc_hash(z): + if sys.version_info >= (3, 2): + re, im = z + h = mpf_hash(re) + sys.hash_info.imag * mpf_hash(im) + # Need to reduce either module 2^32 or 2^64 + h = h % (2**sys.hash_info.width) + return int(h) + else: + try: + return hash(mpc_to_complex(z, strict=True)) + except OverflowError: + return hash(z) + +def mpc_conjugate(z, prec, rnd=round_fast): + re, im = z + return re, mpf_neg(im, prec, rnd) + +def mpc_is_nonzero(z): + return z != mpc_zero + +def mpc_add(z, w, prec, rnd=round_fast): + a, b = z + c, d = w + return mpf_add(a, c, prec, rnd), mpf_add(b, d, prec, rnd) + +def mpc_add_mpf(z, x, prec, rnd=round_fast): + a, b = z + return mpf_add(a, x, prec, rnd), b + +def mpc_sub(z, w, prec=0, rnd=round_fast): + a, b = z + c, d = w + return mpf_sub(a, c, prec, rnd), mpf_sub(b, d, prec, rnd) + +def mpc_sub_mpf(z, p, prec=0, rnd=round_fast): + a, b = z + return mpf_sub(a, p, prec, rnd), b + +def mpc_pos(z, prec, rnd=round_fast): + a, b = z + return mpf_pos(a, prec, rnd), mpf_pos(b, prec, rnd) + +def mpc_neg(z, prec=None, rnd=round_fast): + a, b = z + return mpf_neg(a, prec, rnd), mpf_neg(b, prec, rnd) + +def mpc_shift(z, n): + a, b = z + return mpf_shift(a, n), mpf_shift(b, n) + +def mpc_abs(z, prec, rnd=round_fast): + """Absolute value of a complex number, |a+bi|. + Returns an mpf value.""" + a, b = z + return mpf_hypot(a, b, prec, rnd) + +def mpc_arg(z, prec, rnd=round_fast): + """Argument of a complex number. Returns an mpf value.""" + a, b = z + return mpf_atan2(b, a, prec, rnd) + +def mpc_floor(z, prec, rnd=round_fast): + a, b = z + return mpf_floor(a, prec, rnd), mpf_floor(b, prec, rnd) + +def mpc_ceil(z, prec, rnd=round_fast): + a, b = z + return mpf_ceil(a, prec, rnd), mpf_ceil(b, prec, rnd) + +def mpc_nint(z, prec, rnd=round_fast): + a, b = z + return mpf_nint(a, prec, rnd), mpf_nint(b, prec, rnd) + +def mpc_frac(z, prec, rnd=round_fast): + a, b = z + return mpf_frac(a, prec, rnd), mpf_frac(b, prec, rnd) + + +def mpc_mul(z, w, prec, rnd=round_fast): + """ + Complex multiplication. + + Returns the real and imaginary part of (a+bi)*(c+di), rounded to + the specified precision. The rounding mode applies to the real and + imaginary parts separately. + """ + a, b = z + c, d = w + p = mpf_mul(a, c) + q = mpf_mul(b, d) + r = mpf_mul(a, d) + s = mpf_mul(b, c) + re = mpf_sub(p, q, prec, rnd) + im = mpf_add(r, s, prec, rnd) + return re, im + +def mpc_square(z, prec, rnd=round_fast): + # (a+b*I)**2 == a**2 - b**2 + 2*I*a*b + a, b = z + p = mpf_mul(a,a) + q = mpf_mul(b,b) + r = mpf_mul(a,b, prec, rnd) + re = mpf_sub(p, q, prec, rnd) + im = mpf_shift(r, 1) + return re, im + +def mpc_mul_mpf(z, p, prec, rnd=round_fast): + a, b = z + re = mpf_mul(a, p, prec, rnd) + im = mpf_mul(b, p, prec, rnd) + return re, im + +def mpc_mul_imag_mpf(z, x, prec, rnd=round_fast): + """ + Multiply the mpc value z by I*x where x is an mpf value. + """ + a, b = z + re = mpf_neg(mpf_mul(b, x, prec, rnd)) + im = mpf_mul(a, x, prec, rnd) + return re, im + +def mpc_mul_int(z, n, prec, rnd=round_fast): + a, b = z + re = mpf_mul_int(a, n, prec, rnd) + im = mpf_mul_int(b, n, prec, rnd) + return re, im + +def mpc_div(z, w, prec, rnd=round_fast): + a, b = z + c, d = w + wp = prec + 10 + # mag = c*c + d*d + mag = mpf_add(mpf_mul(c, c), mpf_mul(d, d), wp) + # (a*c+b*d)/mag, (b*c-a*d)/mag + t = mpf_add(mpf_mul(a,c), mpf_mul(b,d), wp) + u = mpf_sub(mpf_mul(b,c), mpf_mul(a,d), wp) + return mpf_div(t,mag,prec,rnd), mpf_div(u,mag,prec,rnd) + +def mpc_div_mpf(z, p, prec, rnd=round_fast): + """Calculate z/p where p is real""" + a, b = z + re = mpf_div(a, p, prec, rnd) + im = mpf_div(b, p, prec, rnd) + return re, im + +def mpc_reciprocal(z, prec, rnd=round_fast): + """Calculate 1/z efficiently""" + a, b = z + m = mpf_add(mpf_mul(a,a),mpf_mul(b,b),prec+10) + re = mpf_div(a, m, prec, rnd) + im = mpf_neg(mpf_div(b, m, prec, rnd)) + return re, im + +def mpc_mpf_div(p, z, prec, rnd=round_fast): + """Calculate p/z where p is real efficiently""" + a, b = z + m = mpf_add(mpf_mul(a,a),mpf_mul(b,b), prec+10) + re = mpf_div(mpf_mul(a,p), m, prec, rnd) + im = mpf_div(mpf_neg(mpf_mul(b,p)), m, prec, rnd) + return re, im + +def complex_int_pow(a, b, n): + """Complex integer power: computes (a+b*I)**n exactly for + nonnegative n (a and b must be Python ints).""" + wre = 1 + wim = 0 + while n: + if n & 1: + wre, wim = wre*a - wim*b, wim*a + wre*b + n -= 1 + a, b = a*a - b*b, 2*a*b + n //= 2 + return wre, wim + +def mpc_pow(z, w, prec, rnd=round_fast): + if w[1] == fzero: + return mpc_pow_mpf(z, w[0], prec, rnd) + return mpc_exp(mpc_mul(mpc_log(z, prec+10), w, prec+10), prec, rnd) + +def mpc_pow_mpf(z, p, prec, rnd=round_fast): + psign, pman, pexp, pbc = p + if pexp >= 0: + return mpc_pow_int(z, (-1)**psign * (pman< 0: + aman <<= de + aexp = bexp + else: + bman <<= (-de) + bexp = aexp + re, im = complex_int_pow(aman, bman, n) + re = from_man_exp(re, int(n*aexp), prec, rnd) + im = from_man_exp(im, int(n*bexp), prec, rnd) + return re, im + return mpc_exp(mpc_mul_int(mpc_log(z, prec+10), n, prec+10), prec, rnd) + +def mpc_sqrt(z, prec, rnd=round_fast): + """Complex square root (principal branch). + + We have sqrt(a+bi) = sqrt((r+a)/2) + b/sqrt(2*(r+a))*i where + r = abs(a+bi), when a+bi is not a negative real number.""" + a, b = z + if b == fzero: + if a == fzero: + return (a, b) + # When a+bi is a negative real number, we get a real sqrt times i + if a[0]: + im = mpf_sqrt(mpf_neg(a), prec, rnd) + return (fzero, im) + else: + re = mpf_sqrt(a, prec, rnd) + return (re, fzero) + wp = prec+20 + if not a[0]: # case a positive + t = mpf_add(mpc_abs((a, b), wp), a, wp) # t = abs(a+bi) + a + u = mpf_shift(t, -1) # u = t/2 + re = mpf_sqrt(u, prec, rnd) # re = sqrt(u) + v = mpf_shift(t, 1) # v = 2*t + w = mpf_sqrt(v, wp) # w = sqrt(v) + im = mpf_div(b, w, prec, rnd) # im = b / w + else: # case a negative + t = mpf_sub(mpc_abs((a, b), wp), a, wp) # t = abs(a+bi) - a + u = mpf_shift(t, -1) # u = t/2 + im = mpf_sqrt(u, prec, rnd) # im = sqrt(u) + v = mpf_shift(t, 1) # v = 2*t + w = mpf_sqrt(v, wp) # w = sqrt(v) + re = mpf_div(b, w, prec, rnd) # re = b/w + if b[0]: + re = mpf_neg(re) + im = mpf_neg(im) + return re, im + +def mpc_nthroot_fixed(a, b, n, prec): + # a, b signed integers at fixed precision prec + start = 50 + a1 = int(rshift(a, prec - n*start)) + b1 = int(rshift(b, prec - n*start)) + try: + r = (a1 + 1j * b1)**(1.0/n) + re = r.real + im = r.imag + re = MPZ(int(re)) + im = MPZ(int(im)) + except OverflowError: + a1 = from_int(a1, start) + b1 = from_int(b1, start) + fn = from_int(n) + nth = mpf_rdiv_int(1, fn, start) + re, im = mpc_pow((a1, b1), (nth, fzero), start) + re = to_int(re) + im = to_int(im) + extra = 10 + prevp = start + extra1 = n + for p in giant_steps(start, prec+extra): + # this is slow for large n, unlike int_pow_fixed + re2, im2 = complex_int_pow(re, im, n-1) + re2 = rshift(re2, (n-1)*prevp - p - extra1) + im2 = rshift(im2, (n-1)*prevp - p - extra1) + r4 = (re2*re2 + im2*im2) >> (p + extra1) + ap = rshift(a, prec - p) + bp = rshift(b, prec - p) + rec = (ap * re2 + bp * im2) >> p + imc = (-ap * im2 + bp * re2) >> p + reb = (rec << p) // r4 + imb = (imc << p) // r4 + re = (reb + (n-1)*lshift(re, p-prevp))//n + im = (imb + (n-1)*lshift(im, p-prevp))//n + prevp = p + return re, im + +def mpc_nthroot(z, n, prec, rnd=round_fast): + """ + Complex n-th root. + + Use Newton method as in the real case when it is faster, + otherwise use z**(1/n) + """ + a, b = z + if a[0] == 0 and b == fzero: + re = mpf_nthroot(a, n, prec, rnd) + return (re, fzero) + if n < 2: + if n == 0: + return mpc_one + if n == 1: + return mpc_pos((a, b), prec, rnd) + if n == -1: + return mpc_div(mpc_one, (a, b), prec, rnd) + inverse = mpc_nthroot((a, b), -n, prec+5, reciprocal_rnd[rnd]) + return mpc_div(mpc_one, inverse, prec, rnd) + if n <= 20: + prec2 = int(1.2 * (prec + 10)) + asign, aman, aexp, abc = a + bsign, bman, bexp, bbc = b + pf = mpc_abs((a,b), prec) + if pf[-2] + pf[-1] > -10 and pf[-2] + pf[-1] < prec: + af = to_fixed(a, prec2) + bf = to_fixed(b, prec2) + re, im = mpc_nthroot_fixed(af, bf, n, prec2) + extra = 10 + re = from_man_exp(re, -prec2-extra, prec2, rnd) + im = from_man_exp(im, -prec2-extra, prec2, rnd) + return re, im + fn = from_int(n) + prec2 = prec+10 + 10 + nth = mpf_rdiv_int(1, fn, prec2) + re, im = mpc_pow((a, b), (nth, fzero), prec2, rnd) + re = normalize(re[0], re[1], re[2], re[3], prec, rnd) + im = normalize(im[0], im[1], im[2], im[3], prec, rnd) + return re, im + +def mpc_cbrt(z, prec, rnd=round_fast): + """ + Complex cubic root. + """ + return mpc_nthroot(z, 3, prec, rnd) + +def mpc_exp(z, prec, rnd=round_fast): + """ + Complex exponential function. + + We use the direct formula exp(a+bi) = exp(a) * (cos(b) + sin(b)*i) + for the computation. This formula is very nice because it is + pefectly stable; since we just do real multiplications, the only + numerical errors that can creep in are single-ulp rounding errors. + + The formula is efficient since mpmath's real exp is quite fast and + since we can compute cos and sin simultaneously. + + It is no problem if a and b are large; if the implementations of + exp/cos/sin are accurate and efficient for all real numbers, then + so is this function for all complex numbers. + """ + a, b = z + if a == fzero: + return mpf_cos_sin(b, prec, rnd) + if b == fzero: + return mpf_exp(a, prec, rnd), fzero + mag = mpf_exp(a, prec+4, rnd) + c, s = mpf_cos_sin(b, prec+4, rnd) + re = mpf_mul(mag, c, prec, rnd) + im = mpf_mul(mag, s, prec, rnd) + return re, im + +def mpc_log(z, prec, rnd=round_fast): + re = mpf_log_hypot(z[0], z[1], prec, rnd) + im = mpc_arg(z, prec, rnd) + return re, im + +def mpc_cos(z, prec, rnd=round_fast): + """Complex cosine. The formula used is cos(a+bi) = cos(a)*cosh(b) - + sin(a)*sinh(b)*i. + + The same comments apply as for the complex exp: only real + multiplications are pewrormed, so no cancellation errors are + possible. The formula is also efficient since we can compute both + pairs (cos, sin) and (cosh, sinh) in single stwps.""" + a, b = z + if b == fzero: + return mpf_cos(a, prec, rnd), fzero + if a == fzero: + return mpf_cosh(b, prec, rnd), fzero + wp = prec + 6 + c, s = mpf_cos_sin(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + re = mpf_mul(c, ch, prec, rnd) + im = mpf_mul(s, sh, prec, rnd) + return re, mpf_neg(im) + +def mpc_sin(z, prec, rnd=round_fast): + """Complex sine. We have sin(a+bi) = sin(a)*cosh(b) + + cos(a)*sinh(b)*i. See the docstring for mpc_cos for additional + comments.""" + a, b = z + if b == fzero: + return mpf_sin(a, prec, rnd), fzero + if a == fzero: + return fzero, mpf_sinh(b, prec, rnd) + wp = prec + 6 + c, s = mpf_cos_sin(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + re = mpf_mul(s, ch, prec, rnd) + im = mpf_mul(c, sh, prec, rnd) + return re, im + +def mpc_tan(z, prec, rnd=round_fast): + """Complex tangent. Computed as tan(a+bi) = sin(2a)/M + sinh(2b)/M*i + where M = cos(2a) + cosh(2b).""" + a, b = z + asign, aman, aexp, abc = a + bsign, bman, bexp, bbc = b + if b == fzero: return mpf_tan(a, prec, rnd), fzero + if a == fzero: return fzero, mpf_tanh(b, prec, rnd) + wp = prec + 15 + a = mpf_shift(a, 1) + b = mpf_shift(b, 1) + c, s = mpf_cos_sin(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + # TODO: handle cancellation when c ~= -1 and ch ~= 1 + mag = mpf_add(c, ch, wp) + re = mpf_div(s, mag, prec, rnd) + im = mpf_div(sh, mag, prec, rnd) + return re, im + +def mpc_cos_pi(z, prec, rnd=round_fast): + a, b = z + if b == fzero: + return mpf_cos_pi(a, prec, rnd), fzero + b = mpf_mul(b, mpf_pi(prec+5), prec+5) + if a == fzero: + return mpf_cosh(b, prec, rnd), fzero + wp = prec + 6 + c, s = mpf_cos_sin_pi(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + re = mpf_mul(c, ch, prec, rnd) + im = mpf_mul(s, sh, prec, rnd) + return re, mpf_neg(im) + +def mpc_sin_pi(z, prec, rnd=round_fast): + a, b = z + if b == fzero: + return mpf_sin_pi(a, prec, rnd), fzero + b = mpf_mul(b, mpf_pi(prec+5), prec+5) + if a == fzero: + return fzero, mpf_sinh(b, prec, rnd) + wp = prec + 6 + c, s = mpf_cos_sin_pi(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + re = mpf_mul(s, ch, prec, rnd) + im = mpf_mul(c, sh, prec, rnd) + return re, im + +def mpc_cos_sin(z, prec, rnd=round_fast): + a, b = z + if a == fzero: + ch, sh = mpf_cosh_sinh(b, prec, rnd) + return (ch, fzero), (fzero, sh) + if b == fzero: + c, s = mpf_cos_sin(a, prec, rnd) + return (c, fzero), (s, fzero) + wp = prec + 6 + c, s = mpf_cos_sin(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + cre = mpf_mul(c, ch, prec, rnd) + cim = mpf_mul(s, sh, prec, rnd) + sre = mpf_mul(s, ch, prec, rnd) + sim = mpf_mul(c, sh, prec, rnd) + return (cre, mpf_neg(cim)), (sre, sim) + +def mpc_cos_sin_pi(z, prec, rnd=round_fast): + a, b = z + if b == fzero: + c, s = mpf_cos_sin_pi(a, prec, rnd) + return (c, fzero), (s, fzero) + b = mpf_mul(b, mpf_pi(prec+5), prec+5) + if a == fzero: + ch, sh = mpf_cosh_sinh(b, prec, rnd) + return (ch, fzero), (fzero, sh) + wp = prec + 6 + c, s = mpf_cos_sin_pi(a, wp) + ch, sh = mpf_cosh_sinh(b, wp) + cre = mpf_mul(c, ch, prec, rnd) + cim = mpf_mul(s, sh, prec, rnd) + sre = mpf_mul(s, ch, prec, rnd) + sim = mpf_mul(c, sh, prec, rnd) + return (cre, mpf_neg(cim)), (sre, sim) + +def mpc_cosh(z, prec, rnd=round_fast): + """Complex hyperbolic cosine. Computed as cosh(z) = cos(z*i).""" + a, b = z + return mpc_cos((b, mpf_neg(a)), prec, rnd) + +def mpc_sinh(z, prec, rnd=round_fast): + """Complex hyperbolic sine. Computed as sinh(z) = -i*sin(z*i).""" + a, b = z + b, a = mpc_sin((b, a), prec, rnd) + return a, b + +def mpc_tanh(z, prec, rnd=round_fast): + """Complex hyperbolic tangent. Computed as tanh(z) = -i*tan(z*i).""" + a, b = z + b, a = mpc_tan((b, a), prec, rnd) + return a, b + +# TODO: avoid loss of accuracy +def mpc_atan(z, prec, rnd=round_fast): + a, b = z + # atan(z) = (I/2)*(log(1-I*z) - log(1+I*z)) + # x = 1-I*z = 1 + b - I*a + # y = 1+I*z = 1 - b + I*a + wp = prec + 15 + x = mpf_add(fone, b, wp), mpf_neg(a) + y = mpf_sub(fone, b, wp), a + l1 = mpc_log(x, wp) + l2 = mpc_log(y, wp) + a, b = mpc_sub(l1, l2, prec, rnd) + # (I/2) * (a+b*I) = (-b/2 + a/2*I) + v = mpf_neg(mpf_shift(b,-1)), mpf_shift(a,-1) + # Subtraction at infinity gives correct real part but + # wrong imaginary part (should be zero) + if v[1] == fnan and mpc_is_inf(z): + v = (v[0], fzero) + return v + +beta_crossover = from_float(0.6417) +alpha_crossover = from_float(1.5) + +def acos_asin(z, prec, rnd, n): + """ complex acos for n = 0, asin for n = 1 + The algorithm is described in + T.E. Hull, T.F. Fairgrieve and P.T.P. Tang + 'Implementing the Complex Arcsine and Arcosine Functions + using Exception Handling', + ACM Trans. on Math. Software Vol. 23 (1997), p299 + The complex acos and asin can be defined as + acos(z) = acos(beta) - I*sign(a)* log(alpha + sqrt(alpha**2 -1)) + asin(z) = asin(beta) + I*sign(a)* log(alpha + sqrt(alpha**2 -1)) + where z = a + I*b + alpha = (1/2)*(r + s); beta = (1/2)*(r - s) = a/alpha + r = sqrt((a+1)**2 + y**2); s = sqrt((a-1)**2 + y**2) + These expressions are rewritten in different ways in different + regions, delimited by two crossovers alpha_crossover and beta_crossover, + and by abs(a) <= 1, in order to improve the numerical accuracy. + """ + a, b = z + wp = prec + 10 + # special cases with real argument + if b == fzero: + am = mpf_sub(fone, mpf_abs(a), wp) + # case abs(a) <= 1 + if not am[0]: + if n == 0: + return mpf_acos(a, prec, rnd), fzero + else: + return mpf_asin(a, prec, rnd), fzero + # cases abs(a) > 1 + else: + # case a < -1 + if a[0]: + pi = mpf_pi(prec, rnd) + c = mpf_acosh(mpf_neg(a), prec, rnd) + if n == 0: + return pi, mpf_neg(c) + else: + return mpf_neg(mpf_shift(pi, -1)), c + # case a > 1 + else: + c = mpf_acosh(a, prec, rnd) + if n == 0: + return fzero, c + else: + pi = mpf_pi(prec, rnd) + return mpf_shift(pi, -1), mpf_neg(c) + asign = bsign = 0 + if a[0]: + a = mpf_neg(a) + asign = 1 + if b[0]: + b = mpf_neg(b) + bsign = 1 + am = mpf_sub(fone, a, wp) + ap = mpf_add(fone, a, wp) + r = mpf_hypot(ap, b, wp) + s = mpf_hypot(am, b, wp) + alpha = mpf_shift(mpf_add(r, s, wp), -1) + beta = mpf_div(a, alpha, wp) + b2 = mpf_mul(b,b, wp) + # case beta <= beta_crossover + if not mpf_sub(beta_crossover, beta, wp)[0]: + if n == 0: + re = mpf_acos(beta, wp) + else: + re = mpf_asin(beta, wp) + else: + # to compute the real part in this region use the identity + # asin(beta) = atan(beta/sqrt(1-beta**2)) + # beta/sqrt(1-beta**2) = (alpha + a) * (alpha - a) + # alpha + a is numerically accurate; alpha - a can have + # cancellations leading to numerical inaccuracies, so rewrite + # it in differente ways according to the region + Ax = mpf_add(alpha, a, wp) + # case a <= 1 + if not am[0]: + # c = b*b/(r + (a+1)); d = (s + (1-a)) + # alpha - a = (1/2)*(c + d) + # case n=0: re = atan(sqrt((1/2) * Ax * (c + d))/a) + # case n=1: re = atan(a/sqrt((1/2) * Ax * (c + d))) + c = mpf_div(b2, mpf_add(r, ap, wp), wp) + d = mpf_add(s, am, wp) + re = mpf_shift(mpf_mul(Ax, mpf_add(c, d, wp), wp), -1) + if n == 0: + re = mpf_atan(mpf_div(mpf_sqrt(re, wp), a, wp), wp) + else: + re = mpf_atan(mpf_div(a, mpf_sqrt(re, wp), wp), wp) + else: + # c = Ax/(r + (a+1)); d = Ax/(s - (1-a)) + # alpha - a = (1/2)*(c + d) + # case n = 0: re = atan(b*sqrt(c + d)/2/a) + # case n = 1: re = atan(a/(b*sqrt(c + d)/2) + c = mpf_div(Ax, mpf_add(r, ap, wp), wp) + d = mpf_div(Ax, mpf_sub(s, am, wp), wp) + re = mpf_shift(mpf_add(c, d, wp), -1) + re = mpf_mul(b, mpf_sqrt(re, wp), wp) + if n == 0: + re = mpf_atan(mpf_div(re, a, wp), wp) + else: + re = mpf_atan(mpf_div(a, re, wp), wp) + # to compute alpha + sqrt(alpha**2 - 1), if alpha <= alpha_crossover + # replace it with 1 + Am1 + sqrt(Am1*(alpha+1))) + # where Am1 = alpha -1 + # if alpha <= alpha_crossover: + if not mpf_sub(alpha_crossover, alpha, wp)[0]: + c1 = mpf_div(b2, mpf_add(r, ap, wp), wp) + # case a < 1 + if mpf_neg(am)[0]: + # Am1 = (1/2) * (b*b/(r + (a+1)) + b*b/(s + (1-a)) + c2 = mpf_add(s, am, wp) + c2 = mpf_div(b2, c2, wp) + Am1 = mpf_shift(mpf_add(c1, c2, wp), -1) + else: + # Am1 = (1/2) * (b*b/(r + (a+1)) + (s - (1-a))) + c2 = mpf_sub(s, am, wp) + Am1 = mpf_shift(mpf_add(c1, c2, wp), -1) + # im = log(1 + Am1 + sqrt(Am1*(alpha+1))) + im = mpf_mul(Am1, mpf_add(alpha, fone, wp), wp) + im = mpf_log(mpf_add(fone, mpf_add(Am1, mpf_sqrt(im, wp), wp), wp), wp) + else: + # im = log(alpha + sqrt(alpha*alpha - 1)) + im = mpf_sqrt(mpf_sub(mpf_mul(alpha, alpha, wp), fone, wp), wp) + im = mpf_log(mpf_add(alpha, im, wp), wp) + if asign: + if n == 0: + re = mpf_sub(mpf_pi(wp), re, wp) + else: + re = mpf_neg(re) + if not bsign and n == 0: + im = mpf_neg(im) + if bsign and n == 1: + im = mpf_neg(im) + re = normalize(re[0], re[1], re[2], re[3], prec, rnd) + im = normalize(im[0], im[1], im[2], im[3], prec, rnd) + return re, im + +def mpc_acos(z, prec, rnd=round_fast): + return acos_asin(z, prec, rnd, 0) + +def mpc_asin(z, prec, rnd=round_fast): + return acos_asin(z, prec, rnd, 1) + +def mpc_asinh(z, prec, rnd=round_fast): + # asinh(z) = I * asin(-I z) + a, b = z + a, b = mpc_asin((b, mpf_neg(a)), prec, rnd) + return mpf_neg(b), a + +def mpc_acosh(z, prec, rnd=round_fast): + # acosh(z) = -I * acos(z) for Im(acos(z)) <= 0 + # +I * acos(z) otherwise + a, b = mpc_acos(z, prec, rnd) + if b[0] or b == fzero: + return mpf_neg(b), a + else: + return b, mpf_neg(a) + +def mpc_atanh(z, prec, rnd=round_fast): + # atanh(z) = (log(1+z)-log(1-z))/2 + wp = prec + 15 + a = mpc_add(z, mpc_one, wp) + b = mpc_sub(mpc_one, z, wp) + a = mpc_log(a, wp) + b = mpc_log(b, wp) + v = mpc_shift(mpc_sub(a, b, wp), -1) + # Subtraction at infinity gives correct imaginary part but + # wrong real part (should be zero) + if v[0] == fnan and mpc_is_inf(z): + v = (fzero, v[1]) + return v + +def mpc_fibonacci(z, prec, rnd=round_fast): + re, im = z + if im == fzero: + return (mpf_fibonacci(re, prec, rnd), fzero) + size = max(abs(re[2]+re[3]), abs(re[2]+re[3])) + wp = prec + size + 20 + a = mpf_phi(wp) + b = mpf_add(mpf_shift(a, 1), fnone, wp) + u = mpc_pow((a, fzero), z, wp) + v = mpc_cos_pi(z, wp) + v = mpc_div(v, u, wp) + u = mpc_sub(u, v, wp) + u = mpc_div_mpf(u, b, prec, rnd) + return u + +def mpf_expj(x, prec, rnd='f'): + raise ComplexResult + +def mpc_expj(z, prec, rnd='f'): + re, im = z + if im == fzero: + return mpf_cos_sin(re, prec, rnd) + if re == fzero: + return mpf_exp(mpf_neg(im), prec, rnd), fzero + ey = mpf_exp(mpf_neg(im), prec+10) + c, s = mpf_cos_sin(re, prec+10) + re = mpf_mul(ey, c, prec, rnd) + im = mpf_mul(ey, s, prec, rnd) + return re, im + +def mpf_expjpi(x, prec, rnd='f'): + raise ComplexResult + +def mpc_expjpi(z, prec, rnd='f'): + re, im = z + if im == fzero: + return mpf_cos_sin_pi(re, prec, rnd) + sign, man, exp, bc = im + wp = prec+10 + if man: + wp += max(0, exp+bc) + im = mpf_neg(mpf_mul(mpf_pi(wp), im, wp)) + if re == fzero: + return mpf_exp(im, prec, rnd), fzero + ey = mpf_exp(im, prec+10) + c, s = mpf_cos_sin_pi(re, prec+10) + re = mpf_mul(ey, c, prec, rnd) + im = mpf_mul(ey, s, prec, rnd) + return re, im + + +if BACKEND == 'sage': + try: + import sage.libs.mpmath.ext_libmp as _lbmp + mpc_exp = _lbmp.mpc_exp + mpc_sqrt = _lbmp.mpc_sqrt + except (ImportError, AttributeError): + print("Warning: Sage imports in libmpc failed") diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5387556064b641cdadfcdaf50914ebd39eff3f65 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_functions2.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c96a8c60ccaff3dbe94603afb496582f94300b3dd5c8ec016ff0c7e71f975baf +size 172649 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h new file mode 100644 index 0000000000000000000000000000000000000000..b7ea50da7beb2187e77f7606dd70faed0e4b4add --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h @@ -0,0 +1,98 @@ +/* + * Copyright 2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#include + +#if !defined(_CUPTI_OPENACC_H_) +#define _CUPTI_OPENACC_H_ + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__LP64__) +#define CUPTILP64 1 +#elif defined(_WIN64) +#define CUPTILP64 1 +#else +#undef CUPTILP64 +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \brief Initialize OpenACC support + * + * \param profRegister function of type acc_prof_reg as obtained from acc_register_library + * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library + * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library + */ +CUptiResult CUPTIAPI +cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup); + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_OPENACC_H_*/ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h new file mode 100644 index 0000000000000000000000000000000000000000..fb98c23e5591a45789d7e72a0a4561dce199905a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h @@ -0,0 +1,10982 @@ +/* + * Copyright 2011-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(_CUPTI_ACTIVITY_H_) +#define _CUPTI_ACTIVITY_H_ + +#include +#include +#include +#include +#include +#if defined(CUPTI_DIRECTIVE_SUPPORT) +#include +#include +#endif + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__LP64__) +#define CUPTILP64 1 +#elif defined(_WIN64) +#define CUPTILP64 1 +#else +#undef CUPTILP64 +#endif + +#define ACTIVITY_RECORD_ALIGNMENT 8 +#if defined(_WIN32) // Windows 32- and 64-bit +#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding +#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT)) +#define END_PACKED_ALIGNMENT __pragma(pack(pop)) +#elif defined(__GNUC__) // GCC +#define START_PACKED_ALIGNMENT +#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT))) +#define END_PACKED_ALIGNMENT +#else // all other compilers +#define START_PACKED_ALIGNMENT +#define PACKED_ALIGNMENT +#define END_PACKED_ALIGNMENT +#endif + +#define CUPTI_UNIFIED_MEMORY_CPU_DEVICE_ID ((uint32_t) 0xFFFFFFFFU) +#define CUPTI_INVALID_CONTEXT_ID ((uint32_t) 0xFFFFFFFFU) +#define CUPTI_INVALID_STREAM_ID ((uint32_t) 0xFFFFFFFFU) +#define CUPTI_INVALID_CHANNEL_ID ((uint32_t) 0xFFFFFFFFU) +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \defgroup CUPTI_ACTIVITY_API CUPTI Activity API + * Functions, types, and enums that implement the CUPTI Activity API. + * @{ + */ + +/** + * \brief The kinds of activity records. + * + * Each activity record kind represents information about a GPU or an + * activity occurring on a CPU or GPU. Each kind is associated with a + * activity record structure that holds the information associated + * with the kind. + * \see CUpti_Activity + * \see CUpti_ActivityAPI + * \see CUpti_ActivityContext + * \see CUpti_ActivityDevice + * \see CUpti_ActivityDevice2 + * \see CUpti_ActivityDevice3 + * \see CUpti_ActivityDevice4 + * \see CUpti_ActivityDeviceAttribute + * \see CUpti_ActivityEvent + * \see CUpti_ActivityEventInstance + * \see CUpti_ActivityKernel + * \see CUpti_ActivityKernel2 + * \see CUpti_ActivityKernel3 + * \see CUpti_ActivityKernel4 + * \see CUpti_ActivityKernel5 + * \see CUpti_ActivityKernel6 + * \see CUpti_ActivityKernel7 + * \see CUpti_ActivityKernel8 + * \see CUpti_ActivityCdpKernel + * \see CUpti_ActivityPreemption + * \see CUpti_ActivityMemcpy + * \see CUpti_ActivityMemcpy3 + * \see CUpti_ActivityMemcpy4 + * \see CUpti_ActivityMemcpy5 + * \see CUpti_ActivityMemcpyPtoP + * \see CUpti_ActivityMemcpyPtoP2 + * \see CUpti_ActivityMemcpyPtoP3 + * \see CUpti_ActivityMemcpyPtoP4 + * \see CUpti_ActivityMemset + * \see CUpti_ActivityMemset2 + * \see CUpti_ActivityMemset3 + * \see CUpti_ActivityMemset4 + * \see CUpti_ActivityMetric + * \see CUpti_ActivityMetricInstance + * \see CUpti_ActivityName + * \see CUpti_ActivityMarker + * \see CUpti_ActivityMarker2 + * \see CUpti_ActivityMarkerData + * \see CUpti_ActivitySourceLocator + * \see CUpti_ActivityGlobalAccess + * \see CUpti_ActivityGlobalAccess2 + * \see CUpti_ActivityGlobalAccess3 + * \see CUpti_ActivityBranch + * \see CUpti_ActivityBranch2 + * \see CUpti_ActivityOverhead + * \see CUpti_ActivityEnvironment + * \see CUpti_ActivityInstructionExecution + * \see CUpti_ActivityUnifiedMemoryCounter + * \see CUpti_ActivityFunction + * \see CUpti_ActivityModule + * \see CUpti_ActivitySharedAccess + * \see CUpti_ActivityPCSampling + * \see CUpti_ActivityPCSampling2 + * \see CUpti_ActivityPCSampling3 + * \see CUpti_ActivityPCSamplingRecordInfo + * \see CUpti_ActivityCudaEvent + * \see CUpti_ActivityStream + * \see CUpti_ActivitySynchronization + * \see CUpti_ActivityInstructionCorrelation + * \see CUpti_ActivityExternalCorrelation + * \see CUpti_ActivityUnifiedMemoryCounter2 + * \see CUpti_ActivityOpenAccData + * \see CUpti_ActivityOpenAccLaunch + * \see CUpti_ActivityOpenAccOther + * \see CUpti_ActivityOpenMp + * \see CUpti_ActivityNvLink + * \see CUpti_ActivityNvLink2 + * \see CUpti_ActivityNvLink3 + * \see CUpti_ActivityNvLink4 + * \see CUpti_ActivityMemory + * \see CUpti_ActivityPcie + */ +typedef enum { + /** + * The activity record is invalid. + */ + CUPTI_ACTIVITY_KIND_INVALID = 0, + /** + * A host<->host, host<->device, or device<->device memory copy. The + * corresponding activity record structure is \ref + * CUpti_ActivityMemcpy5. + */ + CUPTI_ACTIVITY_KIND_MEMCPY = 1, + /** + * A memory set executing on the GPU. The corresponding activity + * record structure is \ref CUpti_ActivityMemset4. + */ + CUPTI_ACTIVITY_KIND_MEMSET = 2, + /** + * A kernel executing on the GPU. This activity kind may significantly change + * the overall performance characteristics of the application because all + * kernel executions are serialized on the GPU. Other activity kind for kernel + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL doesn't break kernel concurrency. + * The corresponding activity record structure is \ref CUpti_ActivityKernel8. + */ + CUPTI_ACTIVITY_KIND_KERNEL = 3, + /** + * A CUDA driver API function execution. The corresponding activity + * record structure is \ref CUpti_ActivityAPI. + */ + CUPTI_ACTIVITY_KIND_DRIVER = 4, + /** + * A CUDA runtime API function execution. The corresponding activity + * record structure is \ref CUpti_ActivityAPI. + */ + CUPTI_ACTIVITY_KIND_RUNTIME = 5, + /** + * An event value. The corresponding activity record structure is + * \ref CUpti_ActivityEvent. + */ + CUPTI_ACTIVITY_KIND_EVENT = 6, + /** + * A metric value. The corresponding activity record structure is + * \ref CUpti_ActivityMetric. + */ + CUPTI_ACTIVITY_KIND_METRIC = 7, + /** + * Information about a device. The corresponding activity record + * structure is \ref CUpti_ActivityDevice4. + */ + CUPTI_ACTIVITY_KIND_DEVICE = 8, + /** + * Information about a context. The corresponding activity record + * structure is \ref CUpti_ActivityContext. + */ + CUPTI_ACTIVITY_KIND_CONTEXT = 9, + /** + * A kernel executing on the GPU. This activity kind doesn't break + * kernel concurrency. The corresponding activity record structure + * is \ref CUpti_ActivityKernel8. + */ + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL = 10, + /** + * Resource naming done via NVTX APIs for thread, device, context, etc. + * The corresponding activity record structure is \ref CUpti_ActivityName. + */ + CUPTI_ACTIVITY_KIND_NAME = 11, + /** + * Instantaneous, start, or end NVTX marker. The corresponding activity + * record structure is \ref CUpti_ActivityMarker2. + */ + CUPTI_ACTIVITY_KIND_MARKER = 12, + /** + * Extended, optional, data about a marker. The corresponding + * activity record structure is \ref CUpti_ActivityMarkerData. + */ + CUPTI_ACTIVITY_KIND_MARKER_DATA = 13, + /** + * Source information about source level result. The corresponding + * activity record structure is \ref CUpti_ActivitySourceLocator. + */ + CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR = 14, + /** + * Results for source-level global acccess. The + * corresponding activity record structure is \ref + * CUpti_ActivityGlobalAccess3. + */ + CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS = 15, + /** + * Results for source-level branch. The corresponding + * activity record structure is \ref CUpti_ActivityBranch2. + */ + CUPTI_ACTIVITY_KIND_BRANCH = 16, + /** + * Overhead activity records. The + * corresponding activity record structure is + * \ref CUpti_ActivityOverhead. + */ + CUPTI_ACTIVITY_KIND_OVERHEAD = 17, + /** + * A CDP (CUDA Dynamic Parallel) kernel executing on the GPU. The + * corresponding activity record structure is \ref + * CUpti_ActivityCdpKernel. This activity can not be directly + * enabled or disabled. It is enabled and disabled through + * concurrent kernel activity i.e. _CONCURRENT_KERNEL. + */ + CUPTI_ACTIVITY_KIND_CDP_KERNEL = 18, + /** + * Preemption activity record indicating a preemption of a CDP (CUDA + * Dynamic Parallel) kernel executing on the GPU. The corresponding + * activity record structure is \ref CUpti_ActivityPreemption. + */ + CUPTI_ACTIVITY_KIND_PREEMPTION = 19, + /** + * Environment activity records indicating power, clock, thermal, + * etc. levels of the GPU. The corresponding activity record + * structure is \ref CUpti_ActivityEnvironment. + */ + CUPTI_ACTIVITY_KIND_ENVIRONMENT = 20, + /** + * An event value associated with a specific event domain + * instance. The corresponding activity record structure is \ref + * CUpti_ActivityEventInstance. + */ + CUPTI_ACTIVITY_KIND_EVENT_INSTANCE = 21, + /** + * A peer to peer memory copy. The corresponding activity record + * structure is \ref CUpti_ActivityMemcpyPtoP4. + */ + CUPTI_ACTIVITY_KIND_MEMCPY2 = 22, + /** + * A metric value associated with a specific metric domain + * instance. The corresponding activity record structure is \ref + * CUpti_ActivityMetricInstance. + */ + CUPTI_ACTIVITY_KIND_METRIC_INSTANCE = 23, + /** + * Results for source-level instruction execution. + * The corresponding activity record structure is \ref + * CUpti_ActivityInstructionExecution. + */ + CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION = 24, + /** + * Unified Memory counter record. The corresponding activity + * record structure is \ref CUpti_ActivityUnifiedMemoryCounter2. + */ + CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER = 25, + /** + * Device global/function record. The corresponding activity + * record structure is \ref CUpti_ActivityFunction. + */ + CUPTI_ACTIVITY_KIND_FUNCTION = 26, + /** + * CUDA Module record. The corresponding activity + * record structure is \ref CUpti_ActivityModule. + */ + CUPTI_ACTIVITY_KIND_MODULE = 27, + /** + * A device attribute value. The corresponding activity record + * structure is \ref CUpti_ActivityDeviceAttribute. + */ + CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE = 28, + /** + * Results for source-level shared acccess. The + * corresponding activity record structure is \ref + * CUpti_ActivitySharedAccess. + */ + CUPTI_ACTIVITY_KIND_SHARED_ACCESS = 29, + /** + * Enable PC sampling for kernels. This will serialize + * kernels. The corresponding activity record structure + * is \ref CUpti_ActivityPCSampling3. + */ + CUPTI_ACTIVITY_KIND_PC_SAMPLING = 30, + /** + * Summary information about PC sampling records. The + * corresponding activity record structure is \ref + * CUpti_ActivityPCSamplingRecordInfo. + */ + CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO = 31, + /** + * SASS/Source line-by-line correlation record. + * This will generate sass/source correlation for functions that have source + * level analysis or pc sampling results. The records will be generated only + * when either of source level analysis or pc sampling activity is enabled. + * The corresponding activity record structure is \ref + * CUpti_ActivityInstructionCorrelation. + */ + CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION = 32, + /** + * OpenACC data events. + * The corresponding activity record structure is \ref + * CUpti_ActivityOpenAccData. + */ + CUPTI_ACTIVITY_KIND_OPENACC_DATA = 33, + /** + * OpenACC launch events. + * The corresponding activity record structure is \ref + * CUpti_ActivityOpenAccLaunch. + */ + CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH = 34, + /** + * OpenACC other events. + * The corresponding activity record structure is \ref + * CUpti_ActivityOpenAccOther. + */ + CUPTI_ACTIVITY_KIND_OPENACC_OTHER = 35, + /** + * Information about a CUDA event. The + * corresponding activity record structure is \ref + * CUpti_ActivityCudaEvent. + */ + CUPTI_ACTIVITY_KIND_CUDA_EVENT = 36, + /** + * Information about a CUDA stream. The + * corresponding activity record structure is \ref + * CUpti_ActivityStream. + */ + CUPTI_ACTIVITY_KIND_STREAM = 37, + /** + * Records for synchronization management. The + * corresponding activity record structure is \ref + * CUpti_ActivitySynchronization. + */ + CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = 38, + /** + * Records for correlation of different programming APIs. The + * corresponding activity record structure is \ref + * CUpti_ActivityExternalCorrelation. + */ + CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION = 39, + /** + * NVLink information. + * The corresponding activity record structure is \ref + * CUpti_ActivityNvLink4. + */ + CUPTI_ACTIVITY_KIND_NVLINK = 40, + /** + * Instantaneous Event information. + * The corresponding activity record structure is \ref + * CUpti_ActivityInstantaneousEvent. + */ + CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT = 41, + /** + * Instantaneous Event information for a specific event + * domain instance. + * The corresponding activity record structure is \ref + * CUpti_ActivityInstantaneousEventInstance + */ + CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE = 42, + /** + * Instantaneous Metric information + * The corresponding activity record structure is \ref + * CUpti_ActivityInstantaneousMetric. + */ + CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC = 43, + /** + * Instantaneous Metric information for a specific metric + * domain instance. + * The corresponding activity record structure is \ref + * CUpti_ActivityInstantaneousMetricInstance. + */ + CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE = 44, + /** + * Memory activity tracking allocation and freeing of the memory + * The corresponding activity record structure is \ref + * CUpti_ActivityMemory. + */ + CUPTI_ACTIVITY_KIND_MEMORY = 45, + /** + * PCI devices information used for PCI topology. + * The corresponding activity record structure is \ref + * CUpti_ActivityPcie. + */ + CUPTI_ACTIVITY_KIND_PCIE = 46, + /** + * OpenMP parallel events. + * The corresponding activity record structure is \ref + * CUpti_ActivityOpenMp. + */ + CUPTI_ACTIVITY_KIND_OPENMP = 47, + /** + * A CUDA driver kernel launch occurring outside of any + * public API function execution. Tools can handle these + * like records for driver API launch functions, although + * the cbid field is not used here. + * The corresponding activity record structure is \ref + * CUpti_ActivityAPI. + */ + CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API = 48, + /** + * Memory activity tracking allocation and freeing of the memory + * The corresponding activity record structure is \ref + * CUpti_ActivityMemory3. + */ + CUPTI_ACTIVITY_KIND_MEMORY2 = 49, + + /** + * Memory pool activity tracking creation, destruction and + * triming of the memory pool. + * The corresponding activity record structure is \ref + * CUpti_ActivityMemoryPool2. + */ + CUPTI_ACTIVITY_KIND_MEMORY_POOL = 50, + + /** + * The corresponding activity record structure is \ref CUpti_ActivityGraphTrace. + */ + CUPTI_ACTIVITY_KIND_GRAPH_TRACE = 51, + + /** + * JIT operation tracking + * The corresponding activity record structure is \ref + * CUpti_ActivityJit. + */ + CUPTI_ACTIVITY_KIND_JIT = 52, + + + CUPTI_ACTIVITY_KIND_COUNT, + + CUPTI_ACTIVITY_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityKind; + +/** + * \brief The kinds of activity objects. + * \see CUpti_ActivityObjectKindId + */ +typedef enum { + /** + * The object kind is not known. + */ + CUPTI_ACTIVITY_OBJECT_UNKNOWN = 0, + /** + * A process. + */ + CUPTI_ACTIVITY_OBJECT_PROCESS = 1, + /** + * A thread. + */ + CUPTI_ACTIVITY_OBJECT_THREAD = 2, + /** + * A device. + */ + CUPTI_ACTIVITY_OBJECT_DEVICE = 3, + /** + * A context. + */ + CUPTI_ACTIVITY_OBJECT_CONTEXT = 4, + /** + * A stream. + */ + CUPTI_ACTIVITY_OBJECT_STREAM = 5, + + CUPTI_ACTIVITY_OBJECT_FORCE_INT = 0x7fffffff +} CUpti_ActivityObjectKind; + +/** + * \brief Identifiers for object kinds as specified by + * CUpti_ActivityObjectKind. + * \see CUpti_ActivityObjectKind + */ +typedef union { + /** + * A process object requires that we identify the process ID. A + * thread object requires that we identify both the process and + * thread ID. + */ + struct { + uint32_t processId; + uint32_t threadId; + } pt; + /** + * A device object requires that we identify the device ID. A + * context object requires that we identify both the device and + * context ID. A stream object requires that we identify device, + * context, and stream ID. + */ + struct { + uint32_t deviceId; + uint32_t contextId; + uint32_t streamId; + } dcs; +} CUpti_ActivityObjectKindId; + +/** + * \brief The kinds of activity overhead. + */ +typedef enum { + /** + * The overhead kind is not known. + */ + CUPTI_ACTIVITY_OVERHEAD_UNKNOWN = 0, + /** + * Compiler(JIT) overhead. + */ + CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER = 1, + /** + * Activity buffer flush overhead. + */ + CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH = 1<<16, + /** + * CUPTI instrumentation overhead. + */ + CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION = 2<<16, + /** + * CUPTI resource creation and destruction overhead. + */ + CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE = 3<<16, + CUPTI_ACTIVITY_OVERHEAD_FORCE_INT = 0x7fffffff +} CUpti_ActivityOverheadKind; + +/** + * \brief The kind of a compute API. + */ +typedef enum { + /** + * The compute API is not known. + */ + CUPTI_ACTIVITY_COMPUTE_API_UNKNOWN = 0, + /** + * The compute APIs are for CUDA. + */ + CUPTI_ACTIVITY_COMPUTE_API_CUDA = 1, + /** + * The compute APIs are for CUDA running + * in MPS (Multi-Process Service) environment. + */ + CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS = 2, + + CUPTI_ACTIVITY_COMPUTE_API_FORCE_INT = 0x7fffffff +} CUpti_ActivityComputeApiKind; + +/** + * \brief Flags associated with activity records. + * + * Activity record flags. Flags can be combined by bitwise OR to + * associated multiple flags with an activity record. Each flag is + * specific to a certain activity kind, as noted below. + */ +typedef enum { + /** + * Indicates the activity record has no flags. + */ + CUPTI_ACTIVITY_FLAG_NONE = 0, + + /** + * Indicates the activity represents a device that supports + * concurrent kernel execution. Valid for + * CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUPTI_ACTIVITY_FLAG_DEVICE_CONCURRENT_KERNELS = 1 << 0, + + /** + * Indicates if the activity represents a CUdevice_attribute value + * or a CUpti_DeviceAttribute value. Valid for + * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE. + */ + CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE = 1 << 0, + + /** + * Indicates the activity represents an asynchronous memcpy + * operation. Valid for CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC = 1 << 0, + + /** + * Indicates the activity represents an instantaneous marker. Valid + * for CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS = 1 << 0, + + /** + * Indicates the activity represents a region start marker. Valid + * for CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_START = 1 << 1, + + /** + * Indicates the activity represents a region end marker. Valid for + * CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_END = 1 << 2, + + /** + * Indicates the activity represents an attempt to acquire a user + * defined synchronization object. + * Valid for CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE = 1 << 3, + + /** + * Indicates the activity represents success in acquiring the + * user defined synchronization object. + * Valid for CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS = 1 << 4, + + /** + * Indicates the activity represents failure in acquiring the + * user defined synchronization object. + * Valid for CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED = 1 << 5, + + /** + * Indicates the activity represents releasing a reservation on + * user defined synchronization object. + * Valid for CUPTI_ACTIVITY_KIND_MARKER. + */ + CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE = 1 << 6, + + /** + * Indicates the activity represents a marker that does not specify + * a color. Valid for CUPTI_ACTIVITY_KIND_MARKER_DATA. + */ + CUPTI_ACTIVITY_FLAG_MARKER_COLOR_NONE = 1 << 0, + + /** + * Indicates the activity represents a marker that specifies a color + * in alpha-red-green-blue format. Valid for + * CUPTI_ACTIVITY_KIND_MARKER_DATA. + */ + CUPTI_ACTIVITY_FLAG_MARKER_COLOR_ARGB = 1 << 1, + + /** + * The number of bytes requested by each thread + * Valid for CUpti_ActivityGlobalAccess3. + */ + CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_SIZE_MASK = 0xFF << 0, + /** + * If bit in this flag is set, the access was load, else it is a + * store access. Valid for CUpti_ActivityGlobalAccess3. + */ + CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_LOAD = 1 << 8, + /** + * If this bit in flag is set, the load access was cached else it is + * uncached. Valid for CUpti_ActivityGlobalAccess3. + */ + CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_CACHED = 1 << 9, + /** + * If this bit in flag is set, the metric value overflowed. Valid + * for CUpti_ActivityMetric and CUpti_ActivityMetricInstance. + */ + CUPTI_ACTIVITY_FLAG_METRIC_OVERFLOWED = 1 << 0, + /** + * If this bit in flag is set, the metric value couldn't be + * calculated. This occurs when a value(s) required to calculate the + * metric is missing. Valid for CUpti_ActivityMetric and + * CUpti_ActivityMetricInstance. + */ + CUPTI_ACTIVITY_FLAG_METRIC_VALUE_INVALID = 1 << 1, + /** + * If this bit in flag is set, the source level metric value couldn't be + * calculated. This occurs when a value(s) required to calculate the + * source level metric cannot be evaluated. + * Valid for CUpti_ActivityInstructionExecution. + */ + CUPTI_ACTIVITY_FLAG_INSTRUCTION_VALUE_INVALID = 1 << 0, + /** + * The mask for the instruction class, \ref CUpti_ActivityInstructionClass + * Valid for CUpti_ActivityInstructionExecution and + * CUpti_ActivityInstructionCorrelation + */ + CUPTI_ACTIVITY_FLAG_INSTRUCTION_CLASS_MASK = 0xFF << 1, + /** + * When calling cuptiActivityFlushAll, this flag + * can be set to force CUPTI to flush all records in the buffer, whether + * finished or not + */ + CUPTI_ACTIVITY_FLAG_FLUSH_FORCED = 1 << 0, + + /** + * The number of bytes requested by each thread + * Valid for CUpti_ActivitySharedAccess. + */ + CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_SIZE_MASK = 0xFF << 0, + /** + * If bit in this flag is set, the access was load, else it is a + * store access. Valid for CUpti_ActivitySharedAccess. + */ + CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_LOAD = 1 << 8, + + /** + * Indicates the activity represents an asynchronous memset + * operation. Valid for CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC = 1 << 0, + + /** + * Indicates the activity represents thrashing in CPU. + * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING in + * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER + */ + CUPTI_ACTIVITY_FLAG_THRASHING_IN_CPU = 1 << 0, + + /** + * Indicates the activity represents page throttling in CPU. + * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING in + * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER + */ + CUPTI_ACTIVITY_FLAG_THROTTLING_IN_CPU = 1 << 0, + + CUPTI_ACTIVITY_FLAG_FORCE_INT = 0x7fffffff +} CUpti_ActivityFlag; + +/** + * \brief The stall reason for PC sampling activity. + */ +typedef enum { + /** + * Invalid reason + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID = 0, + /** + * No stall, instruction is selected for issue + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE = 1, + /** + * Warp is blocked because next instruction is not yet available, + * because of instruction cache miss, or because of branching effects + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH = 2, + /** + * Instruction is waiting on an arithmatic dependency + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY = 3, + /** + * Warp is blocked because it is waiting for a memory access to complete. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY = 4, + /** + * Texture sub-system is fully utilized or has too many outstanding requests. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE = 5, + /** + * Warp is blocked as it is waiting at __syncthreads() or at memory barrier. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC = 6, + /** + * Warp is blocked waiting for __constant__ memory and immediate memory access to complete. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY = 7, + /** + * Compute operation cannot be performed due to the required resources not + * being available. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY = 8, + /** + * Warp is blocked because there are too many pending memory operations. + * In Kepler architecture it often indicates high number of memory replays. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE = 9, + /** + * Warp was ready to issue, but some other warp issued instead. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED = 10, + /** + * Miscellaneous reasons + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER = 11, + /** + * Sleeping. + */ + CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING = 12, + CUPTI_ACTIVITY_PC_SAMPLING_STALL_FORCE_INT = 0x7fffffff +} CUpti_ActivityPCSamplingStallReason; + +/** + * \brief Sampling period for PC sampling method + * + * Sampling period can be set using \ref cuptiActivityConfigurePCSampling + */ +typedef enum { + /** + * The PC sampling period is not set. + */ + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_INVALID = 0, + /** + * Minimum sampling period available on the device. + */ + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN = 1, + /** + * Sampling period in lower range. + */ + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_LOW = 2, + /** + * Medium sampling period. + */ + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MID = 3, + /** + * Sampling period in higher range. + */ + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_HIGH = 4, + /** + * Maximum sampling period available on the device. + */ + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MAX = 5, + CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_FORCE_INT = 0x7fffffff +} CUpti_ActivityPCSamplingPeriod; + +/** + * \brief The kind of a memory copy, indicating the source and + * destination targets of the copy. + * + * Each kind represents the source and destination targets of a memory + * copy. Targets are host, device, and array. + */ +typedef enum { + /** + * The memory copy kind is not known. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN = 0, + /** + * A host to device memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_HTOD = 1, + /** + * A device to host memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_DTOH = 2, + /** + * A host to device array memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_HTOA = 3, + /** + * A device array to host memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_ATOH = 4, + /** + * A device array to device array memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_ATOA = 5, + /** + * A device array to device memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_ATOD = 6, + /** + * A device to device array memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_DTOA = 7, + /** + * A device to device memory copy on the same device. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_DTOD = 8, + /** + * A host to host memory copy. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_HTOH = 9, + /** + * A peer to peer memory copy across different devices. + */ + CUPTI_ACTIVITY_MEMCPY_KIND_PTOP = 10, + + CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityMemcpyKind; + +/** + * \brief The kinds of memory accessed by a memory operation/copy. + * + * Each kind represents the type of the memory + * accessed by a memory operation/copy. + */ +typedef enum { + /** + * The memory kind is unknown. + */ + CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN = 0, + /** + * The memory is pageable. + */ + CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE = 1, + /** + * The memory is pinned. + */ + CUPTI_ACTIVITY_MEMORY_KIND_PINNED = 2, + /** + * The memory is on the device. + */ + CUPTI_ACTIVITY_MEMORY_KIND_DEVICE = 3, + /** + * The memory is an array. + */ + CUPTI_ACTIVITY_MEMORY_KIND_ARRAY = 4, + /** + * The memory is managed + */ + CUPTI_ACTIVITY_MEMORY_KIND_MANAGED = 5, + /** + * The memory is device static + */ + CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC = 6, + /** + * The memory is managed static + */ + CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC = 7, + CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityMemoryKind; + +/** + * \brief The kind of a preemption activity. + */ +typedef enum { + /** + * The preemption kind is not known. + */ + CUPTI_ACTIVITY_PREEMPTION_KIND_UNKNOWN = 0, + /** + * Preemption to save CDP block. + */ + CUPTI_ACTIVITY_PREEMPTION_KIND_SAVE = 1, + /** + * Preemption to restore CDP block. + */ + CUPTI_ACTIVITY_PREEMPTION_KIND_RESTORE = 2, + CUPTI_ACTIVITY_PREEMPTION_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityPreemptionKind; + +/** + * \brief The kind of environment data. Used to indicate what type of + * data is being reported by an environment activity record. + */ +typedef enum { + /** + * Unknown data. + */ + CUPTI_ACTIVITY_ENVIRONMENT_UNKNOWN = 0, + /** + * The environment data is related to speed. + */ + CUPTI_ACTIVITY_ENVIRONMENT_SPEED = 1, + /** + * The environment data is related to temperature. + */ + CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE = 2, + /** + * The environment data is related to power. + */ + CUPTI_ACTIVITY_ENVIRONMENT_POWER = 3, + /** + * The environment data is related to cooling. + */ + CUPTI_ACTIVITY_ENVIRONMENT_COOLING = 4, + + CUPTI_ACTIVITY_ENVIRONMENT_COUNT, + CUPTI_ACTIVITY_ENVIRONMENT_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityEnvironmentKind; + +/** + * \brief Reasons for clock throttling. + * + * The possible reasons that a clock can be throttled. There can be + * more than one reason that a clock is being throttled so these types + * can be combined by bitwise OR. These are used in the + * clocksThrottleReason field in the Environment Activity Record. + */ +typedef enum { + /** + * Nothing is running on the GPU and the clocks are dropping to idle + * state. + */ + CUPTI_CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x00000001, + /** + * The GPU clocks are limited by a user specified limit. + */ + CUPTI_CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS = 0x00000002, + /** + * A software power scaling algorithm is reducing the clocks below + * requested clocks. + */ + CUPTI_CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x00000004, + /** + * Hardware slowdown to reduce the clock by a factor of two or more + * is engaged. This is an indicator of one of the following: 1) + * Temperature is too high, 2) External power brake assertion is + * being triggered (e.g. by the system power supply), 3) Change in + * power state. + */ + CUPTI_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x00000008, + /** + * Some unspecified factor is reducing the clocks. + */ + CUPTI_CLOCKS_THROTTLE_REASON_UNKNOWN = 0x80000000, + /** + * Throttle reason is not supported for this GPU. + */ + CUPTI_CLOCKS_THROTTLE_REASON_UNSUPPORTED = 0x40000000, + /** + * No clock throttling. + */ + CUPTI_CLOCKS_THROTTLE_REASON_NONE = 0x00000000, + + CUPTI_CLOCKS_THROTTLE_REASON_FORCE_INT = 0x7fffffff +} CUpti_EnvironmentClocksThrottleReason; + +/** + * \brief Scope of the unified memory counter (deprecated in CUDA 7.0) + */ +typedef enum { + /** + * The unified memory counter scope is not known. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_UNKNOWN = 0, + /** + * Collect unified memory counter for single process on one device + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE = 1, + /** + * Collect unified memory counter for single process across all devices + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_ALL_DEVICES = 2, + + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_COUNT, + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityUnifiedMemoryCounterScope; + +/** + * \brief Kind of the Unified Memory counter + * + * Many activities are associated with Unified Memory mechanism; among them + * are tranfer from host to device, device to host, page fault at + * host side. + */ +typedef enum { + /** + * The unified memory counter kind is not known. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_UNKNOWN = 0, + /** + * Number of bytes transfered from host to device + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD = 1, + /** + * Number of bytes transfered from device to host + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH = 2, + /** + * Number of CPU page faults, this is only supported on 64 bit + * Linux and Mac platforms + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT = 3, + /** + * Number of GPU page faults, this is only supported on devices with + * compute capability 6.0 and higher and 64 bit Linux platforms + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT = 4, + /** + * Thrashing occurs when data is frequently accessed by + * multiple processors and has to be constantly migrated around + * to achieve data locality. In this case the overhead of migration + * may exceed the benefits of locality. + * This is only supported on 64 bit Linux platforms. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING = 5, + /** + * Throttling is a prevention technique used by the driver to avoid + * further thrashing. Here, the driver doesn't service the fault for + * one of the contending processors for a specific period of time, + * so that the other processor can run at full-speed. + * This is only supported on 64 bit Linux platforms. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING = 6, + /** + * In case throttling does not help, the driver tries to pin the memory + * to a processor for a specific period of time. One of the contending + * processors will have slow access to the memory, while the other will + * have fast access. + * This is only supported on 64 bit Linux platforms. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP = 7, + + /** + * Number of bytes transferred from one device to another device. + * This is only supported on 64 bit Linux platforms. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD = 8, + + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT, + CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityUnifiedMemoryCounterKind; + +/** + * \brief Memory access type for unified memory page faults + * + * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT + * and \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT + */ +typedef enum { + /** + * The unified memory access type is not known + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_UNKNOWN = 0, + /** + * The page fault was triggered by read memory instruction + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_READ = 1, + /** + * The page fault was triggered by write memory instruction + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_WRITE = 2, + /** + * The page fault was triggered by atomic memory instruction + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_ATOMIC = 3, + /** + * The page fault was triggered by memory prefetch operation + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_PREFETCH = 4 +} CUpti_ActivityUnifiedMemoryAccessType; + +/** + * \brief Migration cause of the Unified Memory counter + * + * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and + * \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH + */ +typedef enum { + /** + * The unified memory migration cause is not known + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_UNKNOWN = 0, + /** + * The unified memory migrated due to an explicit call from + * the user e.g. cudaMemPrefetchAsync + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_USER = 1, + /** + * The unified memory migrated to guarantee data coherence + * e.g. CPU/GPU faults on Pascal+ and kernel launch on pre-Pascal GPUs + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_COHERENCE = 2, + /** + * The unified memory was speculatively migrated by the UVM driver + * before being accessed by the destination processor to improve + * performance + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_PREFETCH = 3, + /** + * The unified memory migrated to the CPU because it was evicted to make + * room for another block of memory on the GPU + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_EVICTION = 4, + /** + * The unified memory migrated to another processor because of access counter + * notifications. Only frequently accessed pages are migrated between CPU and GPU, or + * between peer GPUs. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_ACCESS_COUNTERS = 5, +} CUpti_ActivityUnifiedMemoryMigrationCause; + +/** + * \brief Remote memory map cause of the Unified Memory counter + * + * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP + */ +typedef enum { + /** + * The cause of mapping to remote memory was unknown + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_UNKNOWN = 0, + /** + * Mapping to remote memory was added to maintain data coherence. + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_COHERENCE = 1, + /** + * Mapping to remote memory was added to prevent further thrashing + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_THRASHING = 2, + /** + * Mapping to remote memory was added to enforce the hints + * specified by the programmer or by performance heuristics of the + * UVM driver + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_POLICY = 3, + /** + * Mapping to remote memory was added because there is no more + * memory available on the processor and eviction was not + * possible + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_OUT_OF_MEMORY = 4, + /** + * Mapping to remote memory was added after the memory was + * evicted to make room for another block of memory on the GPU + */ + CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_EVICTION = 5, +} CUpti_ActivityUnifiedMemoryRemoteMapCause; + +/** + * \brief SASS instruction classification. + * + * The sass instruction are broadly divided into different class. Each enum represents a classification. + */ +typedef enum { + /** + * The instruction class is not known. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNKNOWN = 0, + /** + * Represents a 32 bit floating point operation. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_32 = 1, + /** + * Represents a 64 bit floating point operation. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_64 = 2, + /** + * Represents an integer operation. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTEGER = 3, + /** + * Represents a bit conversion operation. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_BIT_CONVERSION = 4, + /** + * Represents a control flow instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONTROL_FLOW = 5, + /** + * Represents a global load-store instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL = 6, + /** + * Represents a shared load-store instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED = 7, + /** + * Represents a local load-store instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_LOCAL = 8, + /** + * Represents a generic load-store instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_GENERIC = 9, + /** + * Represents a surface load-store instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE = 10, + /** + * Represents a constant load instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONSTANT = 11, + /** + * Represents a texture load-store instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_TEXTURE = 12, + /** + * Represents a global atomic instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL_ATOMIC = 13, + /** + * Represents a shared atomic instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED_ATOMIC = 14, + /** + * Represents a surface atomic instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE_ATOMIC = 15, + /** + * Represents a inter-thread communication instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTER_THREAD_COMMUNICATION = 16, + /** + * Represents a barrier instruction. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_BARRIER = 17, + /** + * Represents some miscellaneous instructions which do not fit in the above classification. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_MISCELLANEOUS = 18, + /** + * Represents a 16 bit floating point operation. + */ + CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_16 = 19, + + /** + * Represents uniform instruction. + */ + + CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNIFORM = 20, + + CUPTI_ACTIVITY_INSTRUCTION_CLASS_KIND_FORCE_INT = 0x7fffffff +} CUpti_ActivityInstructionClass; + +/** + * \brief Partitioned global caching option + */ +typedef enum { + /** + * Partitioned global cache config unknown. + */ + CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN = 0, + /** + * Partitioned global cache not supported. + */ + CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED = 1, + /** + * Partitioned global cache config off. + */ + CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF = 2, + /** + * Partitioned global cache config on. + */ + CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON = 3, + CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_FORCE_INT = 0x7fffffff +} CUpti_ActivityPartitionedGlobalCacheConfig; + +/** + * \brief Synchronization type. + * + * The types of synchronization to be used with CUpti_ActivitySynchronization. + */ + +typedef enum { + /** + * Unknown data. + */ + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_UNKNOWN = 0, + /** + * Event synchronize API. + */ + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE = 1, + /** + * Stream wait event API. + */ + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT = 2, + /** + * Stream synchronize API. + */ + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE = 3, + /** + * Context synchronize API. + */ + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE = 4, + + CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivitySynchronizationType; + +/** + * \brief stream type. + * + * The types of stream to be used with CUpti_ActivityStream. + */ + +typedef enum { + /** + * Unknown data. + */ + CUPTI_ACTIVITY_STREAM_CREATE_FLAG_UNKNOWN = 0, + /** + * Default stream. + */ + CUPTI_ACTIVITY_STREAM_CREATE_FLAG_DEFAULT = 1, + /** + * Non-blocking stream. + */ + CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NON_BLOCKING = 2, + /** + * Null stream. + */ + CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NULL = 3, + /** + * Stream create Mask + */ + CUPTI_ACTIVITY_STREAM_CREATE_MASK = 0xFFFF, + + CUPTI_ACTIVITY_STREAM_CREATE_FLAG_FORCE_INT = 0x7fffffff +} CUpti_ActivityStreamFlag; + +/** +* \brief Link flags. +* +* Describes link properties, to be used with CUpti_ActivityNvLink. +*/ + +typedef enum { + CUPTI_LINK_FLAG_INVALID = 0, + /** + * Is peer to peer access supported by this link. + */ + CUPTI_LINK_FLAG_PEER_ACCESS = (1 << 1), + /** + * Is system memory access supported by this link. + */ + CUPTI_LINK_FLAG_SYSMEM_ACCESS = (1 << 2), + /** + * Is peer atomic access supported by this link. + */ + CUPTI_LINK_FLAG_PEER_ATOMICS = (1 << 3), + /** + * Is system memory atomic access supported by this link. + */ + CUPTI_LINK_FLAG_SYSMEM_ATOMICS = (1 << 4), + + CUPTI_LINK_FLAG_FORCE_INT = 0x7fffffff +} CUpti_LinkFlag; + +/** +* \brief Memory operation types. +* +* Describes the type of memory operation, to be used with CUpti_ActivityMemory3. +*/ + +typedef enum { + CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_INVALID = 0, + /** + * Memory is allocated. + */ + CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION = 1, + /** + * Memory is released. + */ + CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE = 2, + + CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityMemoryOperationType; + +/** +* \brief Memory pool types. +* +* Describes the type of memory pool, to be used with CUpti_ActivityMemory3. +*/ + +typedef enum { + CUPTI_ACTIVITY_MEMORY_POOL_TYPE_INVALID = 0, + /** + * Memory pool is local to the process. + */ + CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL = 1, + /** + * Memory pool is imported by the process. + */ + CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED = 2, + + CUPTI_ACTIVITY_MEMORY_POOL_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityMemoryPoolType; + +/** +* \brief Memory pool operation types. +* +* Describes the type of memory pool operation, to be used with CUpti_ActivityMemoryPool2. +*/ + +typedef enum { + CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_INVALID = 0, + /** + * Memory pool is created. + */ + CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_CREATED = 1, + /** + * Memory pool is destroyed. + */ + CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_DESTROYED = 2, + /** + * Memory pool is trimmed. + */ + CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED = 3, + + CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityMemoryPoolOperationType; + +typedef enum { + CUPTI_CHANNEL_TYPE_INVALID = 0, + CUPTI_CHANNEL_TYPE_COMPUTE = 1, + CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY = 2 +} CUpti_ChannelType; + +/** + * The source-locator ID that indicates an unknown source + * location. There is not an actual CUpti_ActivitySourceLocator object + * corresponding to this value. + */ +#define CUPTI_SOURCE_LOCATOR_ID_UNKNOWN 0 + +/** + * An invalid function index ID. + */ +#define CUPTI_FUNCTION_INDEX_ID_INVALID 0 + +/** + * An invalid/unknown correlation ID. A correlation ID of this value + * indicates that there is no correlation for the activity record. + */ +#define CUPTI_CORRELATION_ID_UNKNOWN 0 + +/** + * An invalid/unknown grid ID. + */ +#define CUPTI_GRID_ID_UNKNOWN 0LL + +/** + * An invalid/unknown timestamp for a start, end, queued, submitted, + * or completed time. + */ +#define CUPTI_TIMESTAMP_UNKNOWN 0LL + +/** + * An invalid/unknown value. + */ +#define CUPTI_SYNCHRONIZATION_INVALID_VALUE -1 + +/** + * An invalid/unknown process id. + */ +#define CUPTI_AUTO_BOOST_INVALID_CLIENT_PID 0 + +/** + * Invalid/unknown NVLink port number. +*/ +#define CUPTI_NVLINK_INVALID_PORT -1 + +/** + * Maximum NVLink port numbers. +*/ +#define CUPTI_MAX_NVLINK_PORTS 32 + +START_PACKED_ALIGNMENT +/** + * \brief Unified Memory counters configuration structure + * + * This structure controls the enable/disable of the various + * Unified Memory counters consisting of scope, kind and other parameters. + * See function \ref cuptiActivityConfigureUnifiedMemoryCounter + */ +typedef struct PACKED_ALIGNMENT { + /** + * Unified Memory counter Counter scope. (deprecated in CUDA 7.0) + */ + CUpti_ActivityUnifiedMemoryCounterScope scope; + + /** + * Unified Memory counter Counter kind + */ + CUpti_ActivityUnifiedMemoryCounterKind kind; + + /** + * Device id of the traget device. This is relevant only + * for single device scopes. (deprecated in CUDA 7.0) + */ + uint32_t deviceId; + + /** + * Control to enable/disable the counter. To enable the counter + * set it to non-zero value while disable is indicated by zero. + */ + uint32_t enable; +} CUpti_ActivityUnifiedMemoryCounterConfig; + +/** + * \brief Device auto boost state structure + * + * This structure defines auto boost state for a device. + * See function \ref cuptiGetAutoBoostState + */ +typedef struct PACKED_ALIGNMENT { + /** + * Returned auto boost state. 1 is returned in case auto boost is enabled, 0 + * otherwise + */ + uint32_t enabled; + + /** + * Id of process that has set the current boost state. The value will be + * CUPTI_AUTO_BOOST_INVALID_CLIENT_PID if the user does not have the + * permission to query process ids or there is an error in querying the + * process id. + */ + uint32_t pid; + +} CUpti_ActivityAutoBoostState; + +/** + * \brief PC sampling configuration structure + * + * This structure defines the pc sampling configuration. + * + * See function \ref cuptiActivityConfigurePCSampling + */ +typedef struct PACKED_ALIGNMENT { + /** + * Size of configuration structure. + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + uint32_t size; + /** + * There are 5 level provided for sampling period. The level + * internally maps to a period in terms of cycles. Same level can + * map to different number of cycles on different gpus. No of + * cycles will be chosen to minimize information loss. The period + * chosen will be given by samplingPeriodInCycles in + * \ref CUpti_ActivityPCSamplingRecordInfo for each kernel instance. + */ + CUpti_ActivityPCSamplingPeriod samplingPeriod; + + /** + * This will override the period set by samplingPeriod. Value 0 in samplingPeriod2 will be + * considered as samplingPeriod2 should not be used and samplingPeriod should be used. + * Valid values for samplingPeriod2 are between 5 to 31 both inclusive. + * This will set the sampling period to (2^samplingPeriod2) cycles. + */ + uint32_t samplingPeriod2; +} CUpti_ActivityPCSamplingConfig; + +/** + * \brief The base activity record. + * + * The activity API uses a CUpti_Activity as a generic representation + * for any activity. The 'kind' field is used to determine the + * specific activity kind, and from that the CUpti_Activity object can + * be cast to the specific activity record type appropriate for that kind. + * + * Note that all activity record types are padded and aligned to + * ensure that each member of the record is naturally aligned. + * + * \see CUpti_ActivityKind + */ +typedef struct PACKED_ALIGNMENT { + /** + * The kind of this activity. + */ + CUpti_ActivityKind kind; +} CUpti_Activity; + +/** + * \brief The activity record for memory copies. (deprecated) + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityMemcpy; + +/** + * \brief The activity record for memory copies. (deprecated in CUDA 11.1) + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint64_t graphNodeId; +} CUpti_ActivityMemcpy3; + +/** + * \brief The activity record for memory copies. (deprecated in CUDA 11.6) + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint32_t graphId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t padding; +} CUpti_ActivityMemcpy4; + +/** + * \brief The activity record for memory copies. + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint32_t graphId; + + /** + * The ID of the HW channel on which the memory copy is occuring. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; + + /** + * Reserved for internal use. + */ + uint32_t pad2; + +} CUpti_ActivityMemcpy5; + +/** + * \brief The activity record for peer-to-peer memory copies. + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated + * by CUPTI. Peer-to-peer memory copy activities are now reported using the + * CUpti_ActivityMemcpyPtoP2 activity record.. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityMemcpyPtoP; + +typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2; + +/** + * \brief The activity record for peer-to-peer memory copies. + * (deprecated in CUDA 11.1) + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed the memcpy through graph launch. + * This field will be 0 if memcpy is not done using graph launch. + */ + uint64_t graphNodeId; +} CUpti_ActivityMemcpyPtoP2; + +/** + * \brief The activity record for peer-to-peer memory copies. + * (deprecated in CUDA 11.6) + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed the memcpy through graph launch. + * This field will be 0 if memcpy is not done using graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint32_t graphId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t padding; +} CUpti_ActivityMemcpyPtoP3; + +/** + * \brief The activity record for peer-to-peer memory copies. + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed the memcpy through graph launch. + * This field will be 0 if memcpy is not done using graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint32_t graphId; + + /** + * The ID of the HW channel on which the memory copy is occuring. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; +} CUpti_ActivityMemcpyPtoP4; + +/** + * \brief The activity record for memset. (deprecated) + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityMemset; + +/** + * \brief The activity record for memset. (deprecated in CUDA 11.1) + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint64_t graphNodeId; +} CUpti_ActivityMemset2; + +/** + * \brief The activity record for memset. (deprecated in CUDA 11.6) + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ + +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint32_t graphId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t padding; +} CUpti_ActivityMemset3; + +/** + * \brief The activity record for memset. + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint32_t graphId; + + /** + * The ID of the HW channel on which the memory set is occuring. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; + + /** + * Undefined. Reserved for internal use + */ + uint32_t pad2; + +} CUpti_ActivityMemset4; + +/** + * \brief The activity record for memory. + * + * This activity record represents a memory allocation and free operation + * (CUPTI_ACTIVITY_KIND_MEMORY). + * This activity record provides a single record for the memory + * allocation and memory release operations. + * + * Note: It is recommended to move to the new activity record \ref CUpti_ActivityMemory3 + * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY2. + * \ref CUpti_ActivityMemory3 provides separate records for memory + * allocation and memory release operations. This allows to correlate the + * corresponding driver and runtime API activity record with the memory operation. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY + */ + CUpti_ActivityKind kind; + + /** + * The memory kind requested by the user + */ + CUpti_ActivityMemoryKind memoryKind; + + /** + * The virtual address of the allocation + */ + uint64_t address; + + /** + * The number of bytes of memory allocated. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory operation, i.e. + * the time when memory was allocated, in ns. + */ + uint64_t start; + + /** + * The end timestamp for the memory operation, i.e. + * the time when memory was freed, in ns. + * This will be 0 if memory is not freed in the application + */ + uint64_t end; + + /** + * The program counter of the allocation of memory + */ + uint64_t allocPC; + + /** + * The program counter of the freeing of memory. This will + * be 0 if memory is not freed in the application + */ + uint64_t freePC; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory allocation is taking place. + */ + uint32_t deviceId; + + /** + * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID. + */ + uint32_t contextId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Variable name. This name is shared across all activity + * records representing the same symbol, and so should not be + * modified. + */ + const char* name; +} CUpti_ActivityMemory; + +/** + * \brief The activity record for memory. + * + * This activity record represents a memory allocation and free operation + * (CUPTI_ACTIVITY_KIND_MEMORY2). + * This activity record provides separate records for memory allocation and + * memory release operations. + * This allows to correlate the corresponding driver and runtime API + * activity record with the memory operation. + * + * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory + * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY. + * \ref CUpti_ActivityMemory provides a single record for the memory + * allocation and memory release operations. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2 + */ + CUpti_ActivityKind kind; + + /** + * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType. + */ + CUpti_ActivityMemoryOperationType memoryOperationType; + + /** + * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind. + */ + CUpti_ActivityMemoryKind memoryKind; + + /** + * The correlation ID of the memory operation. Each memory operation is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory operation. + */ + uint32_t correlationId; + + /** + * The virtual address of the allocation. + */ + uint64_t address; + + /** + * The number of bytes of memory allocated. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory operation, in ns. + */ + uint64_t timestamp; + + /** + * The program counter of the memory operation. + */ + uint64_t PC; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory operation is taking place. + */ + uint32_t deviceId; + + /** + * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID. + */ + uint32_t contextId; + + /** + * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID. + */ + uint32_t streamId; + + /** + * Variable name. This name is shared across all activity + * records representing the same symbol, and so should not be + * modified. + */ + const char* name; + + /** + * \p isAsync is set if memory operation happens through async memory APIs. + */ + uint32_t isAsync; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad1; +#endif + + /** + * The memory pool configuration used for the memory operations. + */ + struct { + /** + * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType + */ + CUpti_ActivityMemoryPoolType memoryPoolType; +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad2; +#endif + /** + * The base address of the memory pool. + */ + uint64_t address; + /** + * The release threshold of the memory pool in bytes. \p releaseThreshold is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t releaseThreshold; + + union { + /** + * The size of the memory pool in bytes. + * \p size is valid if \p memoryPoolType is + * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t size; + /** + * The processId of the memory pool. + * \p processId is valid if \p memoryPoolType is + * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t processId; + } pool; + } memoryPoolConfig; + +} CUpti_ActivityMemory2; + +/** + * \brief The activity record for memory. + * + * This activity record represents a memory allocation and free operation + * (CUPTI_ACTIVITY_KIND_MEMORY2). + * This activity record provides separate records for memory allocation and + * memory release operations. + * This allows to correlate the corresponding driver and runtime API + * activity record with the memory operation. + * + * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory + * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY. + * \ref CUpti_ActivityMemory provides a single record for the memory + * allocation and memory release operations. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2 + */ + CUpti_ActivityKind kind; + + /** + * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType. + */ + CUpti_ActivityMemoryOperationType memoryOperationType; + + /** + * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind. + */ + CUpti_ActivityMemoryKind memoryKind; + + /** + * The correlation ID of the memory operation. Each memory operation is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory operation. + */ + uint32_t correlationId; + + /** + * The virtual address of the allocation. + */ + uint64_t address; + + /** + * The number of bytes of memory allocated. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory operation, in ns. + */ + uint64_t timestamp; + + /** + * The program counter of the memory operation. + */ + uint64_t PC; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory operation is taking place. + */ + uint32_t deviceId; + + /** + * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID. + */ + uint32_t contextId; + + /** + * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID. + */ + uint32_t streamId; + + /** + * Variable name. This name is shared across all activity + * records representing the same symbol, and so should not be + * modified. + */ + const char* name; + + /** + * \p isAsync is set if memory operation happens through async memory APIs. + */ + uint32_t isAsync; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad1; +#endif + + /** + * The memory pool configuration used for the memory operations. + */ + struct PACKED_ALIGNMENT { + /** + * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType + */ + CUpti_ActivityMemoryPoolType memoryPoolType; +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad2; +#endif + /** + * The base address of the memory pool. + */ + uint64_t address; + /** + * The release threshold of the memory pool in bytes. \p releaseThreshold is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t releaseThreshold; + + union { + /** + * The size of the memory pool in bytes. + * \p size is valid if \p memoryPoolType is + * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t size; + /** + * The processId of the memory pool. + * \p processId is valid if \p memoryPoolType is + * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t processId; + } pool; + + /** + * The utilized size of the memory pool. \p utilizedSize is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t utilizedSize; + } memoryPoolConfig; + +} CUpti_ActivityMemory3; + +/** + * \brief The activity record for memory pool. + * + * This activity record represents a memory pool creation, destruction and + * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL). + * This activity record provides separate records for memory pool creation, + * destruction and triming operations. + * This allows to correlate the corresponding driver and runtime API + * activity record with the memory pool operation. + * + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL + */ + CUpti_ActivityKind kind; + + /** + * The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType. + */ + CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType; + + /** + * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType + */ + CUpti_ActivityMemoryPoolType memoryPoolType; + + /** + * The correlation ID of the memory pool operation. Each memory pool + * operation is assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory operation. + */ + uint32_t correlationId; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory pool is created. + */ + uint32_t deviceId; + + /** + * The minimum bytes to keep of the memory pool. \p minBytesToKeep is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED, + * \ref CUpti_ActivityMemoryPoolOperationType + */ + size_t minBytesToKeep; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The virtual address of the allocation. + */ + uint64_t address; + + /** + * The size of the memory pool operation in bytes. \p size is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t size; + + /** + * The release threshold of the memory pool. \p releaseThreshold is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t releaseThreshold; + + /** + * The start timestamp for the memory operation, in ns. + */ + uint64_t timestamp; +} CUpti_ActivityMemoryPool; + +/** + * \brief The activity record for memory pool. + * + * This activity record represents a memory pool creation, destruction and + * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL). + * This activity record provides separate records for memory pool creation, + * destruction and triming operations. + * This allows to correlate the corresponding driver and runtime API + * activity record with the memory pool operation. + * + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL + */ + CUpti_ActivityKind kind; + + /** + * The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType. + */ + CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType; + + /** + * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType + */ + CUpti_ActivityMemoryPoolType memoryPoolType; + + /** + * The correlation ID of the memory pool operation. Each memory pool + * operation is assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory operation. + */ + uint32_t correlationId; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory pool is created. + */ + uint32_t deviceId; + + /** + * The minimum bytes to keep of the memory pool. \p minBytesToKeep is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED, + * \ref CUpti_ActivityMemoryPoolOperationType + */ + size_t minBytesToKeep; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The virtual address of the allocation. + */ + uint64_t address; + + /** + * The size of the memory pool operation in bytes. \p size is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t size; + + /** + * The release threshold of the memory pool. \p releaseThreshold is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t releaseThreshold; + + /** + * The start timestamp for the memory operation, in ns. + */ + uint64_t timestamp; + + /** + * The utilized size of the memory pool. \p utilizedSize is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t utilizedSize; +} CUpti_ActivityMemoryPool2; + +/** + * \brief The activity record for kernel. (deprecated) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel8 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL + * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t cacheConfigRequested; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t cacheConfigExecuted; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the kernel. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the kernel. Each kernel execution + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the kernel. + */ + uint32_t runtimeCorrelationId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityKernel; + +/** + * \brief The activity record for kernel. (deprecated) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel8 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityKernel2; + +/** + * \brief The activity record for a kernel (CUDA 6.5(with sm_52 support) onwards). + * (deprecated in CUDA 9.0) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL). + * Kernel activities are now reported using the CUpti_ActivityKernel8 activity + * record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityKernel3; + +/** + * \brief The type of the CUDA kernel launch. + */ +typedef enum { + /** + * The kernel was launched via a regular kernel call + */ + CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR = 0, + /** + * The kernel was launched via API \ref cudaLaunchCooperativeKernel() or + * \ref cuLaunchCooperativeKernel() + */ + CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE = 1, + /** + * The kernel was launched via API \ref cudaLaunchCooperativeKernelMultiDevice() or + * \ref cuLaunchCooperativeKernelMultiDevice() + */ + CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE = 2 +} CUpti_ActivityLaunchType; + +/** + * \brief The activity record for a kernel (CUDA 9.0(with sm_70 support) onwards). + * (deprecated in CUDA 11.0) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL). + * Kernel activities are now reported using the CUpti_ActivityKernel8 activity + * record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchrnous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; +} CUpti_ActivityKernel4; + +/** + * \brief The shared memory limit per block config for a kernel + * This should be used to set 'cudaOccFuncShmemConfig' field in occupancy calculator API + */ +typedef enum { + /* The shared memory limit config is default */ + CUPTI_FUNC_SHMEM_LIMIT_DEFAULT = 0x00, + /* User has opted for a higher dynamic shared memory limit using function attribute + 'cudaFuncAttributeMaxDynamicSharedMemorySize' for runtime API or + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES for driver API */ + CUPTI_FUNC_SHMEM_LIMIT_OPTIN = 0x01, + CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT = 0x7fffffff +} CUpti_FuncShmemLimitConfig; + +/** + * \brief The activity record for a kernel (CUDA 11.0(with sm_80 support) onwards). + * (deprecated in CUDA 11.2) + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel8 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchrnous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; +} CUpti_ActivityKernel5; + +/** + * \brief The activity record for kernel. (deprecated in CUDA 11.6) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel8 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchrnous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; + + /** + * The pointer to the access policy window. The structure CUaccessPolicyWindow is + * defined in cuda.h. + */ + CUaccessPolicyWindow *pAccessPolicyWindow; +} CUpti_ActivityKernel6; + +/** + * \brief The activity record for kernel. (deprecated in CUDA 11.8) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel8 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchrnous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; + + /** + * The pointer to the access policy window. The structure CUaccessPolicyWindow is + * defined in cuda.h. + */ + CUaccessPolicyWindow *pAccessPolicyWindow; + + /** + * The ID of the HW channel on which the kernel is launched. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; + +} CUpti_ActivityKernel7; + +/** + * \brief The activity record for kernel. + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes (deprecated in CUDA 11.8). + * Refer field localMemoryTotal_v2 + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchrnous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; + + /** + * The pointer to the access policy window. The structure CUaccessPolicyWindow is + * defined in cuda.h. + */ + CUaccessPolicyWindow *pAccessPolicyWindow; + + /** + * The ID of the HW channel on which the kernel is launched. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; + + + /** + * The X-dimension cluster size for the kernel. + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterX; + + /** + * The Y-dimension cluster size for the kernel. + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterY; + + /** + * The Z-dimension cluster size for the kernel. + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterZ; + + /** + * The cluster scheduling policy for the kernel. Refer CUclusterSchedulingPolicy + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterSchedulingPolicy; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint64_t localMemoryTotal_v2; +} CUpti_ActivityKernel8; + +/** + * \brief The activity record for CDP (CUDA Dynamic Parallelism) + * kernel. + * + * This activity record represents a CDP kernel execution. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_CDP_KERNEL + */ + CUpti_ActivityKind kind; + + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel execution + * is assigned a unique grid ID. + */ + int64_t gridId; + + /** + * The grid ID of the parent kernel. + */ + int64_t parentGridId; + + /** + * The timestamp when kernel is queued up, in ns. A value of + * CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time is + * unknown. + */ + uint64_t queued; + + /** + * The timestamp when kernel is submitted to the gpu, in ns. A value + * of CUPTI_TIMESTAMP_UNKNOWN indicates that the submission time is + * unknown. + */ + uint64_t submitted; + + /** + * The timestamp when kernel is marked as completed, in ns. A value + * of CUPTI_TIMESTAMP_UNKNOWN indicates that the completion time is + * unknown. + */ + uint64_t completed; + + /** + * The X-dimension of the parent block. + */ + uint32_t parentBlockX; + + /** + * The Y-dimension of the parent block. + */ + uint32_t parentBlockY; + + /** + * The Z-dimension of the parent block. + */ + uint32_t parentBlockZ; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; +} CUpti_ActivityCdpKernel; + +/** + * \brief The activity record for a preemption of a CDP kernel. + * + * This activity record represents a preemption of a CDP kernel. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PREEMPTION + */ + CUpti_ActivityKind kind; + + /** + * kind of the preemption + */ + CUpti_ActivityPreemptionKind preemptionKind; + + /** + * The timestamp of the preemption, in ns. A value of 0 indicates + * that timestamp information could not be collected for the + * preemption. + */ + uint64_t timestamp; + + /** + * The grid-id of the block that is preempted + */ + int64_t gridId; + + /** + * The X-dimension of the block that is preempted + */ + uint32_t blockX; + + /** + * The Y-dimension of the block that is preempted + */ + uint32_t blockY; + + /** + * The Z-dimension of the block that is preempted + */ + uint32_t blockZ; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityPreemption; + +/** + * \brief The activity record for a driver or runtime API invocation. + * + * This activity record represents an invocation of a driver or + * runtime API (CUPTI_ACTIVITY_KIND_DRIVER and + * CUPTI_ACTIVITY_KIND_RUNTIME). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DRIVER, + * CUPTI_ACTIVITY_KIND_RUNTIME, or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API. + */ + CUpti_ActivityKind kind; + + /** + * The ID of the driver or runtime function. + */ + CUpti_CallbackId cbid; + + /** + * The start timestamp for the function, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the function. + */ + uint64_t start; + + /** + * The end timestamp for the function, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the function. + */ + uint64_t end; + + /** + * The ID of the process where the driver or runtime CUDA function + * is executing. + */ + uint32_t processId; + + /** + * The ID of the thread where the driver or runtime CUDA function is + * executing. + */ + uint32_t threadId; + + /** + * The correlation ID of the driver or runtime CUDA function. Each + * function invocation is assigned a unique correlation ID that is + * identical to the correlation ID in the memcpy, memset, or kernel + * activity record that is associated with this function. + */ + uint32_t correlationId; + + /** + * The return value for the function. For a CUDA driver function + * with will be a CUresult value, and for a CUDA runtime function + * this will be a cudaError_t value. + */ + uint32_t returnValue; +} CUpti_ActivityAPI; + +/** + * \brief The activity record for a CUPTI event. + * + * This activity record represents a CUPTI event value + * (CUPTI_ACTIVITY_KIND_EVENT). This activity record kind is not + * produced by the activity API but is included for completeness and + * ease-of-use. Profile frameworks built on top of CUPTI that collect + * event data may choose to use this type to store the collected event + * data. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_EVENT. + */ + CUpti_ActivityKind kind; + + /** + * The event ID. + */ + CUpti_EventID id; + + /** + * The event value. + */ + uint64_t value; + + /** + * The event domain ID. + */ + CUpti_EventDomainID domain; + + /** + * The correlation ID of the event. Use of this ID is user-defined, + * but typically this ID value will equal the correlation ID of the + * kernel for which the event was gathered. + */ + uint32_t correlationId; +} CUpti_ActivityEvent; + +/** + * \brief The activity record for a CUPTI event with instance + * information. + * + * This activity record represents the a CUPTI event value for a + * specific event domain instance + * (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE). This activity record kind is + * not produced by the activity API but is included for completeness + * and ease-of-use. Profile frameworks built on top of CUPTI that + * collect event data may choose to use this type to store the + * collected event data. This activity record should be used when + * event domain instance information needs to be associated with the + * event. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be + * CUPTI_ACTIVITY_KIND_EVENT_INSTANCE. + */ + CUpti_ActivityKind kind; + + /** + * The event ID. + */ + CUpti_EventID id; + + /** + * The event domain ID. + */ + CUpti_EventDomainID domain; + + /** + * The event domain instance. + */ + uint32_t instance; + + /** + * The event value. + */ + uint64_t value; + + /** + * The correlation ID of the event. Use of this ID is user-defined, + * but typically this ID value will equal the correlation ID of the + * kernel for which the event was gathered. + */ + uint32_t correlationId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityEventInstance; + +/** + * \brief The activity record for a CUPTI metric. + * + * This activity record represents the collection of a CUPTI metric + * value (CUPTI_ACTIVITY_KIND_METRIC). This activity record kind is not + * produced by the activity API but is included for completeness and + * ease-of-use. Profile frameworks built on top of CUPTI that collect + * metric data may choose to use this type to store the collected metric + * data. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_METRIC. + */ + CUpti_ActivityKind kind; + + /** + * The metric ID. + */ + CUpti_MetricID id; + + /** + * The metric value. + */ + CUpti_MetricValue value; + + /** + * The correlation ID of the metric. Use of this ID is user-defined, + * but typically this ID value will equal the correlation ID of the + * kernel for which the metric was gathered. + */ + uint32_t correlationId; + + /** + * The properties of this metric. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t pad[3]; +} CUpti_ActivityMetric; + +/** + * \brief The activity record for a CUPTI metric with instance + * information. + * + * This activity record represents a CUPTI metric value + * for a specific metric domain instance + * (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE). This activity record kind + * is not produced by the activity API but is included for + * completeness and ease-of-use. Profile frameworks built on top of + * CUPTI that collect metric data may choose to use this type to store + * the collected metric data. This activity record should be used when + * metric domain instance information needs to be associated with the + * metric. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be + * CUPTI_ACTIVITY_KIND_METRIC_INSTANCE. + */ + CUpti_ActivityKind kind; + + /** + * The metric ID. + */ + CUpti_MetricID id; + + /** + * The metric value. + */ + CUpti_MetricValue value; + + /** + * The metric domain instance. + */ + uint32_t instance; + + /** + * The correlation ID of the metric. Use of this ID is user-defined, + * but typically this ID value will equal the correlation ID of the + * kernel for which the metric was gathered. + */ + uint32_t correlationId; + + /** + * The properties of this metric. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t pad[7]; +} CUpti_ActivityMetricInstance; + +/** + * \brief The activity record for source locator. + * + * This activity record represents a source locator + * (CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR. + */ + CUpti_ActivityKind kind; + + /** + * The ID for the source path, will be used in all the source level + * results. + */ + uint32_t id; + + /** + * The line number in the source . + */ + uint32_t lineNumber; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The path for the file. + */ + const char *fileName; +} CUpti_ActivitySourceLocator; + +/** + * \brief The activity record for source-level global + * access. (deprecated) + * + * This activity records the locations of the global + * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS). + * Global access activities are now reported using the + * CUpti_ActivityGlobalAccess3 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this global access. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * The pc offset for the access. + */ + uint32_t pcOffset; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * when at least one of thread among warp is active with predicate and condition code + * evaluating to true. + */ + uint32_t executed; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction with predicate and condition code evaluating to true. + */ + uint64_t threadsExecuted; + + /** + * The total number of 32 bytes transactions to L2 cache generated by this access + */ + uint64_t l2_transactions; +} CUpti_ActivityGlobalAccess; + +/** + * \brief The activity record for source-level global + * access. (deprecated in CUDA 9.0) + * + * This activity records the locations of the global + * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS). + * Global access activities are now reported using the + * CUpti_ActivityGlobalAccess3 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this global access. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the access. + */ + uint32_t pcOffset; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction with predicate and condition code evaluating to true. + */ + uint64_t threadsExecuted; + + /** + * The total number of 32 bytes transactions to L2 cache generated by this access + */ + uint64_t l2_transactions; + + /** + * The minimum number of L2 transactions possible based on the access pattern. + */ + uint64_t theoreticalL2Transactions; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * when at least one of thread among warp is active with predicate and condition code + * evaluating to true. + */ + uint32_t executed; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityGlobalAccess2; + +/** + * \brief The activity record for source-level global + * access. + * + * This activity records the locations of the global + * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this global access. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * when at least one of thread among warp is active with predicate and condition code + * evaluating to true. + */ + uint32_t executed; + + /** + * The pc offset for the access. + */ + uint64_t pcOffset; + + /** + * This increments each time when this instruction is executed by number of + * threads that executed this instruction with predicate and condition code + * evaluating to true. + */ + uint64_t threadsExecuted; + + /** + * The total number of 32 bytes transactions to L2 cache generated by this + access + */ + uint64_t l2_transactions; + + /** + * The minimum number of L2 transactions possible based on the access pattern. + */ + uint64_t theoreticalL2Transactions; +} CUpti_ActivityGlobalAccess3; + +/** + * \brief The activity record for source level result + * branch. (deprecated) + * + * This activity record the locations of the branches in the + * source (CUPTI_ACTIVITY_KIND_BRANCH). + * Branch activities are now reported using the + * CUpti_ActivityBranch2 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH. + */ + CUpti_ActivityKind kind; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * The pc offset for the branch. + */ + uint32_t pcOffset; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * regardless of predicate or condition code. + */ + uint32_t executed; + + /** + * Number of times this branch diverged + */ + uint32_t diverged; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction + */ + uint64_t threadsExecuted; +} CUpti_ActivityBranch; + +/** + * \brief The activity record for source level result + * branch. + * + * This activity record the locations of the branches in the + * source (CUPTI_ACTIVITY_KIND_BRANCH). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH. + */ + CUpti_ActivityKind kind; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the branch. + */ + uint32_t pcOffset; + + /** + * Number of times this branch diverged + */ + uint32_t diverged; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction + */ + uint64_t threadsExecuted; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * regardless of predicate or condition code. + */ + uint32_t executed; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityBranch2; + + +/** + * \brief The activity record for a device. (deprecated) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice4 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; +} CUpti_ActivityDevice; + +/** + * \brief The activity record for a device. (deprecated) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice4 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum amount of shared memory available per multiprocessor, in bytes. + */ + uint32_t maxSharedMemoryPerMultiprocessor; + + /** + * Maximum number of 32-bit registers available per multiprocessor. + */ + uint32_t maxRegistersPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + + /** + * ECC enabled flag for device + */ + uint32_t eccEnabled; + + /** + * The device UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid uuid; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; +} CUpti_ActivityDevice2; + +/** + * \brief The activity record for a device. (CUDA 7.0 onwards) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice4 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum amount of shared memory available per multiprocessor, in bytes. + */ + uint32_t maxSharedMemoryPerMultiprocessor; + + /** + * Maximum number of 32-bit registers available per multiprocessor. + */ + uint32_t maxRegistersPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + + /** + * ECC enabled flag for device + */ + uint32_t eccEnabled; + + /** + * The device UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid uuid; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; + + /** + * Flag to indicate whether the device is visible to CUDA. Users can + * set the device visibility using CUDA_VISIBLE_DEVICES environment + */ + uint8_t isCudaVisible; + + uint8_t reserved[7]; +} CUpti_ActivityDevice3; + + +/** + * \brief The activity record for a device. (CUDA 11.6 onwards) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum amount of shared memory available per multiprocessor, in bytes. + */ + uint32_t maxSharedMemoryPerMultiprocessor; + + /** + * Maximum number of 32-bit registers available per multiprocessor. + */ + uint32_t maxRegistersPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + + /** + * ECC enabled flag for device + */ + uint32_t eccEnabled; + + /** + * The device UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid uuid; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; + + /** + * Flag to indicate whether the device is visible to CUDA. Users can + * set the device visibility using CUDA_VISIBLE_DEVICES environment + */ + uint8_t isCudaVisible; + + /** + * MIG enabled flag for device + */ + uint8_t isMigEnabled; + + uint8_t reserved[6]; + + /** + * GPU Instance id for MIG enabled devices. + * If mig mode is disabled value is set to UINT32_MAX + */ + uint32_t gpuInstanceId; + + /** + * Compute Instance id for MIG enabled devices. + * If mig mode is disabled value is set to UINT32_MAX + */ + uint32_t computeInstanceId; + + /** + * The MIG UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid migUuid; + +} CUpti_ActivityDevice4; + + +/** + * \brief The activity record for a device attribute. + * + * This activity record represents information about a GPU device: + * either a CUpti_DeviceAttribute or CUdevice_attribute value + * (CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be + * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The ID of the device that this attribute applies to. + */ + uint32_t deviceId; + + /** + * The attribute, either a CUpti_DeviceAttribute or + * CUdevice_attribute. Flag + * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is used to indicate + * what kind of attribute this is. If + * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is 1 then + * CUdevice_attribute field is value, otherwise + * CUpti_DeviceAttribute field is valid. + */ + union { + CUdevice_attribute cu; + CUpti_DeviceAttribute cupti; + } attribute; + + /** + * The value for the attribute. See CUpti_DeviceAttribute and + * CUdevice_attribute for the type of the value for a given + * attribute. + */ + union { + double vDouble; + uint32_t vUint32; + uint64_t vUint64; + int32_t vInt32; + int64_t vInt64; + } value; +} CUpti_ActivityDeviceAttribute; + +/** + * \brief The activity record for a context. + * + * This activity record represents information about a context + * (CUPTI_ACTIVITY_KIND_CONTEXT). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT. + */ + CUpti_ActivityKind kind; + + /** + * The context ID. + */ + uint32_t contextId; + + /** + * The device ID. + */ + uint32_t deviceId; + + /** + * The compute API kind. \see CUpti_ActivityComputeApiKind + */ + uint16_t computeApiKind; + + /** + * The ID for the NULL stream in this context + */ + uint16_t nullStreamId; + +} CUpti_ActivityContext; + +/** + * \brief The activity record providing a name. + * + * This activity record provides a name for a device, context, thread, + * etc. and other resource naming done via NVTX APIs + * (CUPTI_ACTIVITY_KIND_NAME). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NAME. + */ + CUpti_ActivityKind kind; + + /** + * The kind of activity object being named. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object. 'objectKind' indicates + * which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The name. + */ + const char *name; + +} CUpti_ActivityName; + +/** + * \brief The activity record providing a marker which is an + * instantaneous point in time. (deprecated in CUDA 8.0) + * + * The marker is specified with a descriptive name and unique id + * (CUPTI_ACTIVITY_KIND_MARKER). + * Marker activity is now reported using the + * CUpti_ActivityMarker2 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the marker. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The timestamp for the marker, in ns. A value of 0 indicates that + * timestamp information could not be collected for the marker. + */ + uint64_t timestamp; + + /** + * The marker ID. + */ + uint32_t id; + + /** + * The kind of activity object associated with this marker. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object associated with this + * marker. 'objectKind' indicates which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The marker name for an instantaneous or start marker. This will + * be NULL for an end marker. + */ + const char *name; + +} CUpti_ActivityMarker; + +/** + * \brief The activity record providing a marker which is an + * instantaneous point in time. + * + * The marker is specified with a descriptive name and unique id + * (CUPTI_ACTIVITY_KIND_MARKER). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the marker. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The timestamp for the marker, in ns. A value of 0 indicates that + * timestamp information could not be collected for the marker. + */ + uint64_t timestamp; + + /** + * The marker ID. + */ + uint32_t id; + + /** + * The kind of activity object associated with this marker. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object associated with this + * marker. 'objectKind' indicates which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; + + + /** + * The marker name for an instantaneous or start marker. This will + * be NULL for an end marker. + */ + const char *name; + + /** + * The name of the domain to which this marker belongs to. + * This will be NULL for default domain. + */ + const char *domain; + +} CUpti_ActivityMarker2; + +/** + * \brief The activity record providing detailed information for a marker. + * + * The marker data contains color, payload, and category. + * (CUPTI_ACTIVITY_KIND_MARKER_DATA). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be + * CUPTI_ACTIVITY_KIND_MARKER_DATA. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the marker. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The marker ID. + */ + uint32_t id; + + /** + * Defines the payload format for the value associated with the marker. + */ + CUpti_MetricValueKind payloadKind; + + /** + * The payload value. + */ + CUpti_MetricValue payload; + + /** + * The color for the marker. + */ + uint32_t color; + + /** + * The category for the marker. + */ + uint32_t category; + +} CUpti_ActivityMarkerData; + +/** + * \brief The activity record for CUPTI and driver overheads. + * + * This activity record provides CUPTI and driver overhead information + * (CUPTI_ACTIVITY_OVERHEAD). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_OVERHEAD. + */ + CUpti_ActivityKind kind; + + /** + * The kind of overhead, CUPTI, DRIVER, COMPILER etc. + */ + CUpti_ActivityOverheadKind overheadKind; + + /** + * The kind of activity object that the overhead is associated with. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object. 'objectKind' indicates + * which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + + /** + * The start timestamp for the overhead, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the overhead. + */ + uint64_t start; + + /** + * The end timestamp for the overhead, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the overhead. + */ + uint64_t end; +} CUpti_ActivityOverhead; + +/** + * \brief The activity record for CUPTI environmental data. + * + * This activity record provides CUPTI environmental data, include + * power, clocks, and thermals. This information is sampled at + * various rates and returned in this activity record. The consumer + * of the record needs to check the environmentKind field to figure + * out what kind of environmental record this is. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_ENVIRONMENT. + */ + CUpti_ActivityKind kind; + + /** + * The ID of the device + */ + uint32_t deviceId; + + /** + * The timestamp when this sample was retrieved, in ns. A value of 0 + * indicates that timestamp information could not be collected for + * the marker. + */ + uint64_t timestamp; + + /** + * The kind of data reported in this record. + */ + CUpti_ActivityEnvironmentKind environmentKind; + + union { + /** + * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_SPEED environment + * kind. + */ + struct { + /** + * The SM frequency in MHz + */ + uint32_t smClock; + + /** + * The memory frequency in MHz + */ + uint32_t memoryClock; + + /** + * The PCIe link generation. + */ + uint32_t pcieLinkGen; + + /** + * The PCIe link width. + */ + uint32_t pcieLinkWidth; + + /** + * The clocks throttle reasons. + */ + CUpti_EnvironmentClocksThrottleReason clocksThrottleReasons; + } speed; + /** + * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE + * environment kind. + */ + struct { + /** + * The GPU temperature in degrees C. + */ + uint32_t gpuTemperature; + } temperature; + /** + * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_POWER environment + * kind. + */ + struct { + /** + * The power in milliwatts consumed by GPU and associated + * circuitry. + */ + uint32_t power; + + /** + * The power in milliwatts that will trigger power management + * algorithm. + */ + uint32_t powerLimit; + } power; + /** + * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_COOLING + * environment kind. + */ + struct { + /** + * The fan speed as percentage of maximum. + */ + uint32_t fanSpeed; + } cooling; + } data; +} CUpti_ActivityEnvironment; + +/** + * \brief The activity record for source-level instruction execution. + * + * This activity records result for source level instruction execution. + * (CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction execution. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the instruction. + */ + uint32_t pcOffset; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction, regardless of predicate or condition code. + */ + uint64_t threadsExecuted; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction with predicate and condition code evaluating to true. + */ + uint64_t notPredOffThreadsExecuted; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * regardless of predicate or condition code. + */ + uint32_t executed; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityInstructionExecution; + +/** + * \brief The activity record for PC sampling. (deprecated in CUDA 8.0) + * + * This activity records information obtained by sampling PC + * (CUPTI_ACTIVITY_KIND_PC_SAMPLING). + * PC sampling activities are now reported using the + * CUpti_ActivityPCSampling2 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the instruction. + */ + uint32_t pcOffset; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * The same PC can be sampled with different stall reasons. + */ + uint32_t samples; + + /** + * Current stall reason. Includes one of the reasons from + * \ref CUpti_ActivityPCSamplingStallReason + */ + CUpti_ActivityPCSamplingStallReason stallReason; +} CUpti_ActivityPCSampling; + +/** + * \brief The activity record for PC sampling. (deprecated in CUDA 9.0) + * + * This activity records information obtained by sampling PC + * (CUPTI_ACTIVITY_KIND_PC_SAMPLING). + * PC sampling activities are now reported using the + * CUpti_ActivityPCSampling3 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the instruction. + */ + uint32_t pcOffset; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * These samples indicate that no instruction was issued in that cycle from + * the warp scheduler from where the warp was sampled. + * Field is valid for devices with compute capability 6.0 and higher + */ + uint32_t latencySamples; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * The same PC can be sampled with different stall reasons. The count includes + * latencySamples. + */ + uint32_t samples; + + /** + * Current stall reason. Includes one of the reasons from + * \ref CUpti_ActivityPCSamplingStallReason + */ + CUpti_ActivityPCSamplingStallReason stallReason; + + uint32_t pad; +} CUpti_ActivityPCSampling2; + +/** + * \brief The activity record for PC sampling. + * + * This activity records information obtained by sampling PC + * (CUPTI_ACTIVITY_KIND_PC_SAMPLING). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * These samples indicate that no instruction was issued in that cycle from + * the warp scheduler from where the warp was sampled. + * Field is valid for devices with compute capability 6.0 and higher + */ + uint32_t latencySamples; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * The same PC can be sampled with different stall reasons. The count includes + * latencySamples. + */ + uint32_t samples; + + /** + * Current stall reason. Includes one of the reasons from + * \ref CUpti_ActivityPCSamplingStallReason + */ + CUpti_ActivityPCSamplingStallReason stallReason; + + /** + * The pc offset for the instruction. + */ + uint64_t pcOffset; +} CUpti_ActivityPCSampling3; + +/** + * \brief The activity record for record status for PC sampling. + * + * This activity records information obtained by sampling PC + * (CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO. + */ + CUpti_ActivityKind kind; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Number of times the PC was sampled for this kernel instance including all + * dropped samples. + */ + uint64_t totalSamples; + + /** + * Number of samples that were dropped by hardware due to backpressure/overflow. + */ + uint64_t droppedSamples; + /** + * Sampling period in terms of number of cycles . + */ + uint64_t samplingPeriodInCycles; +} CUpti_ActivityPCSamplingRecordInfo; + +/** + * \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0) + * + * This activity record represents a Unified Memory counter + * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER + */ + CUpti_ActivityKind kind; + + /** + * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind + */ + CUpti_ActivityUnifiedMemoryCounterKind counterKind; + + /** + * Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope + */ + CUpti_ActivityUnifiedMemoryCounterScope scope; + + /** + * The ID of the device involved in the memory transfer operation. + * It is not relevant if the scope of the counter is global (all devices). + */ + uint32_t deviceId; + + /** + * Value of the counter + * + */ + uint64_t value; + + /** + * The timestamp when this sample was retrieved, in ns. A value of 0 + * indicates that timestamp information could not be collected + */ + uint64_t timestamp; + + /** + * The ID of the process to which this record belongs to. In case of + * global scope, processId is undefined. + */ + uint32_t processId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityUnifiedMemoryCounter; + +/** + * \brief The activity record for Unified Memory counters (CUDA 7.0 and beyond) + * + * This activity record represents a Unified Memory counter + * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER + */ + CUpti_ActivityKind kind; + + /** + * The Unified Memory counter kind + */ + CUpti_ActivityUnifiedMemoryCounterKind counterKind; + + /** + * Value of the counter + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD, + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THREASHING and + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the + * memory region in bytes. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it + * is the number of page fault groups for the same page. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, + * it is the program counter for the instruction that caused fault. + */ + uint64_t value; + + /** + * The start timestamp of the counter, in ns. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is + * captured when activity starts on GPU. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is + * captured when CUDA driver started processing the fault. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp + * is captured when CUDA driver detected thrashing of memory region. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, + * timestamp is captured when throttling opeeration was started by CUDA driver. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, + * timestamp is captured when CUDA driver has pushed all required operations + * to the processor specified by dstId. + */ + uint64_t start; + + /** + * The end timestamp of the counter, in ns. + * Ignore this field if counterKind is + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is + * captured when activity finishes on GPU. + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is + * captured when CUDA driver queues the replay of faulting memory accesses on the GPU + * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp + * is captured when throttling operation was finished by CUDA driver + */ + uint64_t end; + + /** + * This is the virtual base address of the page/s being transferred. For cpu and + * gpu faults, the virtual address for the page that faulted. + */ + uint64_t address; + + /** + * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing, + * throttling or remote map operation. For counterKind + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the + * device IDs fighting for the memory region. Ignore this field if counterKind is + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT + */ + uint32_t srcId; + + /** + * The ID of the destination CPU/device involved in the memory transfer or remote map + * operation. Ignore this field if counterKind is + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING + */ + uint32_t dstId; + + /** + * The ID of the stream causing the transfer. + * This value of this field is invalid. + */ + uint32_t streamId; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType + * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT + * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD + * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag + * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or + * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING + */ + uint32_t flags; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityUnifiedMemoryCounter2; + +/** + * \brief The activity record for global/device functions. + * + * This activity records function name and corresponding module + * information. + * (CUPTI_ACTIVITY_KIND_FUNCTION). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_FUNCTION. + */ + CUpti_ActivityKind kind; + + /** + * ID to uniquely identify the record + */ + uint32_t id; + + /** + * The ID of the context where the function is launched. + */ + uint32_t contextId; + + /** + * The module ID in which this global/device function is present. + */ + uint32_t moduleId; + + /** + * The function's unique symbol index in the module. + */ + uint32_t functionIndex; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The name of the function. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; +} CUpti_ActivityFunction; + +/** + * \brief The activity record for a CUDA module. + * + * This activity record represents a CUDA module + * (CUPTI_ACTIVITY_KIND_MODULE). This activity record kind is not + * produced by the activity API but is included for completeness and + * ease-of-use. Profile frameworks built on top of CUPTI that collect + * module data from the module callback may choose to use this type to + * store the collected module data. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MODULE. + */ + CUpti_ActivityKind kind; + + /** + * The ID of the context where the module is loaded. + */ + uint32_t contextId; + + /** + * The module ID. + */ + uint32_t id; + + /** + * The cubin size. + */ + uint32_t cubinSize; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The pointer to cubin. + */ + const void *cubin; +} CUpti_ActivityModule; + +/** + * \brief The activity record for source-level shared + * access. + * + * This activity records the locations of the shared + * accesses in the source + * (CUPTI_ACTIVITY_KIND_SHARED_ACCESS). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_SHARED_ACCESS. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this shared access. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the access. + */ + uint32_t pcOffset; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction with predicate and condition code evaluating to true. + */ + uint64_t threadsExecuted; + + /** + * The total number of shared memory transactions generated by this access + */ + uint64_t sharedTransactions; + + /** + * The minimum number of shared memory transactions possible based on the access pattern. + */ + uint64_t theoreticalSharedTransactions; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * when at least one of thread among warp is active with predicate and condition code + * evaluating to true. + */ + uint32_t executed; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivitySharedAccess; + +/** + * \brief The activity record for CUDA event. + * + * This activity is used to track recorded events. + * (CUPTI_ACTIVITY_KIND_CUDA_EVENT). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT. + */ + CUpti_ActivityKind kind; + + /** + * The correlation ID of the API to which this result is associated. + */ + uint32_t correlationId; + + /** + * The ID of the context where the event was recorded. + */ + uint32_t contextId; + + /** + * The compute stream where the event was recorded. + */ + uint32_t streamId; + + /** + * A unique event ID to identify the event record. + */ + uint32_t eventId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityCudaEvent; + +/** + * \brief The activity record for CUDA stream. + * + * This activity is used to track created streams. + * (CUPTI_ACTIVITY_KIND_STREAM). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_STREAM. + */ + CUpti_ActivityKind kind; + /** + * The ID of the context where the stream was created. + */ + uint32_t contextId; + + /** + * A unique stream ID to identify the stream. + */ + uint32_t streamId; + + /** + * The clamped priority for the stream. + */ + uint32_t priority; + + /** + * Flags associated with the stream. + */ + CUpti_ActivityStreamFlag flag; + + /** + * The correlation ID of the API to which this result is associated. + */ + uint32_t correlationId; +} CUpti_ActivityStream; + +/** + * \brief The activity record for synchronization management. + * + * This activity is used to track various CUDA synchronization APIs. + * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION. + */ + CUpti_ActivityKind kind; + + /** + * The type of record. + */ + CUpti_ActivitySynchronizationType type; + + /** + * The start timestamp for the function, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the function. + */ + uint64_t start; + + /** + * The end timestamp for the function, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the function. + */ + uint64_t end; + + /** + * The correlation ID of the API to which this result is associated. + */ + uint32_t correlationId; + + /** + * The ID of the context for which the synchronization API is called. + * In case of context synchronization API it is the context id for which the API is called. + * In case of stream/event synchronization it is the ID of the context where the stream/event was created. + */ + uint32_t contextId; + + /** + * The compute stream for which the synchronization API is called. + * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicate the field is not applicable for this record. + * Not valid for cuCtxSynchronize, cuEventSynchronize. + */ + uint32_t streamId; + + /** + * The event ID for which the synchronization API is called. + * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicate the field is not applicable for this record. + * Not valid for cuCtxSynchronize, cuStreamSynchronize. + */ + uint32_t cudaEventId; +} CUpti_ActivitySynchronization; + + +/** + * \brief The activity record for source-level sass/source + * line-by-line correlation. + * + * This activity records source level sass/source correlation + * information. + * (CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the instruction. + */ + uint32_t pcOffset; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityInstructionCorrelation; + +/** + * \brief The OpenAcc event kind for OpenAcc activity records. + * + * \see CUpti_ActivityKindOpenAcc + */ +typedef enum { + CUPTI_OPENACC_EVENT_KIND_INVALID = 0, + CUPTI_OPENACC_EVENT_KIND_DEVICE_INIT = 1, + CUPTI_OPENACC_EVENT_KIND_DEVICE_SHUTDOWN = 2, + CUPTI_OPENACC_EVENT_KIND_RUNTIME_SHUTDOWN = 3, + CUPTI_OPENACC_EVENT_KIND_ENQUEUE_LAUNCH = 4, + CUPTI_OPENACC_EVENT_KIND_ENQUEUE_UPLOAD = 5, + CUPTI_OPENACC_EVENT_KIND_ENQUEUE_DOWNLOAD = 6, + CUPTI_OPENACC_EVENT_KIND_WAIT = 7, + CUPTI_OPENACC_EVENT_KIND_IMPLICIT_WAIT = 8, + CUPTI_OPENACC_EVENT_KIND_COMPUTE_CONSTRUCT = 9, + CUPTI_OPENACC_EVENT_KIND_UPDATE = 10, + CUPTI_OPENACC_EVENT_KIND_ENTER_DATA = 11, + CUPTI_OPENACC_EVENT_KIND_EXIT_DATA = 12, + CUPTI_OPENACC_EVENT_KIND_CREATE = 13, + CUPTI_OPENACC_EVENT_KIND_DELETE = 14, + CUPTI_OPENACC_EVENT_KIND_ALLOC = 15, + CUPTI_OPENACC_EVENT_KIND_FREE = 16, + CUPTI_OPENACC_EVENT_KIND_FORCE_INT = 0x7fffffff +} CUpti_OpenAccEventKind; + +/** + * \brief The OpenAcc parent construct kind for OpenAcc activity records. + */ +typedef enum { + CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN = 0, + CUPTI_OPENACC_CONSTRUCT_KIND_PARALLEL = 1, + CUPTI_OPENACC_CONSTRUCT_KIND_KERNELS = 2, + CUPTI_OPENACC_CONSTRUCT_KIND_LOOP = 3, + CUPTI_OPENACC_CONSTRUCT_KIND_DATA = 4, + CUPTI_OPENACC_CONSTRUCT_KIND_ENTER_DATA = 5, + CUPTI_OPENACC_CONSTRUCT_KIND_EXIT_DATA = 6, + CUPTI_OPENACC_CONSTRUCT_KIND_HOST_DATA = 7, + CUPTI_OPENACC_CONSTRUCT_KIND_ATOMIC = 8, + CUPTI_OPENACC_CONSTRUCT_KIND_DECLARE = 9, + CUPTI_OPENACC_CONSTRUCT_KIND_INIT = 10, + CUPTI_OPENACC_CONSTRUCT_KIND_SHUTDOWN = 11, + CUPTI_OPENACC_CONSTRUCT_KIND_SET = 12, + CUPTI_OPENACC_CONSTRUCT_KIND_UPDATE = 13, + CUPTI_OPENACC_CONSTRUCT_KIND_ROUTINE = 14, + CUPTI_OPENACC_CONSTRUCT_KIND_WAIT = 15, + CUPTI_OPENACC_CONSTRUCT_KIND_RUNTIME_API = 16, + CUPTI_OPENACC_CONSTRUCT_KIND_FORCE_INT = 0x7fffffff + +} CUpti_OpenAccConstructKind; + +typedef enum { + CUPTI_OPENMP_EVENT_KIND_INVALID = 0, + CUPTI_OPENMP_EVENT_KIND_PARALLEL = 1, + CUPTI_OPENMP_EVENT_KIND_TASK = 2, + CUPTI_OPENMP_EVENT_KIND_THREAD = 3, + CUPTI_OPENMP_EVENT_KIND_IDLE = 4, + CUPTI_OPENMP_EVENT_KIND_WAIT_BARRIER = 5, + CUPTI_OPENMP_EVENT_KIND_WAIT_TASKWAIT = 6, + CUPTI_OPENMP_EVENT_KIND_FORCE_INT = 0x7fffffff +} CUpti_OpenMpEventKind; + +/** + * \brief The base activity record for OpenAcc records. + * + * The OpenACC activity API part uses a CUpti_ActivityOpenAcc as a generic + * representation for any OpenACC activity. The 'kind' field is used to determine the + * specific activity kind, and from that the CUpti_ActivityOpenAcc object can + * be cast to the specific OpenACC activity record type appropriate for that kind. + * + * Note that all OpenACC activity record types are padded and aligned to + * ensure that each member of the record is naturally aligned. + * + * \see CUpti_ActivityKind + */ +typedef struct PACKED_ALIGNMENT { + /** + * The kind of this activity. + */ + CUpti_ActivityKind kind; + + /** + * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind) + */ + CUpti_OpenAccEventKind eventKind; + + /** + * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind) + * + * Note that for applications using PGI OpenACC runtime < 16.1, this + * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN. + */ + CUpti_OpenAccConstructKind parentConstruct; + + /* + * Version number + */ + uint32_t version; + + /* + * 1 for any implicit event, such as an implicit wait at a synchronous data construct + * 0 otherwise + */ + uint32_t implicit; + + /* + * Device type + */ + uint32_t deviceType; + + /* + * Device number + */ + uint32_t deviceNumber; + + /** + * ThreadId + */ + uint32_t threadId; + + /* + * Value of async() clause of the corresponding directive + */ + uint64_t async; + + /* + * Internal asynchronous queue number used + */ + uint64_t asyncMap; + + /* + * The line number of the directive or program construct or the starting line + * number of the OpenACC construct corresponding to the event. + * A zero value means the line number is not known. + */ + uint32_t lineNo; + + /* + * For an OpenACC construct, this contains the line number of the end + * of the construct. A zero value means the line number is not known. + */ + uint32_t endLineNo; + + /* + * The line number of the first line of the function named in funcName. + * A zero value means the line number is not known. + */ + uint32_t funcLineNo; + + /* + * The last line number of the function named in funcName. + * A zero value means the line number is not known. + */ + uint32_t funcEndLineNo; + + /** + * CUPTI start timestamp + */ + uint64_t start; + + /** + * CUPTI end timestamp + */ + uint64_t end; + + /** + * CUDA device id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuDeviceId; + + /** + * CUDA context id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuContextId; + + /** + * CUDA stream id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuStreamId; + + /** + * The ID of the process where the OpenACC activity is executing. + */ + uint32_t cuProcessId; + + /** + * The ID of the thread where the OpenACC activity is executing. + */ + uint32_t cuThreadId; + + /** + * The OpenACC correlation ID. + * Valid only if deviceType is acc_device_nvidia. + * If not 0, it uniquely identifies this record. It is identical to the + * externalId in the preceeding external correlation record of type + * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC. + */ + uint32_t externalId; + + /* + * A pointer to null-terminated string containing the name of or path to + * the source file, if known, or a null pointer if not. + */ + const char *srcFile; + + /* + * A pointer to a null-terminated string containing the name of the + * function in which the event occurred. + */ + const char *funcName; +} CUpti_ActivityOpenAcc; + +/** + * \brief The activity record for OpenACC data. + * + * (CUPTI_ACTIVITY_KIND_OPENACC_DATA). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_DATA. + */ + CUpti_ActivityKind kind; + + /** + * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind) + */ + CUpti_OpenAccEventKind eventKind; + + /* + * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind) + * + * Note that for applications using PGI OpenACC runtime < 16.1, this + * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN. + */ + CUpti_OpenAccConstructKind parentConstruct; + + /* + * Version number + */ + uint32_t version; + + /* + * 1 for any implicit event, such as an implicit wait at a synchronous data construct + * 0 otherwise + */ + uint32_t implicit; + + /* + * Device type + */ + uint32_t deviceType; + + /* + * Device number + */ + uint32_t deviceNumber; + + /** + * ThreadId + */ + uint32_t threadId; + + /* + * Value of async() clause of the corresponding directive + */ + uint64_t async; + + /* + * Internal asynchronous queue number used + */ + uint64_t asyncMap; + + /* + * The line number of the directive or program construct or the starting line + * number of the OpenACC construct corresponding to the event. + * A negative or zero value means the line number is not known. + */ + uint32_t lineNo; + + /* + * For an OpenACC construct, this contains the line number of the end + * of the construct. A negative or zero value means the line number is not known. + */ + uint32_t endLineNo; + + /* + * The line number of the first line of the function named in func_name. + * A negative or zero value means the line number is not known. + */ + uint32_t funcLineNo; + + /* + * The last line number of the function named in func_name. + * A negative or zero value means the line number is not known. + */ + uint32_t funcEndLineNo; + + /** + * CUPTI start timestamp + */ + uint64_t start; + + /** + * CUPTI end timestamp + */ + uint64_t end; + + /** + * CUDA device id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuDeviceId; + + /** + * CUDA context id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuContextId; + + /** + * CUDA stream id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuStreamId; + + /** + * The ID of the process where the OpenACC activity is executing. + */ + uint32_t cuProcessId; + + /** + * The ID of the thread where the OpenACC activity is executing. + */ + uint32_t cuThreadId; + + /** + * The OpenACC correlation ID. + * Valid only if deviceType is acc_device_nvidia. + * If not 0, it uniquely identifies this record. It is identical to the + * externalId in the preceeding external correlation record of type + * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC. + */ + uint32_t externalId; + + /* + * A pointer to null-terminated string containing the name of or path to + * the source file, if known, or a null pointer if not. + */ + const char *srcFile; + + /* + * A pointer to a null-terminated string containing the name of the + * function in which the event occurred. + */ + const char *funcName; + + /* --- end of common CUpti_ActivityOpenAcc part --- */ + + /** + * Number of bytes + */ + uint64_t bytes; + + /** + * Host pointer if available + */ + uint64_t hostPtr; + + /** + * Device pointer if available + */ + uint64_t devicePtr; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad1; +#endif + + /* + * A pointer to null-terminated string containing the name of the variable + * for which this event is triggered, if known, or a null pointer if not. + */ + const char *varName; + +} CUpti_ActivityOpenAccData; + +/** + * \brief The activity record for OpenACC launch. + * + * (CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH. + */ + CUpti_ActivityKind kind; + + /** + * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind) + */ + CUpti_OpenAccEventKind eventKind; + + /* + * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind) + * + * Note that for applications using PGI OpenACC runtime < 16.1, this + * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN. + */ + CUpti_OpenAccConstructKind parentConstruct; + + /* + * Version number + */ + uint32_t version; + + /* + * 1 for any implicit event, such as an implicit wait at a synchronous data construct + * 0 otherwise + */ + uint32_t implicit; + + /* + * Device type + */ + uint32_t deviceType; + + /* + * Device number + */ + uint32_t deviceNumber; + + /** + * ThreadId + */ + uint32_t threadId; + + /* + * Value of async() clause of the corresponding directive + */ + uint64_t async; + + /* + * Internal asynchronous queue number used + */ + uint64_t asyncMap; + + /* + * The line number of the directive or program construct or the starting line + * number of the OpenACC construct corresponding to the event. + * A negative or zero value means the line number is not known. + */ + uint32_t lineNo; + + /* + * For an OpenACC construct, this contains the line number of the end + * of the construct. A negative or zero value means the line number is not known. + */ + uint32_t endLineNo; + + /* + * The line number of the first line of the function named in func_name. + * A negative or zero value means the line number is not known. + */ + uint32_t funcLineNo; + + /* + * The last line number of the function named in func_name. + * A negative or zero value means the line number is not known. + */ + uint32_t funcEndLineNo; + + /** + * CUPTI start timestamp + */ + uint64_t start; + + /** + * CUPTI end timestamp + */ + uint64_t end; + + /** + * CUDA device id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuDeviceId; + + /** + * CUDA context id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuContextId; + + /** + * CUDA stream id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuStreamId; + + /** + * The ID of the process where the OpenACC activity is executing. + */ + uint32_t cuProcessId; + + /** + * The ID of the thread where the OpenACC activity is executing. + */ + uint32_t cuThreadId; + + /** + * The OpenACC correlation ID. + * Valid only if deviceType is acc_device_nvidia. + * If not 0, it uniquely identifies this record. It is identical to the + * externalId in the preceeding external correlation record of type + * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC. + */ + uint32_t externalId; + + /* + * A pointer to null-terminated string containing the name of or path to + * the source file, if known, or a null pointer if not. + */ + const char *srcFile; + + /* + * A pointer to a null-terminated string containing the name of the + * function in which the event occurred. + */ + const char *funcName; + + /* --- end of common CUpti_ActivityOpenAcc part --- */ + + /** + * The number of gangs created for this kernel launch + */ + uint64_t numGangs; + + /** + * The number of workers created for this kernel launch + */ + uint64_t numWorkers; + + /** + * The number of vector lanes created for this kernel launch + */ + uint64_t vectorLength; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad1; +#endif + + /* + * A pointer to null-terminated string containing the name of the + * kernel being launched, if known, or a null pointer if not. + */ + const char *kernelName; + +} CUpti_ActivityOpenAccLaunch; + +/** + * \brief The activity record for OpenACC other. + * + * (CUPTI_ACTIVITY_KIND_OPENACC_OTHER). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_OTHER. + */ + CUpti_ActivityKind kind; + + /** + * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind) + */ + CUpti_OpenAccEventKind eventKind; + + /* + * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind) + * + * Note that for applications using PGI OpenACC runtime < 16.1, this + * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN. + */ + CUpti_OpenAccConstructKind parentConstruct; + + /* + * Version number + */ + uint32_t version; + + /* + * 1 for any implicit event, such as an implicit wait at a synchronous data construct + * 0 otherwise + */ + uint32_t implicit; + + /* + * Device type + */ + uint32_t deviceType; + + /* + * Device number + */ + uint32_t deviceNumber; + + /** + * ThreadId + */ + uint32_t threadId; + + /* + * Value of async() clause of the corresponding directive + */ + uint64_t async; + + /* + * Internal asynchronous queue number used + */ + uint64_t asyncMap; + + /* + * The line number of the directive or program construct or the starting line + * number of the OpenACC construct corresponding to the event. + * A negative or zero value means the line number is not known. + */ + uint32_t lineNo; + + /* + * For an OpenACC construct, this contains the line number of the end + * of the construct. A negative or zero value means the line number is not known. + */ + uint32_t endLineNo; + + /* + * The line number of the first line of the function named in func_name. + * A negative or zero value means the line number is not known. + */ + uint32_t funcLineNo; + + /* + * The last line number of the function named in func_name. + * A negative or zero value means the line number is not known. + */ + uint32_t funcEndLineNo; + + /** + * CUPTI start timestamp + */ + uint64_t start; + + /** + * CUPTI end timestamp + */ + uint64_t end; + + /** + * CUDA device id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuDeviceId; + + /** + * CUDA context id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuContextId; + + /** + * CUDA stream id + * Valid only if deviceType is acc_device_nvidia. + */ + uint32_t cuStreamId; + + /** + * The ID of the process where the OpenACC activity is executing. + */ + uint32_t cuProcessId; + + /** + * The ID of the thread where the OpenACC activity is executing. + */ + uint32_t cuThreadId; + + /** + * The OpenACC correlation ID. + * Valid only if deviceType is acc_device_nvidia. + * If not 0, it uniquely identifies this record. It is identical to the + * externalId in the preceeding external correlation record of type + * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC. + */ + uint32_t externalId; + + /* + * A pointer to null-terminated string containing the name of or path to + * the source file, if known, or a null pointer if not. + */ + const char *srcFile; + + /* + * A pointer to a null-terminated string containing the name of the + * function in which the event occurred. + */ + const char *funcName; + + /* --- end of common CUpti_ActivityOpenAcc part --- */ +} CUpti_ActivityOpenAccOther; + + +/** + * \brief The base activity record for OpenMp records. + * + * \see CUpti_ActivityKind + */ +typedef struct PACKED_ALIGNMENT { + + /** + * The kind of this activity. + */ + CUpti_ActivityKind kind; + + /** + * CUPTI OpenMP event kind (\see CUpti_OpenMpEventKind) + */ + CUpti_OpenMpEventKind eventKind; + + /* + * Version number + */ + uint32_t version; + + /** + * ThreadId + */ + uint32_t threadId; + + /** + * CUPTI start timestamp + */ + uint64_t start; + + /** + * CUPTI end timestamp + */ + uint64_t end; + + /** + * The ID of the process where the OpenMP activity is executing. + */ + uint32_t cuProcessId; + + /** + * The ID of the thread where the OpenMP activity is executing. + */ + uint32_t cuThreadId; + +} CUpti_ActivityOpenMp; + +/** + * \brief The kind of external APIs supported for correlation. + * + * Custom correlation kinds are reserved for usage in external tools. + * + * \see CUpti_ActivityExternalCorrelation + */ +typedef enum { + CUPTI_EXTERNAL_CORRELATION_KIND_INVALID = 0, + + /** + * The external API is unknown to CUPTI + */ + CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN = 1, + + /** + * The external API is OpenACC + */ + CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC = 2, + + /** + * The external API is custom0 + */ + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0 = 3, + + /** + * The external API is custom1 + */ + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1 = 4, + + /** + * The external API is custom2 + */ + CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM2 = 5, + + /** + * Add new kinds before this line + */ + CUPTI_EXTERNAL_CORRELATION_KIND_SIZE, + + CUPTI_EXTERNAL_CORRELATION_KIND_FORCE_INT = 0x7fffffff +} CUpti_ExternalCorrelationKind; + +/** + * \brief The activity record for correlation with external records + * + * This activity record correlates native CUDA records (e.g. CUDA Driver API, + * kernels, memcpys, ...) with records from external APIs such as OpenACC. + * (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION). + * + * \see CUpti_ActivityKind + */ +typedef struct PACKED_ALIGNMENT { + /** + * The kind of this activity. + */ + CUpti_ActivityKind kind; + + /** + * The kind of external API this record correlated to. + */ + CUpti_ExternalCorrelationKind externalKind; + + /** + * The correlation ID of the associated non-CUDA API record. + * The exact field in the associated external record depends + * on that record's activity kind (\see externalKind). + */ + uint64_t externalId; + + /** + * The correlation ID of the associated CUDA driver or runtime API record. + */ + uint32_t correlationId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t reserved; +} CUpti_ActivityExternalCorrelation; + +/** +* \brief The device type for device connected to NVLink. +*/ +typedef enum { + CUPTI_DEV_TYPE_INVALID = 0, + /** + * The device type is GPU. + */ + CUPTI_DEV_TYPE_GPU = 1, + /** + * The device type is NVLink processing unit in CPU. + */ + CUPTI_DEV_TYPE_NPU = 2, + CUPTI_DEV_TYPE_FORCE_INT = 0x7fffffff +} CUpti_DevType; + +/** +* \brief NVLink information. (deprecated in CUDA 9.0) +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +* NVLink information are now reported using the +* CUpti_ActivityNvLink2 activity record. +*/ + +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + /** + * NVLink version. + */ + uint32_t nvlinkVersion; + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + /** + * Port numbers for maximum 4 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[4]; + /** + * Port numbers for maximum 4 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[4]; + /** + * Banwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; +} CUpti_ActivityNvLink; + +/** +* \brief NVLink information. (deprecated in CUDA 10.0) +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +* NvLink information are now reported using the +* CUpti_ActivityNvLink4 activity record. +*/ + +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + /** + * NvLink version. + */ + uint32_t nvlinkVersion; + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + /** + * Port numbers for maximum 16 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[CUPTI_MAX_NVLINK_PORTS]; + /** + * Port numbers for maximum 16 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[CUPTI_MAX_NVLINK_PORTS]; + /** + * Banwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; +} CUpti_ActivityNvLink2; + +/** +* \brief NVLink information. +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +* NvLink information are now reported using the +* CUpti_ActivityNvLink4 activity record. +*/ + +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + /** + * NvLink version. + */ + uint32_t nvlinkVersion; + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + /** + * Port numbers for maximum 16 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[CUPTI_MAX_NVLINK_PORTS]; + /** + * Port numbers for maximum 16 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[CUPTI_MAX_NVLINK_PORTS]; + /** + * Banwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; + /** + * NVSwitch is connected as an intermediate node. + */ + uint8_t nvswitchConnected; + /** + * Undefined. reserved for internal use + */ + uint8_t pad[7]; +} CUpti_ActivityNvLink3; + +/** +* \brief NVLink information. +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +*/ + +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + /** + * NvLink version. + */ + uint32_t nvlinkVersion; + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice4. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice4. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + /** + * Port numbers for maximum 32 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[CUPTI_MAX_NVLINK_PORTS]; + /** + * Port numbers for maximum 32 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[CUPTI_MAX_NVLINK_PORTS]; + /** + * Banwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; + /** + * NVSwitch is connected as an intermediate node. + */ + uint8_t nvswitchConnected; + /** + * Undefined. reserved for internal use + */ + uint8_t pad[7]; +} CUpti_ActivityNvLink4; + +#define CUPTI_MAX_GPUS 32 +/** + * Field to differentiate whether PCIE Activity record + * is of a GPU or a PCI Bridge + */ +typedef enum { + /** + * PCIE GPU record + */ + CUPTI_PCIE_DEVICE_TYPE_GPU = 0, + + /** + * PCIE Bridge record + */ + CUPTI_PCIE_DEVICE_TYPE_BRIDGE = 1, + + CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 0x7fffffff +} CUpti_PcieDeviceType; + +/** + * \brief PCI devices information required to construct topology + * + * This structure gives capabilities of GPU and PCI bridge connected to the PCIE bus + * which can be used to understand the topology. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PCIE. + */ + CUpti_ActivityKind kind; + /** + * Type of device in topology, \ref CUpti_PcieDeviceType. If type is + * CUPTI_PCIE_DEVICE_TYPE_GPU use devId for id and gpuAttr and if type is + * CUPTI_PCIE_DEVICE_TYPE_BRIDGE use bridgeId for id and bridgeAttr. + */ + CUpti_PcieDeviceType type; + /** + * A unique identifier for GPU or Bridge in Topology + */ + union { + /** + * GPU device ID + */ + CUdevice devId; + /** + * A unique identifier for Bridge in the Topology + */ + uint32_t bridgeId; + } id; + + /** + * Domain for the GPU or Bridge, required to identify which PCIE bus it belongs to in + * multiple NUMA systems. + */ + uint32_t domain; + /** + * PCIE Generation of GPU or Bridge. + */ + uint16_t pcieGeneration; + /** + * Link rate of the GPU or bridge in gigatransfers per second (GT/s) + */ + uint16_t linkRate; + /** + * Link width of the GPU or bridge + */ + uint16_t linkWidth; + + /** + * Upstream bus ID for the GPU or PCI bridge. Required to identify which bus it is + * connected to in the topology. + */ + uint16_t upstreamBus; + + /** + * Attributes for more information about GPU (gpuAttr) or PCI Bridge (bridgeAttr) + */ + union { + struct { + /** + * UUID for the device. \ref CUpti_ActivityDevice4. + */ + CUuuid uuidDev; + /** + * CUdevice with which this device has P2P capability. + * This can also be obtained by querying cuDeviceCanAccessPeer or + * cudaDeviceCanAccessPeer APIs + */ + CUdevice peerDev[CUPTI_MAX_GPUS]; + } gpuAttr; + + struct { + /** + * The downstream bus number, used to search downstream devices/bridges connected + * to this bridge. + */ + uint16_t secondaryBus; + /** + * Device ID of the bridge + */ + uint16_t deviceId; + /** + * Vendor ID of the bridge + */ + uint16_t vendorId; + /** + * Padding for alignment + */ + uint16_t pad0; + } bridgeAttr; + } attr; +} CUpti_ActivityPcie; + +/** + * \brief PCIE Generation. + * + * Enumeration of PCIE Generation for + * pcie activity attribute pcieGeneration + */ +typedef enum { + /** + * PCIE Generation 1 + */ + CUPTI_PCIE_GEN_GEN1 = 1, + /** + * PCIE Generation 2 + */ + CUPTI_PCIE_GEN_GEN2 = 2, + /** + * PCIE Generation 3 + */ + CUPTI_PCIE_GEN_GEN3 = 3, + /** + * PCIE Generation 4 + */ + CUPTI_PCIE_GEN_GEN4 = 4, + /** + * PCIE Generation 5 + */ + CUPTI_PCIE_GEN_GEN5 = 5, + + CUPTI_PCIE_GEN_FORCE_INT = 0x7fffffff +} CUpti_PcieGen; + +/** + * \brief The activity record for an instantaneous CUPTI event. + * + * This activity record represents a CUPTI event value + * (CUPTI_ACTIVITY_KIND_EVENT) sampled at a particular instant. + * This activity record kind is not produced by the activity API but is + * included for completeness and ease-of-use. Profiler frameworks built on + * top of CUPTI that collect event data at a particular time may choose to + * use this type to store the collected event data. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT. + */ + CUpti_ActivityKind kind; + + /** + * The event ID. + */ + CUpti_EventID id; + + /** + * The event value. + */ + uint64_t value; + + /** + * The timestamp at which event is sampled + */ + uint64_t timestamp; + + /** + * The device id + */ + uint32_t deviceId; + /** + * Undefined. reserved for internal use + */ + uint32_t reserved; +} CUpti_ActivityInstantaneousEvent; + +/** + * \brief The activity record for an instantaneous CUPTI event + * with event domain instance information. + * + * This activity record represents the a CUPTI event value for a + * specific event domain instance + * (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE) sampled at a particular instant. + * This activity record kind is not produced by the activity API but is + * included for completeness and ease-of-use. Profiler frameworks built on + * top of CUPTI that collect event data may choose to use this type to store the + * collected event data. This activity record should be used when + * event domain instance information needs to be associated with the + * event. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE. + */ + CUpti_ActivityKind kind; + + /** + * The event ID. + */ + CUpti_EventID id; + + /** + * The event value. + */ + uint64_t value; + + /** + * The timestamp at which event is sampled + */ + uint64_t timestamp; + + /** + * The device id + */ + uint32_t deviceId; + /** + * The event domain instance + */ + uint8_t instance; + /** + * Undefined. reserved for internal use + */ + uint8_t pad[3]; +} CUpti_ActivityInstantaneousEventInstance; + +/** + * \brief The activity record for an instantaneous CUPTI metric. + * + * This activity record represents the collection of a CUPTI metric + * value (CUPTI_ACTIVITY_KIND_METRIC) at a particular instance. + * This activity record kind is not produced by the activity API but + * is included for completeness and ease-of-use. Profiler frameworks built + * on top of CUPTI that collect metric data may choose to use this type to + * store the collected metric data. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC. + */ + CUpti_ActivityKind kind; + + /** + * The metric ID. + */ + CUpti_MetricID id; + + /** + * The metric value. + */ + CUpti_MetricValue value; + + /** + * The timestamp at which metric is sampled + */ + uint64_t timestamp; + + /** + * The device id + */ + uint32_t deviceId; + + /** + * The properties of this metric. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * Undefined. reserved for internal use + */ + uint8_t pad[3]; +} CUpti_ActivityInstantaneousMetric; + +/** + * \brief The instantaneous activity record for a CUPTI metric with instance + * information. + + * This activity record represents a CUPTI metric value + * for a specific metric domain instance + * (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE) sampled at a particular time. This + * activity record kind is not produced by the activity API but is included for + * completeness and ease-of-use. Profiler frameworks built on top of + * CUPTI that collect metric data may choose to use this type to store + * the collected metric data. This activity record should be used when + * metric domain instance information needs to be associated with the + * metric. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE. + */ + CUpti_ActivityKind kind; + + /** + * The metric ID. + */ + CUpti_MetricID id; + + /** + * The metric value. + */ + CUpti_MetricValue value; + + /** + * The timestamp at which metric is sampled + */ + uint64_t timestamp; + + /** + * The device id + */ + uint32_t deviceId; + + /** + * The properties of this metric. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The metric domain instance + */ + uint8_t instance; + /** + * Undefined. reserved for internal use + */ + uint8_t pad[2]; +} CUpti_ActivityInstantaneousMetricInstance; + +/** + * \brief The types of JIT entry. + * + * To be used in CUpti_ActivityJit. + */ +typedef enum { + CUPTI_ACTIVITY_JIT_ENTRY_INVALID= 0, + /** + * PTX to CUBIN. + */ + CUPTI_ACTIVITY_JIT_ENTRY_PTX_TO_CUBIN = 1, + /** + * NVVM-IR to PTX + */ + CUPTI_ACTIVITY_JIT_ENTRY_NVVM_IR_TO_PTX = 2, + + CUPTI_ACTIVITY_JIT_ENTRY_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityJitEntryType; + +/** + * \brief The types of JIT compilation operations. + * + * To be used in CUpti_ActivityJit. + */ + +typedef enum { + CUPTI_ACTIVITY_JIT_OPERATION_INVALID = 0, + /** + * Loaded from the compute cache. + */ + CUPTI_ACTIVITY_JIT_OPERATION_CACHE_LOAD = 1, + /** + * Stored in the compute cache. + */ + CUPTI_ACTIVITY_JIT_OPERATION_CACHE_STORE = 2, + /** + * JIT compilation. + */ + CUPTI_ACTIVITY_JIT_OPERATION_COMPILE = 3, + + CUPTI_ACTIVITY_JIT_OPERATION_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityJitOperationType; + +/** + * \brief The activity record for JIT operations. + * This activity represents the JIT operations (compile, load, store) of a CUmodule + * from the Compute Cache. + * Gives the exact hashed path of where the cached module is loaded from, + * or where the module will be stored after Just-In-Time (JIT) compilation. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind must be CUPTI_ACTIVITY_KIND_JIT. + */ + CUpti_ActivityKind kind; + /** + * The JIT entry type. + */ + CUpti_ActivityJitEntryType jitEntryType; + /** + * The JIT operation type. + */ + CUpti_ActivityJitOperationType jitOperationType; + /** + * The device ID. + */ + uint32_t deviceId; + /** + * The start timestamp for the JIT operation, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the JIT operation. + */ + uint64_t start; + /** + * The end timestamp for the JIT operation, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the JIT operation. + */ + uint64_t end; + /** + * The correlation ID of the JIT operation to which + * records belong to. Each JIT operation is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the JIT operation. + */ + uint32_t correlationId; + /** + * Internal use. + */ + uint32_t padding; + /** + * The correlation ID to correlate JIT compilation, load and store operations. + * Each JIT compilation unit is assigned a unique correlation ID + * at the time of the JIT compilation. This correlation id can be used + * to find the matching JIT cache load/store records. + */ + uint64_t jitOperationCorrelationId; + /** + * The size of compute cache. + */ + uint64_t cacheSize; + /** + * The path where the fat binary is cached. + */ + const char* cachePath; +} CUpti_ActivityJit; + + +/** + * \brief The activity record for trace of graph execution. + * + * This activity record represents execution for a graph without giving visibility + * about the execution of its nodes. This is intended to reduce overheads in tracing + * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE + */ +typedef struct { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE + */ + CUpti_ActivityKind kind; + + /** + * The correlation ID of the graph launch. Each graph launch is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the graph. + */ + uint32_t correlationId; + + /** + * The start timestamp for the graph execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the graph. + */ + uint64_t start; + + /** + * The end timestamp for the graph execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the graph. + */ + uint64_t end; + + /** + * The ID of the device where the graph execution is occurring. + */ + uint32_t deviceId; + + /** + * The unique ID of the graph that is launched. + */ + uint32_t graphId; + + /** + * The ID of the context where the graph is being launched. + */ + uint32_t contextId; + + /** + * The ID of the stream where the graph is being launched. + */ + uint32_t streamId; + + /** + * This field is reserved for internal use + */ + void *reserved; +} CUpti_ActivityGraphTrace; + +END_PACKED_ALIGNMENT + +/** + * \brief Activity attributes. + * + * These attributes are used to control the behavior of the activity + * API. + */ +typedef enum { + /** + * The device memory size (in bytes) reserved for storing profiling data for concurrent + * kernels (activity kind \ref CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), memcopies and memsets + * for each buffer on a context. The value is a size_t. + * + * There is a limit on how many device buffers can be allocated per context. User + * can query and set this limit using the attribute + * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT. + * CUPTI doesn't pre-allocate all the buffers, it pre-allocates only those many + * buffers as set by the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE. + * When all of the data in a buffer is consumed, it is added in the reuse pool, and + * CUPTI picks a buffer from this pool when a new buffer is needed. Thus memory + * footprint does not scale with the kernel count. Applications with the high density + * of kernels, memcopies and memsets might result in having CUPTI to allocate more device buffers. + * CUPTI allocates another buffer only when it runs out of the buffers in the + * reuse pool. + * + * Since buffer allocation happens in the main application thread, this might result + * in stalls in the critical path. CUPTI pre-allocates 3 buffers of the same size to + * mitigate this issue. User can query and set the pre-allocation limit using the + * attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE. + * + * Having larger buffer size leaves less device memory for the application. + * Having smaller buffer size increases the risk of dropping timestamps for + * records if too many kernels or memcopies or memsets are launched at one time. + * + * This value only applies to new buffer allocations. Set this value before initializing + * CUDA or before creating a context to ensure it is considered for the following allocations. + * + * The default value is 3200000 (~3MB) which can accommodate profiling data + * up to 100,000 kernels, memcopies and memsets combined. + * + * Note: Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the + * pinned host memory by default as this might help in improving the performance of the + * tracing run. Refer to the description of the attribute + * \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED for more details. + * Size of the memory and maximum number of pools are still controlled by the attributes + * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE and \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT. + * + * Note: The actual amount of device memory per buffer reserved by CUPTI might be larger. + */ + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE = 0, + /** + * The device memory size (in bytes) reserved for storing profiling + * data for CDP operations for each buffer on a context. The + * value is a size_t. + * + * Having larger buffer size means less flush operations but + * consumes more device memory. This value only applies to new + * allocations. + * + * Set this value before initializing CUDA or before creating a + * context to ensure it is considered for the following allocations. + * + * The default value is 8388608 (8MB). + * + * Note: The actual amount of device memory per context reserved by + * CUPTI might be larger. + */ + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP = 1, + /** + * The maximum number of device memory buffers per context. The value is a size_t. + * + * For an application with high rate of kernel launches, memcopies and memsets having a bigger pool + * limit helps in timestamp collection for all these activties at the expense of a larger memory footprint. + * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE + * for more details. + * + * Setting this value will not modify the number of memory buffers + * currently stored. + * + * Set this value before initializing CUDA to ensure the limit is + * not exceeded. + * + * The default value is 250. + */ + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT = 2, + + /** + * The profiling semaphore pool size reserved for storing profiling data for + * serialized kernels tracing (activity kind \ref CUPTI_ACTIVITY_KIND_KERNEL) + * for each context. The value is a size_t. + * + * There is a limit on how many semaphore pools can be allocated per context. User + * can query and set this limit using the attribute + * \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT. + * CUPTI doesn't pre-allocate all the semaphore pools, it pre-allocates only those many + * semaphore pools as set by the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE. + * When all of the data in a semaphore pool is consumed, it is added in the reuse pool, and + * CUPTI picks a semaphore pool from the reuse pool when a new semaphore pool is needed. Thus memory + * footprint does not scale with the kernel count. Applications with the high density + * of kernels might result in having CUPTI to allocate more semaphore pools. + * CUPTI allocates another semaphore pool only when it runs out of the semaphore pools in the + * reuse pool. + * + * Since semaphore pool allocation happens in the main application thread, this might result + * in stalls in the critical path. CUPTI pre-allocates 3 semaphore pools of the same size to + * mitigate this issue. User can query and set the pre-allocation limit using the + * attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE. + * + * Having larger semaphore pool size leaves less device memory for the application. + * Having smaller semaphore pool size increases the risk of dropping timestamps for + * kernel records if too many kernels are issued/launched at one time. + * + * This value only applies to new semaphore pool allocations. Set this value before initializing + * CUDA or before creating a context to ensure it is considered for the following allocations. + * + * The default value is 25000 which can accommodate profiling data for upto 25,000 kernels. + * + */ + CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE = 3, + /** + * The maximum number of profiling semaphore pools per context. The value is a size_t. + * + * For an application with high rate of kernel launches, having a bigger + * pool limit helps in timestamp collection for all the kernels, at the + * expense of a larger device memory footprint. + * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE + * for more details. + * + * Set this value before initializing CUDA to ensure the limit is not exceeded. + * + * The default value is 250. + */ + CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT = 4, + + /** + * The flag to indicate whether user should provide activity buffer of zero value. + * The value is a uint8_t. + * + * If the value of this attribute is non-zero, user should provide + * a zero value buffer in the \ref CUpti_BuffersCallbackRequestFunc. + * If the user does not provide a zero value buffer after setting this to non-zero, + * the activity buffer may contain some uninitialized values when CUPTI returns it in + * \ref CUpti_BuffersCallbackCompleteFunc + * + * If the value of this attribute is zero, CUPTI will initialize the user buffer + * received in the \ref CUpti_BuffersCallbackRequestFunc to zero before filling it. + * If the user sets this to zero, a few stalls may appear in critical path because CUPTI + * will zero out the buffer in the main thread. + * Set this value before returning from \ref CUpti_BuffersCallbackRequestFunc to + * ensure it is considered for all the subsequent user buffers. + * + * The default value is 0. + */ + CUPTI_ACTIVITY_ATTR_ZEROED_OUT_ACTIVITY_BUFFER = 5, + + /** + * Number of device buffers to pre-allocate for a context during the initialization phase. + * The value is a size_t. + * + * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE + * for details. + * + * This value must be less than the maximum number of device buffers set using + * the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT + * + * Set this value before initializing CUDA or before creating a context to ensure it + * is considered by the CUPTI. + * + * The default value is set to 3 to ping pong between these buffers (if possible). + */ + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE = 6, + + /** + * Number of profiling semaphore pools to pre-allocate for a context during the + * initialization phase. The value is a size_t. + * + * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE + * for details. + * + * This value must be less than the maximum number of profiling semaphore pools set + * using the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT + * + * Set this value before initializing CUDA or before creating a context to ensure it + * is considered by the CUPTI. + * + * The default value is set to 3 to ping pong between these pools (if possible). + */ + CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE = 7, + + /** + * Allocate page-locked (pinned) host memory for storing profiling data for concurrent + * kernels, memcopies and memsets for each buffer on a context. The value is a uint8_t. + * + * Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the pinned host + * memory by default as this might help in improving the performance of the tracing run. + * Allocating excessive amounts of pinned memory may degrade system performance, since it + * reduces the amount of memory available to the system for paging. For this reason user + * might want to change the location from pinned host memory to device memory by setting + * value of this attribute to 0. + * + * The default value is 1. + */ + CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED = 8, + + + CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT = 0x7fffffff +} CUpti_ActivityAttribute; + +/** + * \brief Thread-Id types. + * + * CUPTI uses different methods to obtain the thread-id depending on the + * support and the underlying platform. This enum documents these methods + * for each type. APIs \ref cuptiSetThreadIdType and \ref cuptiGetThreadIdType + * can be used to set and get the thread-id type. + */ +typedef enum { + /** + * Default type + * Windows uses API GetCurrentThreadId() + * Linux/Mac/Android/QNX use POSIX pthread API pthread_self() + */ + CUPTI_ACTIVITY_THREAD_ID_TYPE_DEFAULT = 0, + + /** + * This type is based on the system API available on the underlying platform + * and thread-id obtained is supposed to be unique for the process lifetime. + * Windows uses API GetCurrentThreadId() + * Linux uses syscall SYS_gettid + * Mac uses syscall SYS_thread_selfid + * Android/QNX use gettid() + */ + CUPTI_ACTIVITY_THREAD_ID_TYPE_SYSTEM = 1, + + CUPTI_ACTIVITY_THREAD_ID_TYPE_FORCE_INT = 0x7fffffff +} CUpti_ActivityThreadIdType; + +/** + * \brief Get the CUPTI timestamp. + * + * Returns a timestamp normalized to correspond with the start and end + * timestamps reported in the CUPTI activity records. The timestamp is + * reported in nanoseconds. + * + * \param timestamp Returns the CUPTI timestamp + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL + */ +CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp); + +/** + * \brief Get the ID of a context. + * + * Get the ID of a context. + * + * \param context The context + * \param contextId Returns a process-unique ID for the context + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_CONTEXT The context is NULL or not valid. + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p contextId is NULL + */ +CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId); + +/** + * \brief Get the ID of a stream. + * + * Get the ID of a stream. The stream ID is unique within a context + * (i.e. all streams within a context will have unique stream + * IDs). + * + * \param context If non-NULL then the stream is checked to ensure + * that it belongs to this context. Typically this parameter should be + * null. + * \param stream The stream + * \param streamId Returns a context-unique ID for the stream + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or + * if \p context is non-NULL and \p stream does not belong to the + * context + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL + * + * **DEPRECATED** This method is deprecated as of CUDA 8.0. + * Use method cuptiGetStreamIdEx instead. + */ +CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId); + +/** +* \brief Get the ID of a stream. +* +* Get the ID of a stream. The stream ID is unique within a context +* (i.e. all streams within a context will have unique stream +* IDs). +* +* \param context If non-NULL then the stream is checked to ensure +* that it belongs to this context. Typically this parameter should be +* null. +* \param stream The stream +* \param perThreadStream Flag to indicate if program is compiled for per-thread streams +* \param streamId Returns a context-unique ID for the stream +* +* \retval CUPTI_SUCCESS +* \retval CUPTI_ERROR_NOT_INITIALIZED +* \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or +* if \p context is non-NULL and \p stream does not belong to the +* context +* \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL +*/ +CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId); + +/** + * \brief Get the ID of a device + * + * If \p context is NULL, returns the ID of the device that contains + * the currently active context. If \p context is non-NULL, returns + * the ID of the device which contains that context. Operates in a + * similar manner to cudaGetDevice() or cuCtxGetDevice() but may be + * called from within callback functions. + * + * \param context The context, or NULL to indicate the current context. + * \param deviceId Returns the ID of the device that is current for + * the calling thread. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_DEVICE if unable to get device ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p deviceId is NULL + */ +CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId); + +/** + * \brief Get the unique ID of a graph node + * + * Returns the unique ID of the CUDA graph node. + * + * \param node The graph node. + * \param nodeId Returns the unique ID of the node + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p node is NULL + */ +CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId); + +/** + * \brief Get the unique ID of graph + * + * Returns the unique ID of CUDA graph. + * + * \param graph The graph. + * \param pId Returns the unique ID of the graph + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graph is NULL + */ +CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId); + +/** + * \brief Enable collection of a specific kind of activity record. + * + * Enable collection of a specific kind of activity record. Multiple + * kinds can be enabled by calling this function multiple times. By + * default all activity kinds are disabled for collection. + * + * \param kind The kind of activity record to collect + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled + * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported + */ +CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind); + +/** + * \brief Enable collection of a specific kind of activity record. For certain activity kinds + * it dumps existing records. + * + * In general, the behavior of this API is similar to the API \ref cuptiActivityEnable i.e. it + * enables the collection of a specific kind of activity record. + * Additionally, this API can help in dumping the records for activities which happened in + * the past before enabling the corresponding activity kind. + * The API allows to get records for the current resource allocations done in CUDA + * For CUPTI_ACTIVITY_KIND_DEVICE, existing device records are dumped + * For CUPTI_ACTIVITY_KIND_CONTEXT, existing context records are dumped + * For CUPTI_ACTIVITY_KIND_STREAM, existing stream records are dumped + * For CUPTI_ACTIVITY_KIND_ NVLINK, existing NVLINK records are dumped + * For CUPTI_ACTIVITY_KIND_PCIE, existing PCIE records are dumped + * For other activities, the behavior is similar to the API \ref cuptiActivityEnable + * + * Device records are emitted in CUPTI on CUDA driver initialization. Those records + * can only be retrieved by the user if CUPTI is attached before CUDA initialization. + * Context and stream records are emitted on context and stream creation. + * The use case of the API is to provide the records for CUDA resources + * (contexs/streams/devices) that are currently active if user late attaches CUPTI. + * + * Before calling this function, the user must register buffer callbacks + * to get the activity records by calling \ref cuptiActivityRegisterCallbacks. + * If the user does not register the buffers and calls API \ref cuptiActivityEnableAndDump, + * then CUPTI will enable the activity kind but not provide any records for that + * activity kind. + * + * \param kind The kind of activity record to collect + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_UNKNOWN if buffer is not initialized. + * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled + * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported + */ +CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind); + +/** + * \brief Disable collection of a specific kind of activity record. + * + * Disable collection of a specific kind of activity record. Multiple + * kinds can be disabled by calling this function multiple times. By + * default all activity kinds are disabled for collection. + * + * \param kind The kind of activity record to stop collecting + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported + */ +CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind); + +/** + * \brief Enable collection of a specific kind of activity record for + * a context. + * + * Enable collection of a specific kind of activity record for a + * context. This setting done by this API will supersede the global + * settings for activity records enabled by \ref cuptiActivityEnable. + * Multiple kinds can be enabled by calling this function multiple + * times. + * + * \param context The context for which activity is to be enabled + * \param kind The kind of activity record to collect + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled + * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported + */ +CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind); + +/** + * \brief Disable collection of a specific kind of activity record for + * a context. + * + * Disable collection of a specific kind of activity record for a context. + * This setting done by this API will supersede the global settings + * for activity records. + * Multiple kinds can be enabled by calling this function multiple times. + * + * \param context The context for which activity is to be disabled + * \param kind The kind of activity record to stop collecting + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported + */ +CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind); + +/** + * \brief Get the number of activity records that were dropped of + * insufficient buffer space. + * + * Get the number of records that were dropped because of insufficient + * buffer space. The dropped count includes records that could not be + * recorded because CUPTI did not have activity buffer space available + * for the record (because the CUpti_BuffersCallbackRequestFunc + * callback did not return an empty buffer of sufficient size) and + * also CDP records that could not be record because the device-size + * buffer was full (size is controlled by the + * CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP attribute). The dropped + * count maintained for the queue is reset to zero when this function + * is called. + * + * \param context The context, or NULL to get dropped count from global queue + * \param streamId The stream ID + * \param dropped The number of records that were dropped since the last call + * to this function. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p dropped is NULL + */ +CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId, + size_t *dropped); + +/** + * \brief Iterate over the activity records in a buffer. + * + * This is a helper function to iterate over the activity records in a + * buffer. A buffer of activity records is typically obtained by + * receiving a CUpti_BuffersCallbackCompleteFunc callback. + * + * An example of typical usage: + * \code + * CUpti_Activity *record = NULL; + * CUptiResult status = CUPTI_SUCCESS; + * do { + * status = cuptiActivityGetNextRecord(buffer, validSize, &record); + * if(status == CUPTI_SUCCESS) { + * // Use record here... + * } + * else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) + * break; + * else { + * goto Error; + * } + * } while (1); + * \endcode + * + * \param buffer The buffer containing activity records + * \param record Inputs the previous record returned by + * cuptiActivityGetNextRecord and returns the next activity record + * from the buffer. If input value is NULL, returns the first activity + * record in the buffer. Records of kind CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL + * may contain invalid (0) timestamps, indicating that no timing information could + * be collected for lack of device memory. + * \param validBufferSizeBytes The number of valid bytes in the buffer. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if no more records in the buffer + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p buffer is NULL. + */ +CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes, + CUpti_Activity **record); + +/** + * \brief Function type for callback used by CUPTI to request an empty + * buffer for storing activity records. + * + * This callback function signals the CUPTI client that an activity + * buffer is needed by CUPTI. The activity buffer is used by CUPTI to + * store activity records. The callback function can decline the + * request by setting \p *buffer to NULL. In this case CUPTI may drop + * activity records. + * + * \param buffer Returns the new buffer. If set to NULL then no buffer + * is returned. + * \param size Returns the size of the returned buffer. + * \param maxNumRecords Returns the maximum number of records that + * should be placed in the buffer. If 0 then the buffer is filled with + * as many records as possible. If > 0 the buffer is filled with at + * most that many records before it is returned. + */ +typedef void (CUPTIAPI *CUpti_BuffersCallbackRequestFunc)( + uint8_t **buffer, + size_t *size, + size_t *maxNumRecords); + +/** + * \brief Function type for callback used by CUPTI to return a buffer + * of activity records. + * + * This callback function returns to the CUPTI client a buffer + * containing activity records. The buffer contains \p validSize + * bytes of activity records which should be read using + * cuptiActivityGetNextRecord. The number of dropped records can be + * read using cuptiActivityGetNumDroppedRecords. After this call CUPTI + * relinquished ownership of the buffer and will not use it + * anymore. The client may return the buffer to CUPTI using the + * CUpti_BuffersCallbackRequestFunc callback. + * Note: CUDA 6.0 onwards, all buffers returned by this callback are + * global buffers i.e. there is no context/stream specific buffer. + * User needs to parse the global buffer to extract the context/stream + * specific activity records. + * + * \param context The context this buffer is associated with. If NULL, the + * buffer is associated with the global activities. This field is deprecated + * as of CUDA 6.0 and will always be NULL. + * \param streamId The stream id this buffer is associated with. + * This field is deprecated as of CUDA 6.0 and will always be NULL. + * \param buffer The activity record buffer. + * \param size The total size of the buffer in bytes as set in + * CUpti_BuffersCallbackRequestFunc. + * \param validSize The number of valid bytes in the buffer. + */ +typedef void (CUPTIAPI *CUpti_BuffersCallbackCompleteFunc)( + CUcontext context, + uint32_t streamId, + uint8_t *buffer, + size_t size, + size_t validSize); + +/** + * \brief Registers callback functions with CUPTI for activity buffer + * handling. + * + * This function registers two callback functions to be used in asynchronous + * buffer handling. If registered, activity record buffers are handled using + * asynchronous requested/completed callbacks from CUPTI. + * + * Registering these callbacks prevents the client from using CUPTI's + * blocking enqueue/dequeue functions. + * + * \param funcBufferRequested callback which is invoked when an empty + * buffer is requested by CUPTI + * \param funcBufferCompleted callback which is invoked when a buffer + * containing activity records is available from CUPTI + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if either \p + * funcBufferRequested or \p funcBufferCompleted is NULL + */ +CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested, + CUpti_BuffersCallbackCompleteFunc funcBufferCompleted); + +/** + * \brief Wait for all activity records to be delivered via the + * completion callback. + * + * This function does not return until all activity records associated + * with the specified context/stream are returned to the CUPTI client + * using the callback registered in cuptiActivityRegisterCallbacks. To + * ensure that all activity records are complete, the requested + * stream(s), if any, are synchronized. + * + * If \p context is NULL, the global activity records (i.e. those not + * associated with a particular stream) are flushed (in this case no + * streams are synchonized). If \p context is a valid CUcontext and + * \p streamId is 0, the buffers of all streams of this context are + * flushed. Otherwise, the buffers of the specified stream in this + * context is flushed. + * + * Before calling this function, the buffer handling callback api + * must be activated by calling cuptiActivityRegisterCallbacks. + * + * \param context A valid CUcontext or NULL. + * \param streamId The stream ID. + * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_CUPTI_ERROR_INVALID_OPERATION if not preceeded + * by a successful call to cuptiActivityRegisterCallbacks + * \retval CUPTI_ERROR_UNKNOWN an internal error occurred + * + * **DEPRECATED** This method is deprecated + * CONTEXT and STREAMID will be ignored. Use cuptiActivityFlushAll + * to flush all data. + */ +CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag); + +/** + * \brief Request to deliver activity records via the buffer completion callback. + * + * This function returns the activity records associated with all contexts/streams + * (and the global buffers not associated with any stream) to the CUPTI client + * using the callback registered in cuptiActivityRegisterCallbacks. + * + * This is a blocking call but it doesn't issue any CUDA synchronization calls + * implicitly thus it's not guaranteed that all activities are completed on the + * underlying devices. Activity record is considered as completed if it has all + * the information filled up including the timestamps if any. It is the client's + * responsibility to issue necessary CUDA synchronization calls before calling + * this function if all activity records with complete information are expected + * to be delivered. + * + * Behavior of the function based on the input flag: + * - ::For default flush i.e. when flag is set as 0, it returns all the + * activity buffers which have all the activity records completed, buffers need not + * to be full though. It doesn't return buffers which have one or more incomplete + * records. Default flush can be done at a regular interval in a separate thread. + * - ::For forced flush i.e. when flag CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is passed + * to the function, it returns all the activity buffers including the ones which have + * one or more incomplete activity records. It's suggested for clients to do the + * force flush before the termination of the profiling session to allow remaining + * buffers to be delivered. In general, it can be done in the at-exit handler. + * + * Before calling this function, the buffer handling callback api must be activated + * by calling cuptiActivityRegisterCallbacks. + * + * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_OPERATION if not preceeded by a + * successful call to cuptiActivityRegisterCallbacks + * \retval CUPTI_ERROR_UNKNOWN an internal error occurred + * + * \see cuptiActivityFlushPeriod + */ +CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag); + +/** + * \brief Read an activity API attribute. + * + * Read an activity API attribute and return it in \p *value. + * + * \param attr The attribute to read + * \param valueSize Size of buffer pointed by the value, and + * returns the number of bytes written to \p value + * \param value Returns the value of the attribute + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or + * if \p attr is not an activity attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that + * the \p value buffer is too small to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr, + size_t *valueSize, void* value); + +/** + * \brief Write an activity API attribute. + * + * Write an activity API attribute. + * + * \param attr The attribute to write + * \param valueSize The size, in bytes, of the value + * \param value The attribute value to write + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or + * if \p attr is not an activity attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that + * the \p value buffer is too small to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr, + size_t *valueSize, void* value); + + +/** + * \brief Set Unified Memory Counter configuration. + * + * \param config A pointer to \ref CUpti_ActivityUnifiedMemoryCounterConfig structures + * containing Unified Memory counter configuration. + * \param count Number of Unified Memory counter configuration structures + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or + * any parameter in the \p config structures is not a valid value + * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED One potential reason is that + * platform (OS/arch) does not support the unified memory counters + * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE Indicates that the device + * does not support the unified memory counters + * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES Indicates that + * multi-GPU configuration without P2P support between any pair of devices + * does not support the unified memory counters + */ +CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count); + +/** + * \brief Get auto boost state + * + * The profiling results can be inconsistent in case auto boost is enabled. + * CUPTI tries to disable auto boost while profiling. It can fail to disable in + * cases where user does not have the permissions or CUDA_AUTO_BOOST env + * variable is set. The function can be used to query whether auto boost is + * enabled. + * + * \param context A valid CUcontext. + * \param state A pointer to \ref CUpti_ActivityAutoBoostState structure which + * contains the current state and the id of the process that has requested the + * current state + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p CUcontext or \p state is NULL + * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the device does not support auto boost + * \retval CUPTI_ERROR_UNKNOWN an internal error occurred + */ +CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state); + +/** + * \brief Set PC sampling configuration. + * + * For Pascal and older GPU architectures this API must be called before enabling + * activity kind CUPTI_ACTIVITY_KIND_PC_SAMPLING. There is no such requirement + * for Volta and newer GPU architectures. + * + * For Volta and newer GPU architectures if this API is called in the middle of + * execution, PC sampling configuration will be updated for subsequent kernel launches. + * + * \param ctx The context + * \param config A pointer to \ref CUpti_ActivityPCSamplingConfig structure + * containing PC sampling configuration. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_OPERATION if this api is called while + * some valid event collection method is set. + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or + * any parameter in the \p config structures is not a valid value + * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the system/device + * does not support the unified memory counters + */ +CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config); + +/** + * \brief Returns the last error from a cupti call or callback + * + * Returns the last error that has been produced by any of the cupti api calls + * or the callback in the same host thread and resets it to CUPTI_SUCCESS. + */ +CUptiResult CUPTIAPI cuptiGetLastError(void); + +/** + * \brief Set the thread-id type + * + * CUPTI uses the method corresponding to set type to generate the thread-id. + * See enum \ref CUpti_ActivityThreadIdType for the list of methods. + * Activity records having thread-id field contain the same value. + * Thread id type must not be changed during the profiling session to + * avoid thread-id value mismatch across activity records. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_SUPPORTED if \p type is not supported on the platform + */ +CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type); + +/** + * \brief Get the thread-id type + * + * Returns the thread-id type used in CUPTI + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p type is NULL + */ +CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type); + +/** +* \brief Check support for a compute capability +* +* This function is used to check the support for a device based on +* it's compute capability. It sets the \p support when the compute +* capability is supported by the current version of CUPTI, and clears +* it otherwise. This version of CUPTI might not support all GPUs sharing +* the same compute capability. It is suggested to use API \ref +* cuptiDeviceSupported which provides correct information. +* +* \param major The major revision number of the compute capability +* \param minor The minor revision number of the compute capability +* \param support Pointer to an integer to return the support status +* +* \retval CUPTI_SUCCESS +* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL +* +* \sa ::cuptiDeviceSupported +*/ +CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support); + +/** +* \brief Check support for a compute device +* +* This function is used to check the support for a compute device. +* It sets the \p support when the device is supported by the current +* version of CUPTI, and clears it otherwise. +* +* \param dev The device handle returned by CUDA Driver API cuDeviceGet +* \param support Pointer to an integer to return the support status +* +* \retval CUPTI_SUCCESS +* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL +* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device +* +* \sa ::cuptiComputeCapabilitySupported +*/ +CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support); + +/** + * This indicates the virtualization mode in which CUDA device is running + */ +typedef enum { + /** + * No virtualization mode isassociated with the device + * i.e. it's a baremetal GPU + */ + CUPTI_DEVICE_VIRTUALIZATION_MODE_NONE = 0, + /** + * The device is associated with the pass-through GPU. + * In this mode, an entire physical GPU is directly assigned + * to one virtual machine (VM). + */ + CUPTI_DEVICE_VIRTUALIZATION_MODE_PASS_THROUGH = 1, + /** + * The device is associated with the virtual GPU (vGPU). + * In this mode multiple virtual machines (VMs) have simultaneous, + * direct access to a single physical GPU. + */ + CUPTI_DEVICE_VIRTUALIZATION_MODE_VIRTUAL_GPU = 2, + + CUPTI_DEVICE_VIRTUALIZATION_MODE_FORCE_INT = 0x7fffffff +} CUpti_DeviceVirtualizationMode; + +/** + * \brief Query the virtualization mode of the device + * + * This function is used to query the virtualization mode of the CUDA device. + * + * \param dev The device handle returned by CUDA Driver API cuDeviceGet + * \param mode Pointer to an CUpti_DeviceVirtualizationMode to return the virtualization mode + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p mode is NULL + * + */ +CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode); + +/** + * \brief Detach CUPTI from the running process + * + * This API detaches the CUPTI from the running process. It destroys and cleans up all the + * resources associated with CUPTI in the current process. After CUPTI detaches from the process, + * the process will keep on running with no CUPTI attached to it. + * For safe operation of the API, it is recommended this API is invoked from the exit callsite + * of any of the CUDA Driver or Runtime API. Otherwise CUPTI client needs to make sure that + * required CUDA synchronization and CUPTI activity buffer flush is done before calling the API. + * Sample code showing the usage of the API in the cupti callback handler code: + * \code + void CUPTIAPI + cuptiCallbackHandler(void *userdata, CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, void *cbdata) + { + const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata; + + // Take this code path when CUPTI detach is requested + if (detachCupti) { + switch(domain) + { + case CUPTI_CB_DOMAIN_RUNTIME_API: + case CUPTI_CB_DOMAIN_DRIVER_API: + if (cbInfo->callbackSite == CUPTI_API_EXIT) { + // call the CUPTI detach API + cuptiFinalize(); + } + break; + default: + break; + } + } + } + \endcode + */ +CUptiResult CUPTIAPI cuptiFinalize(void); + +/** + * \brief Push an external correlation id for the calling thread + * + * This function notifies CUPTI that the calling thread is entering an external API region. + * When a CUPTI activity API record is created while within an external API region and + * CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION is enabled, the activity API record will + * be preceeded by a CUpti_ActivityExternalCorrelation record for each \ref CUpti_ExternalCorrelationKind. + * + * \param kind The kind of external API activities should be correlated with. + * \param id External correlation id. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid + */ +CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id); + +/** + * \brief Pop an external correlation id for the calling thread + * + * This function notifies CUPTI that the calling thread is leaving an external API region. + * + * \param kind The kind of external API activities should be correlated with. + * \param lastId If the function returns successful, contains the last external correlation id for this \p kind, can be NULL. + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid. + * \retval CUPTI_ERROR_QUEUE_EMPTY No external id is currently associated with \p kind. + */ +CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId); + +/** + * \brief Controls the collection of queued and submitted timestamps for kernels. + * + * This API is used to control the collection of queued and submitted timestamps + * for kernels whose records are provided through the struct \ref CUpti_ActivityKernel8. + * Default value is 0, i.e. these timestamps are not collected. This API needs + * to be called before initialization of CUDA and this setting should not be + * changed during the profiling session. + * + * \param enable is a boolean, denoting whether these timestamps should be + * collected + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + */ +CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable); + +/** + * \brief Sets the flush period for the worker thread + * + * CUPTI creates a worker thread to minimize the perturbance for the application created + * threads. CUPTI offloads certain operations from the application threads to the worker + * thread, this includes synchronization of profiling resources between host and device, + * delivery of the activity buffers to the client using the callback registered in + * cuptiActivityRegisterCallbacks. For performance reasons, CUPTI wakes up the worker + * thread based on certain heuristics. + * + * This API is used to control the flush period of the worker thread. This setting will + * override the CUPTI heurtistics. Setting time to zero disables the periodic flush and + * restores the default behavior. + * + * Periodic flush can return only those activity buffers which are full and have all the + * activity records completed. + * + * It's allowed to use the API \ref cuptiActivityFlushAll to flush the data on-demand, even + * when client sets the periodic flush. + * + * \param time flush period in msec + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * + * \see cuptiActivityFlushAll + */ +CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time); + +/** + * \brief Controls the collection of launch attributes for kernels. + * + * This API is used to control the collection of launch attributes for kernels whose + * records are provided through the struct \ref CUpti_ActivityKernel8. + * Default value is 0, i.e. these attributes are not collected. + * + * \param enable is a boolean denoting whether these launch attributes should be collected + */ +CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable); + +/** + * \brief Function type for callback used by CUPTI to request a timestamp + * to be used in activity records. + * + * This callback function signals the CUPTI client that a timestamp needs + * to be returned. This timestamp would be treated as normalized timestamp + * to be used for various purposes in CUPTI. For example to store start and + * end timestamps reported in the CUPTI activity records. + * The returned timestamp must be in nanoseconds. + * + * \sa ::cuptiActivityRegisterTimestampCallback + */ +typedef uint64_t (CUPTIAPI *CUpti_TimestampCallbackFunc)(void); + +/** + * \brief Registers callback function with CUPTI for providing timestamp. + * + * This function registers a callback function to obtain timestamp of user's + * choice instead of using CUPTI provided timestamp. + * By default CUPTI uses different methods, based on the underlying platform, + * to retrieve the timestamp + * Linux and Android use clock_gettime(CLOCK_REALTIME, ..) + * Windows uses QueryPerformanceCounter() + * Mac uses mach_absolute_time() + * QNX uses ClockCycles() + * Timestamps retrieved using these methods are converted to nanosecond if needed + * before usage. + * + * The registration of timestamp callback should be done before any of the CUPTI + * activity kinds are enabled to make sure that all the records report the timestamp using + * the callback function registered through cuptiActivityRegisterTimestampCallback API. + * + * Changing the timestamp callback function in CUPTI through + * cuptiActivityRegisterTimestampCallback API in the middle of the profiling + * session can cause records generated prior to the change to report + * timestamps through previous timestamp method. + * + * \param funcTimestamp callback which is invoked when a timestamp is + * needed by CUPTI + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcTimestamp is NULL + * \retval CUPTI_ERROR_NOT_INITIALIZED + */ +CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp); + +/** @} */ /* END CUPTI_ACTIVITY_API */ + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_ACTIVITY_H_*/ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h new file mode 100644 index 0000000000000000000000000000000000000000..d76394e8bc4c9dbbff8422eaa50651340639a546 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h @@ -0,0 +1,1371 @@ +/* + * Copyright 2010-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(_CUPTI_EVENTS_H_) +#define _CUPTI_EVENTS_H_ + +#include +#include +#include +#include + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \defgroup CUPTI_EVENT_API CUPTI Event API + * Functions, types, and enums that implement the CUPTI Event API. + * + * \note CUPTI event API from the header cupti_events.h are not supported on devices + * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures). + * These API will be deprecated in a future CUDA release. These are replaced by + * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API + * in the headers nvperf_host.h and nvperf_target.h which are supported on + * devices with compute capability 7.0 and higher (i.e. Volta and later GPU + * architectures). + * + * @{ + */ + +/** + * \brief ID for an event. + * + * An event represents a countable activity, action, or occurrence on + * the device. + */ +typedef uint32_t CUpti_EventID; + +/** + * \brief ID for an event domain. + * + * ID for an event domain. An event domain represents a group of + * related events. A device may have multiple instances of a domain, + * indicating that the device can simultaneously record multiple + * instances of each event within that domain. + */ +typedef uint32_t CUpti_EventDomainID; + +/** + * \brief A group of events. + * + * An event group is a collection of events that are managed + * together. All events in an event group must belong to the same + * domain. + */ +typedef void *CUpti_EventGroup; + +/** + * \brief Device class. + * + * Enumeration of device classes for device attribute + * CUPTI_DEVICE_ATTR_DEVICE_CLASS. + */ +typedef enum { + CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA = 0, + CUPTI_DEVICE_ATTR_DEVICE_CLASS_QUADRO = 1, + CUPTI_DEVICE_ATTR_DEVICE_CLASS_GEFORCE = 2, + CUPTI_DEVICE_ATTR_DEVICE_CLASS_TEGRA = 3, +} CUpti_DeviceAttributeDeviceClass; + +/** + * \brief Device attributes. + * + * CUPTI device attributes. These attributes can be read using \ref + * cuptiDeviceGetAttribute. + */ +typedef enum { + /** + * Number of event IDs for a device. Value is a uint32_t. + */ + CUPTI_DEVICE_ATTR_MAX_EVENT_ID = 1, + /** + * Number of event domain IDs for a device. Value is a uint32_t. + */ + CUPTI_DEVICE_ATTR_MAX_EVENT_DOMAIN_ID = 2, + /** + * Get global memory bandwidth in Kbytes/sec. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH = 3, + /** + * Get theoretical maximum number of instructions per cycle. Value + * is a uint32_t. + */ + CUPTI_DEVICE_ATTR_INSTRUCTION_PER_CYCLE = 4, + /** + * Get theoretical maximum number of single precision instructions + * that can be executed per second. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_INSTRUCTION_THROUGHPUT_SINGLE_PRECISION = 5, + /** + * Get number of frame buffers for device. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS = 6, + /** + * Get PCIE link rate in Mega bits/sec for device. Return 0 if bus-type + * is non-PCIE. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_PCIE_LINK_RATE = 7, + /** + * Get PCIE link width for device. Return 0 if bus-type + * is non-PCIE. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH = 8, + /** + * Get PCIE generation for device. Return 0 if bus-type + * is non-PCIE. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_PCIE_GEN = 9, + /** + * Get the class for the device. Value is a + * CUpti_DeviceAttributeDeviceClass. + */ + CUPTI_DEVICE_ATTR_DEVICE_CLASS = 10, + /** + * Get the peak single precision flop per cycle. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE = 11, + /** + * Get the peak double precision flop per cycle. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE = 12, + /** + * Get number of L2 units. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_MAX_L2_UNITS = 13, + /** + * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_SHARED + * preference. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_SHARED = 14, + /** + * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_L1 + * preference. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_L1 = 15, + /** + * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_EQUAL + * preference. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_EQUAL = 16, + /** + * Get the peak half precision flop per cycle. Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE = 17, + /** + * Check if Nvlink is connected to device. Returns 1, if at least one + * Nvlink is connected to the device, returns 0 otherwise. + * Value is a uint32_t. + */ + CUPTI_DEVICE_ATTR_NVLINK_PRESENT = 18, + /** + * Check if Nvlink is present between GPU and CPU. Returns Bandwidth, + * in Bytes/sec, if Nvlink is present, returns 0 otherwise. + * Value is a uint64_t. + */ + CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW = 19, + /** + * Check if NVSwitch is present in the underlying topology. + * Returns 1, if present, returns 0 otherwise. + * Value is a uint32_t. + */ + CUPTI_DEVICE_ATTR_NVSWITCH_PRESENT = 20, + CUPTI_DEVICE_ATTR_FORCE_INT = 0x7fffffff, +} CUpti_DeviceAttribute; + +/** + * \brief Event domain attributes. + * + * Event domain attributes. Except where noted, all the attributes can + * be read using either \ref cuptiDeviceGetEventDomainAttribute or + * \ref cuptiEventDomainGetAttribute. + */ +typedef enum { + /** + * Event domain name. Value is a null terminated const c-string. + */ + CUPTI_EVENT_DOMAIN_ATTR_NAME = 0, + /** + * Number of instances of the domain for which event counts will be + * collected. The domain may have additional instances that cannot + * be profiled (see CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT). + * Can be read only with \ref + * cuptiDeviceGetEventDomainAttribute. Value is a uint32_t. + */ + CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT = 1, + /** + * Total number of instances of the domain, including instances that + * cannot be profiled. Use CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT + * to get the number of instances that can be profiled. Can be read + * only with \ref cuptiDeviceGetEventDomainAttribute. Value is a + * uint32_t. + */ + CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT = 3, + /** + * Collection method used for events contained in the event domain. + * Value is a \ref CUpti_EventCollectionMethod. + */ + CUPTI_EVENT_DOMAIN_ATTR_COLLECTION_METHOD = 4, + + CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT = 0x7fffffff, +} CUpti_EventDomainAttribute; + +/** + * \brief The collection method used for an event. + * + * The collection method indicates how an event is collected. + */ +typedef enum { + /** + * Event is collected using a hardware global performance monitor. + */ + CUPTI_EVENT_COLLECTION_METHOD_PM = 0, + /** + * Event is collected using a hardware SM performance monitor. + */ + CUPTI_EVENT_COLLECTION_METHOD_SM = 1, + /** + * Event is collected using software instrumentation. + */ + CUPTI_EVENT_COLLECTION_METHOD_INSTRUMENTED = 2, + /** + * Event is collected using NvLink throughput counter method. + */ + CUPTI_EVENT_COLLECTION_METHOD_NVLINK_TC = 3, + CUPTI_EVENT_COLLECTION_METHOD_FORCE_INT = 0x7fffffff +} CUpti_EventCollectionMethod; + +/** + * \brief Event group attributes. + * + * Event group attributes. These attributes can be read using \ref + * cuptiEventGroupGetAttribute. Attributes marked [rw] can also be + * written using \ref cuptiEventGroupSetAttribute. + */ +typedef enum { + /** + * The domain to which the event group is bound. This attribute is + * set when the first event is added to the group. Value is a + * CUpti_EventDomainID. + */ + CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID = 0, + /** + * [rw] Profile all the instances of the domain for this + * eventgroup. This feature can be used to get load balancing + * across all instances of a domain. Value is an integer. + */ + CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES = 1, + /** + * [rw] Reserved for user data. + */ + CUPTI_EVENT_GROUP_ATTR_USER_DATA = 2, + /** + * Number of events in the group. Value is a uint32_t. + */ + CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS = 3, + /** + * Enumerates events in the group. Value is a pointer to buffer of + * size sizeof(CUpti_EventID) * num_of_events in the eventgroup. + * num_of_events can be queried using + * CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS. + */ + CUPTI_EVENT_GROUP_ATTR_EVENTS = 4, + /** + * Number of instances of the domain bound to this event group that + * will be counted. Value is a uint32_t. + */ + CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT = 5, + /** + * Event group scope can be set to CUPTI_EVENT_PROFILING_SCOPE_DEVICE or + * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT for an eventGroup, before + * adding any event. + * Sets the scope of eventgroup as CUPTI_EVENT_PROFILING_SCOPE_DEVICE or + * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT when the scope of the events + * that will be added is CUPTI_EVENT_PROFILING_SCOPE_BOTH. + * If profiling scope of event is either + * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or CUPTI_EVENT_PROFILING_SCOPE_CONTEXT + * then setting this attribute will not affect the default scope. + * It is not allowed to add events of different scope to same eventgroup. + * Value is a uint32_t. + */ + CUPTI_EVENT_GROUP_ATTR_PROFILING_SCOPE = 6, + CUPTI_EVENT_GROUP_ATTR_FORCE_INT = 0x7fffffff, +} CUpti_EventGroupAttribute; + +/** +* \brief Profiling scope for event. +* +* Profiling scope of event indicates if the event can be collected at context +* scope or device scope or both i.e. it can be collected at any of context or +* device scope. +*/ +typedef enum { + /** + * Event is collected at context scope. + */ + CUPTI_EVENT_PROFILING_SCOPE_CONTEXT = 0, + /** + * Event is collected at device scope. + */ + CUPTI_EVENT_PROFILING_SCOPE_DEVICE = 1, + /** + * Event can be collected at device or context scope. + * The scope can be set using \ref cuptiEventGroupSetAttribute API. + */ + CUPTI_EVENT_PROFILING_SCOPE_BOTH = 2, + CUPTI_EVENT_PROFILING_SCOPE_FORCE_INT = 0x7fffffff +} CUpti_EventProfilingScope; + +/** + * \brief Event attributes. + * + * Event attributes. These attributes can be read using \ref + * cuptiEventGetAttribute. + */ +typedef enum { + /** + * Event name. Value is a null terminated const c-string. + */ + CUPTI_EVENT_ATTR_NAME = 0, + /** + * Short description of event. Value is a null terminated const + * c-string. + */ + CUPTI_EVENT_ATTR_SHORT_DESCRIPTION = 1, + /** + * Long description of event. Value is a null terminated const + * c-string. + */ + CUPTI_EVENT_ATTR_LONG_DESCRIPTION = 2, + /** + * Category of event. Value is CUpti_EventCategory. + */ + CUPTI_EVENT_ATTR_CATEGORY = 3, + /** + * Profiling scope of the events. It can be either device or context or both. + * Value is a \ref CUpti_EventProfilingScope. + */ + CUPTI_EVENT_ATTR_PROFILING_SCOPE = 5, + + CUPTI_EVENT_ATTR_FORCE_INT = 0x7fffffff, +} CUpti_EventAttribute; + +/** + * \brief Event collection modes. + * + * The event collection mode determines the period over which the + * events within the enabled event groups will be collected. + */ +typedef enum { + /** + * Events are collected for the entire duration between the + * cuptiEventGroupEnable and cuptiEventGroupDisable calls. + * Event values are reset when the events are read. + * For CUDA toolkit v6.0 and older this was the default mode. + */ + CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS = 0, + /** + * Events are collected only for the durations of kernel executions + * that occur between the cuptiEventGroupEnable and + * cuptiEventGroupDisable calls. Event collection begins when a + * kernel execution begins, and stops when kernel execution + * completes. Event values are reset to zero when each kernel + * execution begins. If multiple kernel executions occur between the + * cuptiEventGroupEnable and cuptiEventGroupDisable calls then the + * event values must be read after each kernel launch if those + * events need to be associated with the specific kernel launch. + * Note that collection in this mode may significantly change the + * overall performance characteristics of the application because + * kernel executions that occur between the cuptiEventGroupEnable and + * cuptiEventGroupDisable calls are serialized on the GPU. + * This is the default mode from CUDA toolkit v6.5 + */ + CUPTI_EVENT_COLLECTION_MODE_KERNEL = 1, + CUPTI_EVENT_COLLECTION_MODE_FORCE_INT = 0x7fffffff +} CUpti_EventCollectionMode; + +/** + * \brief An event category. + * + * Each event is assigned to a category that represents the general + * type of the event. A event's category is accessed using \ref + * cuptiEventGetAttribute and the CUPTI_EVENT_ATTR_CATEGORY attribute. + */ +typedef enum { + /** + * An instruction related event. + */ + CUPTI_EVENT_CATEGORY_INSTRUCTION = 0, + /** + * A memory related event. + */ + CUPTI_EVENT_CATEGORY_MEMORY = 1, + /** + * A cache related event. + */ + CUPTI_EVENT_CATEGORY_CACHE = 2, + /** + * A profile-trigger event. + */ + CUPTI_EVENT_CATEGORY_PROFILE_TRIGGER = 3, + /** + * A system event. + */ + CUPTI_EVENT_CATEGORY_SYSTEM = 4, + CUPTI_EVENT_CATEGORY_FORCE_INT = 0x7fffffff +} CUpti_EventCategory; + +/** + * \brief The overflow value for a CUPTI event. + * + * The CUPTI event value that indicates an overflow. + */ +#define CUPTI_EVENT_OVERFLOW ((uint64_t)0xFFFFFFFFFFFFFFFFULL) + +/** + * \brief The value that indicates the event value is invalid + */ +#define CUPTI_EVENT_INVALID ((uint64_t)0xFFFFFFFFFFFFFFFEULL) + +/** + * \brief Flags for cuptiEventGroupReadEvent an + * cuptiEventGroupReadAllEvents. + * + * Flags for \ref cuptiEventGroupReadEvent an \ref + * cuptiEventGroupReadAllEvents. + */ +typedef enum { + /** + * No flags. + */ + CUPTI_EVENT_READ_FLAG_NONE = 0, + CUPTI_EVENT_READ_FLAG_FORCE_INT = 0x7fffffff, +} CUpti_ReadEventFlags; + + +/** + * \brief A set of event groups. + * + * A set of event groups. When returned by \ref + * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets + * a set indicates that event groups that can be enabled at the same + * time (i.e. all the events in the set can be collected + * simultaneously). + */ +typedef struct { + /** + * The number of event groups in the set. + */ + uint32_t numEventGroups; + /** + * An array of \p numEventGroups event groups. + */ + CUpti_EventGroup *eventGroups; +} CUpti_EventGroupSet; + +/** + * \brief A set of event group sets. + * + * A set of event group sets. When returned by \ref + * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets + * a CUpti_EventGroupSets indicates the number of passes required to + * collect all the events, and the event groups that should be + * collected during each pass. + */ +typedef struct { + /** + * Number of event group sets. + */ + uint32_t numSets; + /** + * An array of \p numSets event group sets. + */ + CUpti_EventGroupSet *sets; +} CUpti_EventGroupSets; + +/** + * \brief Set the event collection mode. + * + * Set the event collection mode for a \p context. The \p mode + * controls the event collection behavior of all events in event + * groups created in the \p context. This API is invalid in kernel + * replay mode. + * \note \b Thread-safety: this function is thread safe. + * + * \param context The context + * \param mode The event collection mode + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_CONTEXT + * \retval CUPTI_ERROR_INVALID_OPERATION if called when replay mode is enabled + * \retval CUPTI_ERROR_NOT_SUPPORTED if mode is not supported on the device + */ + +CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context, + CUpti_EventCollectionMode mode); + +/** + * \brief Read a device attribute. + * + * Read a device attribute and return it in \p *value. + * \note \b Thread-safety: this function is thread safe. + * + * \param device The CUDA device + * \param attrib The attribute to read + * \param valueSize Size of buffer pointed by the value, and + * returns the number of bytes written to \p value + * \param value Returns the value of the attribute + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_DEVICE + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value + * is NULL, or if \p attrib is not a device attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string + * attribute values, indicates that the \p value buffer is too small + * to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device, + CUpti_DeviceAttribute attrib, + size_t *valueSize, + void *value); + +/** + * \brief Read a device timestamp. + * + * Returns the device timestamp in \p *timestamp. The timestamp is + * reported in nanoseconds and indicates the time since the device was + * last reset. + * \note \b Thread-safety: this function is thread safe. + * + * \param context A context on the device from which to get the timestamp + * \param timestamp Returns the device timestamp + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_CONTEXT + * \retval CUPTI_ERROR_INVALID_PARAMETER is \p timestamp is NULL + + * **DEPRECATED** This API is deprecated as of CUDA 11.3 + */ +CUptiResult CUPTIAPI cuptiDeviceGetTimestamp(CUcontext context, + uint64_t *timestamp); + +/** + * \brief Get the number of domains for a device. + * + * Returns the number of domains in \p numDomains for a device. + * \note \b Thread-safety: this function is thread safe. + * + * \param device The CUDA device + * \param numDomains Returns the number of domains + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_DEVICE + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL + */ +CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device, + uint32_t *numDomains); + +/** + * \brief Get the event domains for a device. + * + * Returns the event domains IDs in \p domainArray for a device. The + * size of the \p domainArray buffer is given by \p + * *arraySizeBytes. The size of the \p domainArray buffer must be at + * least \p numdomains * sizeof(CUpti_EventDomainID) or else all + * domains will not be returned. The value returned in \p + * *arraySizeBytes contains the number of bytes returned in \p + * domainArray. + * \note \b Thread-safety: this function is thread safe. + * + * \param device The CUDA device + * \param arraySizeBytes The size of \p domainArray in bytes, and + * returns the number of bytes written to \p domainArray + * \param domainArray Returns the IDs of the event domains for the device + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_DEVICE + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or + * \p domainArray are NULL + */ +CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device, + size_t *arraySizeBytes, + CUpti_EventDomainID *domainArray); + +/** + * \brief Read an event domain attribute. + * + * Returns an event domain attribute in \p *value. The size of the \p + * value buffer is given by \p *valueSize. The value returned in \p + * *valueSize contains the number of bytes returned in \p value. + * + * If the attribute value is a c-string that is longer than \p + * *valueSize, then only the first \p *valueSize characters will be + * returned and there will be no terminating null byte. + * \note \b Thread-safety: this function is thread safe. + * + * \param device The CUDA device + * \param eventDomain ID of the event domain + * \param attrib The event domain attribute to read + * \param valueSize The size of the \p value buffer in bytes, and + * returns the number of bytes written to \p value + * \param value Returns the attribute's value + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_DEVICE + * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value + * is NULL, or if \p attrib is not an event domain attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string + * attribute values, indicates that the \p value buffer is too small + * to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device, + CUpti_EventDomainID eventDomain, + CUpti_EventDomainAttribute attrib, + size_t *valueSize, + void *value); + +/** + * \brief Get the number of event domains available on any device. + * + * Returns the total number of event domains available on any + * CUDA-capable device. + * \note \b Thread-safety: this function is thread safe. + * + * \param numDomains Returns the number of domains + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL + */ +CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains); + +/** + * \brief Get the event domains available on any device. + * + * Returns all the event domains available on any CUDA-capable device. + * Event domain IDs are returned in \p domainArray. The size of the \p + * domainArray buffer is given by \p *arraySizeBytes. The size of the + * \p domainArray buffer must be at least \p numDomains * + * sizeof(CUpti_EventDomainID) or all domains will not be + * returned. The value returned in \p *arraySizeBytes contains the + * number of bytes returned in \p domainArray. + * \note \b Thread-safety: this function is thread safe. + * + * \param arraySizeBytes The size of \p domainArray in bytes, and + * returns the number of bytes written to \p domainArray + * \param domainArray Returns all the event domains + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or + * \p domainArray are NULL + */ +CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes, + CUpti_EventDomainID *domainArray); + +/** + * \brief Read an event domain attribute. + * + * Returns an event domain attribute in \p *value. The size of the \p + * value buffer is given by \p *valueSize. The value returned in \p + * *valueSize contains the number of bytes returned in \p value. + * + * If the attribute value is a c-string that is longer than \p + * *valueSize, then only the first \p *valueSize characters will be + * returned and there will be no terminating null byte. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventDomain ID of the event domain + * \param attrib The event domain attribute to read + * \param valueSize The size of the \p value buffer in bytes, and + * returns the number of bytes written to \p value + * \param value Returns the attribute's value + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value + * is NULL, or if \p attrib is not an event domain attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string + * attribute values, indicates that the \p value buffer is too small + * to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain, + CUpti_EventDomainAttribute attrib, + size_t *valueSize, + void *value); + +/** + * \brief Get number of events in a domain. + * + * Returns the number of events in \p numEvents for a domain. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventDomain ID of the event domain + * \param numEvents Returns the number of events in the domain + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL + */ +CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain, + uint32_t *numEvents); + +/** + * \brief Get the events in a domain. + * + * Returns the event IDs in \p eventArray for a domain. The size of + * the \p eventArray buffer is given by \p *arraySizeBytes. The size + * of the \p eventArray buffer must be at least \p numdomainevents * + * sizeof(CUpti_EventID) or else all events will not be returned. The + * value returned in \p *arraySizeBytes contains the number of bytes + * returned in \p eventArray. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventDomain ID of the event domain + * \param arraySizeBytes The size of \p eventArray in bytes, and + * returns the number of bytes written to \p eventArray + * \param eventArray Returns the IDs of the events in the domain + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or \p + * eventArray are NULL + */ +CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain, + size_t *arraySizeBytes, + CUpti_EventID *eventArray); + +/** + * \brief Get an event attribute. + * + * Returns an event attribute in \p *value. The size of the \p + * value buffer is given by \p *valueSize. The value returned in \p + * *valueSize contains the number of bytes returned in \p value. + * + * If the attribute value is a c-string that is longer than \p + * *valueSize, then only the first \p *valueSize characters will be + * returned and there will be no terminating null byte. + * \note \b Thread-safety: this function is thread safe. + * + * \param event ID of the event + * \param attrib The event attribute to read + * \param valueSize The size of the \p value buffer in bytes, and + * returns the number of bytes written to \p value + * \param value Returns the attribute's value + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value + * is NULL, or if \p attrib is not an event attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string + * attribute values, indicates that the \p value buffer is too small + * to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event, + CUpti_EventAttribute attrib, + size_t *valueSize, + void *value); + +/** + * \brief Find an event by name. + * + * Find an event by name and return the event ID in \p *event. + * \note \b Thread-safety: this function is thread safe. + * + * \param device The CUDA device + * \param eventName The name of the event to find + * \param event Returns the ID of the found event or undefined if + * unable to find the event + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_DEVICE + * \retval CUPTI_ERROR_INVALID_EVENT_NAME if unable to find an event + * with name \p eventName. In this case \p *event is undefined + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventName or \p event are NULL + */ +CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device, + const char *eventName, + CUpti_EventID *event); + +/** + * \brief Create a new event group for a context. + * + * Creates a new event group for \p context and returns the new group + * in \p *eventGroup. + * \note \p flags are reserved for future use and should be set to zero. + * \note \b Thread-safety: this function is thread safe. + * + * \param context The context for the event group + * \param eventGroup Returns the new event group + * \param flags Reserved - must be zero + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_CONTEXT + * \retval CUPTI_ERROR_OUT_OF_MEMORY + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context, + CUpti_EventGroup *eventGroup, + uint32_t flags); + +/** + * \brief Destroy an event group. + * + * Destroy an \p eventGroup and free its resources. An event group + * cannot be destroyed if it is enabled. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group to destroy + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_OPERATION if the event group is enabled + * \retval CUPTI_ERROR_INVALID_PARAMETER if eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup); + +/** + * \brief Read an event group attribute. + * + * Read an event group attribute and return it in \p *value. + * \note \b Thread-safety: this function is thread safe but client + * must guard against simultaneous destruction or modification of \p + * eventGroup (for example, client must guard against simultaneous + * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent, + * etc.), and must guard against simultaneous destruction of the + * context in which \p eventGroup was created (for example, client + * must guard against simultaneous calls to cudaDeviceReset, + * cuCtxDestroy, etc.). + * + * \param eventGroup The event group + * \param attrib The attribute to read + * \param valueSize Size of buffer pointed by the value, and + * returns the number of bytes written to \p value + * \param value Returns the value of the attribute + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value + * is NULL, or if \p attrib is not an eventgroup attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string + * attribute values, indicates that the \p value buffer is too small + * to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup, + CUpti_EventGroupAttribute attrib, + size_t *valueSize, + void *value); + +/** + * \brief Write an event group attribute. + * + * Write an event group attribute. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group + * \param attrib The attribute to write + * \param valueSize The size, in bytes, of the value + * \param value The attribute value to write + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value + * is NULL, or if \p attrib is not an event group attribute, or if + * \p attrib is not a writable attribute + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that + * the \p value buffer is too small to hold the attribute value. + */ +CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup, + CUpti_EventGroupAttribute attrib, + size_t valueSize, + void *value); + +/** + * \brief Add an event to an event group. + * + * Add an event to an event group. The event add can fail for a number of reasons: + * \li The event group is enabled + * \li The event does not belong to the same event domain as the + * events that are already in the event group + * \li Device limitations on the events that can belong to the same group + * \li The event group is full + * + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group + * \param event The event to add to the group + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_ID + * \retval CUPTI_ERROR_OUT_OF_MEMORY + * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled + * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p event belongs to a + * different event domain than the events already in \p eventGroup, or + * if a device limitation prevents \p event from being collected at + * the same time as the events already in \p eventGroup + * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if \p eventGroup is full + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup, + CUpti_EventID event); + +/** + * \brief Remove an event from an event group. + * + * Remove \p event from the an event group. The event cannot be + * removed if the event group is enabled. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group + * \param event The event to remove from the group + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_ID + * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup, + CUpti_EventID event); + +/** + * \brief Remove all events from an event group. + * + * Remove all events from an event group. Events cannot be removed if + * the event group is enabled. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup); + +/** + * \brief Zero all the event counts in an event group. + * + * Zero all the event counts in an event group. + * \note \b Thread-safety: this function is thread safe but client + * must guard against simultaneous destruction or modification of \p + * eventGroup (for example, client must guard against simultaneous + * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent, + * etc.), and must guard against simultaneous destruction of the + * context in which \p eventGroup was created (for example, client + * must guard against simultaneous calls to cudaDeviceReset, + * cuCtxDestroy, etc.). + * + * \param eventGroup The event group + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup); + +/** + * \brief Enable an event group. + * + * Enable an event group. Enabling an event group zeros the value of + * all the events in the group and then starts collection of those + * events. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events + * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be + * enabled due to other already enabled event groups + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling + * and hardware is busy + */ +CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup); + +/** + * \brief Disable an event group. + * + * Disable an event group. Disabling an event group stops collection + * of events contained in the group. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroup The event group + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup); + +/** + * \brief Read the value for an event in an event group. + * + * Read the value for an event in an event group. The event value is + * returned in the \p eventValueBuffer buffer. \p + * eventValueBufferSizeBytes indicates the size of the \p + * eventValueBuffer buffer. The buffer must be at least sizeof(uint64) + * if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set + * on the group containing the event. The buffer must be at least + * (sizeof(uint64) * number of domain instances) if + * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is set on the + * group. + * + * If any instance of an event counter overflows, the value returned + * for that event instance will be ::CUPTI_EVENT_OVERFLOW. + * + * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE. + * + * Reading an event from a disabled event group is not allowed. After + * being read, an event's value is reset to zero. + * \note \b Thread-safety: this function is thread safe but client + * must guard against simultaneous destruction or modification of \p + * eventGroup (for example, client must guard against simultaneous + * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent, + * etc.), and must guard against simultaneous destruction of the + * context in which \p eventGroup was created (for example, client + * must guard against simultaneous calls to cudaDeviceReset, + * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is + * called simultaneously with this function, then returned event + * values are undefined. + * + * \param eventGroup The event group + * \param flags Flags controlling the reading mode + * \param event The event to read + * \param eventValueBufferSizeBytes The size of \p eventValueBuffer + * in bytes, and returns the number of bytes written to \p + * eventValueBuffer + * \param eventValueBuffer Returns the event value(s) + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_EVENT_ID + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p + * eventValueBufferSizeBytes or \p eventValueBuffer is NULL + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer + * is not sufficient + */ +CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup, + CUpti_ReadEventFlags flags, + CUpti_EventID event, + size_t *eventValueBufferSizeBytes, + uint64_t *eventValueBuffer); + +/** + * \brief Read the values for all the events in an event group. + * + * Read the values for all the events in an event group. The event + * values are returned in the \p eventValueBuffer buffer. \p + * eventValueBufferSizeBytes indicates the size of \p + * eventValueBuffer. The buffer must be at least (sizeof(uint64) * + * number of events in group) if + * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set on + * the group containing the events. The buffer must be at least + * (sizeof(uint64) * number of domain instances * number of events in + * group) if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is + * set on the group. + * + * The data format returned in \p eventValueBuffer is: + * - domain instance 0: event0 event1 ... eventN + * - domain instance 1: event0 event1 ... eventN + * - ... + * - domain instance M: event0 event1 ... eventN + * + * The event order in \p eventValueBuffer is returned in \p + * eventIdArray. The size of \p eventIdArray is specified in \p + * eventIdArraySizeBytes. The size should be at least + * (sizeof(CUpti_EventID) * number of events in group). + * + * If any instance of any event counter overflows, the value returned + * for that event instance will be ::CUPTI_EVENT_OVERFLOW. + * + * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE. + * + * Reading events from a disabled event group is not allowed. After + * being read, an event's value is reset to zero. + * \note \b Thread-safety: this function is thread safe but client + * must guard against simultaneous destruction or modification of \p + * eventGroup (for example, client must guard against simultaneous + * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent, + * etc.), and must guard against simultaneous destruction of the + * context in which \p eventGroup was created (for example, client + * must guard against simultaneous calls to cudaDeviceReset, + * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is + * called simultaneously with this function, then returned event + * values are undefined. + * + * \param eventGroup The event group + * \param flags Flags controlling the reading mode + * \param eventValueBufferSizeBytes The size of \p eventValueBuffer in + * bytes, and returns the number of bytes written to \p + * eventValueBuffer + * \param eventValueBuffer Returns the event values + * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes, + * and returns the number of bytes written to \p eventIdArray + * \param eventIdArray Returns the IDs of the events in the same order + * as the values return in eventValueBuffer. + * \param numEventIdsRead Returns the number of event IDs returned + * in \p eventIdArray + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p + * eventValueBufferSizeBytes, \p eventValueBuffer, \p + * eventIdArraySizeBytes, \p eventIdArray or \p numEventIdsRead is + * NULL + * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer + * or \p eventIdArray is not sufficient + */ +CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup eventGroup, + CUpti_ReadEventFlags flags, + size_t *eventValueBufferSizeBytes, + uint64_t *eventValueBuffer, + size_t *eventIdArraySizeBytes, + CUpti_EventID *eventIdArray, + size_t *numEventIdsRead); + +/** + * \brief For a set of events, get the grouping that indicates the + * number of passes and the event groups necessary to collect the + * events. + * + * The number of events that can be collected simultaneously varies by + * device and by the type of the events. When events can be collected + * simultaneously, they may need to be grouped into multiple event + * groups because they are from different event domains. This function + * takes a set of events and determines how many passes are required + * to collect all those events, and which events can be collected + * simultaneously in each pass. + * + * The CUpti_EventGroupSets returned in \p eventGroupPasses indicates + * how many passes are required to collect the events with the \p + * numSets field. Within each event group set, the \p sets array + * indicates the event groups that should be collected on each pass. + * \note \b Thread-safety: this function is thread safe, but client + * must guard against another thread simultaneously destroying \p + * context. + * + * \param context The context for event collection + * \param eventIdArraySizeBytes Size of \p eventIdArray in bytes + * \param eventIdArray Array of event IDs that need to be grouped + * \param eventGroupPasses Returns a CUpti_EventGroupSets object that + * indicates the number of passes required to collect the events and + * the events to collect on each pass + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_CONTEXT + * \retval CUPTI_ERROR_INVALID_EVENT_ID + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArray or + * \p eventGroupPasses is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context, + size_t eventIdArraySizeBytes, + CUpti_EventID *eventIdArray, + CUpti_EventGroupSets **eventGroupPasses); + +/** + * \brief Destroy a event group sets object. + * + * Destroy a CUpti_EventGroupSets object. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroupSets The object to destroy + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_INVALID_OPERATION if any of the event groups + * contained in the sets is enabled + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSets is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets); + + +/** + * \brief Enable an event group set. + * + * Enable a set of event groups. Enabling a set of event groups zeros the value of + * all the events in all the groups and then starts collection of those events. + * \note \b Thread-safety: this function is thread safe. + * + * \param eventGroupSet The pointer to the event group set + * + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events + * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be + * enabled due to other already enabled event groups + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL + * \retval CUPTI_ERROR_HARDWARE_BUSY if other client is profiling and hardware is + * busy + */ +CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet); + +/** + * \brief Disable an event group set. + * + * Disable a set of event groups. Disabling a set of event groups + * stops collection of events contained in the groups. + * \note \b Thread-safety: this function is thread safe. + * \note \b If this call fails, some of the event groups in the set may be disabled + * and other event groups may remain enabled. + * + * \param eventGroupSet The pointer to the event group set + * \retval CUPTI_SUCCESS + * \retval CUPTI_ERROR_NOT_INITIALIZED + * \retval CUPTI_ERROR_HARDWARE + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL + */ +CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet); + +/** + * \brief Enable kernel replay mode. + * + * Set profiling mode for the context to replay mode. In this mode, + * any number of events can be collected in one run of the kernel. The + * event collection mode will automatically switch to + * CUPTI_EVENT_COLLECTION_MODE_KERNEL. In this mode, \ref + * cuptiSetEventCollectionMode will return + * CUPTI_ERROR_INVALID_OPERATION. + * \note \b Kernels might take longer to run if many events are enabled. + * \note \b Thread-safety: this function is thread safe. + * + * \param context The context + * \retval CUPTI_SUCCESS + */ +CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context); + +/** + * \brief Disable kernel replay mode. + * + * Set profiling mode for the context to non-replay (default) + * mode. Event collection mode will be set to + * CUPTI_EVENT_COLLECTION_MODE_KERNEL. All previously enabled + * event groups and event group sets will be disabled. + * \note \b Thread-safety: this function is thread safe. + * + * \param context The context + * \retval CUPTI_SUCCESS + */ +CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context); + +/** + * \brief Function type for getting updates on kernel replay. + * + * \param kernelName The mangled kernel name + * \param numReplaysDone Number of replays done so far + * \param customData Pointer of any custom data passed in when subscribing + */ +typedef void (CUPTIAPI *CUpti_KernelReplayUpdateFunc)( + const char *kernelName, + int numReplaysDone, + void *customData); + +/** + * \brief Subscribe to kernel replay updates. + * + * When subscribed, the function pointer passed in will be called each time a + * kernel run is finished during kernel replay. Previously subscribed function + * pointer will be replaced. Pass in NULL as the function pointer unsubscribes + * the update. + * + * \param updateFunc The update function pointer + * \param customData Pointer to any custom data + * \retval CUPTI_SUCCESS + */ +CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData); + +/** @} */ /* END CUPTI_EVENT_API */ + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_EVENTS_H_*/ + + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h new file mode 100644 index 0000000000000000000000000000000000000000..9cb1ac2132b3d53bd67f39f1e4ebd85d3ea61465 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h @@ -0,0 +1,419 @@ +#if !defined(_CUPTI_PCSAMPLING_UTIL_H_) +#define _CUPTI_PCSAMPLING_UTIL_H_ + +#include +#include + +#ifndef CUPTIUTILAPI +#ifdef _WIN32 +#define CUPTIUTILAPI __stdcall +#else +#define CUPTIUTILAPI +#endif +#endif + +#define ACTIVITY_RECORD_ALIGNMENT 8 +#if defined(_WIN32) // Windows 32- and 64-bit +#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding +#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT)) +#define END_PACKED_ALIGNMENT __pragma(pack(pop)) +#elif defined(__GNUC__) // GCC +#define START_PACKED_ALIGNMENT +#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT))) +#define END_PACKED_ALIGNMENT +#else // all other compilers +#define START_PACKED_ALIGNMENT +#define PACKED_ALIGNMENT +#define END_PACKED_ALIGNMENT +#endif + +#ifndef CUPTI_UTIL_STRUCT_SIZE +#define CUPTI_UTIL_STRUCT_SIZE(type_, lastfield_) (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_)) +#endif + +#ifndef CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS +#define CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS(type, member, structSize) \ + (offsetof(type, member) < structSize) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) + #pragma GCC visibility push(default) +#endif + +namespace CUPTI { namespace PcSamplingUtil { + +/** + * \defgroup CUPTI_PCSAMPLING_UTILITY CUPTI PC Sampling Utility API + * Functions, types, and enums that implement the CUPTI PC Sampling Utility API. + * @{ + */ + +/** + * \brief Header info will be stored in file. + */ +typedef struct PACKED_ALIGNMENT { + /** + * Version of file format. + */ + uint32_t version; + /** + * Total number of buffers present in the file. + */ + uint32_t totalBuffers; +} Header; + +/** + * \brief BufferInfo will be stored in the file for every buffer + * i.e for every call of UtilDumpPcSamplingBufferInFile() API. + */ +typedef struct PACKED_ALIGNMENT { + /** + * Total number of PC records. + */ + uint64_t recordCount; + /** + * Count of all stall reasons supported on the GPU + */ + size_t numStallReasons; + /** + * Total number of stall reasons in single record. + */ + uint64_t numSelectedStallReasons; + /** + * Buffer size in Bytes. + */ + uint64_t bufferByteSize; +} BufferInfo; + +/** + * \brief All available stall reasons name and respective indexes + * will be stored in it. + */ +typedef struct PACKED_ALIGNMENT { + /** + * Number of all available stall reasons + */ + size_t numStallReasons; + /** + * Stall reasons names of all available stall reasons + */ + char **stallReasons; + /** + * Stall reason index of all available stall reasons + */ + uint32_t *stallReasonIndex; +} PcSamplingStallReasons; + +typedef enum { + /** + * Invalid buffer type. + */ + PC_SAMPLING_BUFFER_INVALID = 0, + /** + * Refers to CUpti_PCSamplingData buffer. + */ + PC_SAMPLING_BUFFER_PC_TO_COUNTER_DATA = 1 +} PcSamplingBufferType; + +/** + * \brief CUPTI PC sampling utility API result codes. + * + * Error and result codes returned by CUPTI PC sampling utility API. + */ +typedef enum { + /** + * No error + */ + CUPTI_UTIL_SUCCESS = 0, + /** + * One or more of the parameters are invalid. + */ + CUPTI_UTIL_ERROR_INVALID_PARAMETER = 1, + /** + * Unable to create a new file + */ + CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE = 2, + /** + * Unable to open a file + */ + CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE = 3, + /** + * Read or write operation failed + */ + CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED = 4, + /** + * Provided file handle is corrupted. + */ + CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED = 5, + /** + * seek operation failed. + */ + CUPTI_UTIL_ERROR_SEEK_OPERATION_FAILED = 6, + /** + * Unable to allocate enough memory to perform the requested + * operation. + */ + CUPTI_UTIL_ERROR_OUT_OF_MEMORY = 7, + /** + * An unknown internal error has occurred. + */ + CUPTI_UTIL_ERROR_UNKNOWN = 999, + CUPTI_UTIL_ERROR_FORCE_INT = 0x7fffffff +} CUptiUtilResult; + +/** + * \brief Params for \ref CuptiUtilPutPcSampData + */ +typedef struct { + /** + * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * Type of buffer to store in file + */ + PcSamplingBufferType bufferType; + /** + * PC sampling buffer. + */ + void *pSamplingData; + /** + * Number of configured attributes + */ + size_t numAttributes; + /** + * Refer \ref CUpti_PCSamplingConfigurationInfo + * It is expected to provide configuration details of at least + * CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON attribute. + */ + CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo; + /** + * Refer \ref PcSamplingStallReasons. + */ + PcSamplingStallReasons *pPcSamplingStallReasons; + /** + * File name to store buffer into it. + */ + const char* fileName; +} CUptiUtil_PutPcSampDataParams; +#define CUptiUtil_PutPcSampDataParamsSize CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_PutPcSampDataParams, fileName) + +/** + * \brief Dump PC sampling data into the file. + * + * This API can be called multiple times. + * It will append buffer in the file. + * For every buffer it will store BufferInfo + * so that before retrieving data it will help to allocate buffer + * to store retrieved data. + * This API creates file if file does not present. + * If stallReasonIndex or stallReasons pointer of \ref CUptiUtil_PutPcSampDataParams is NULL + * then stall reasons data will not be stored in file. + * It is expected to store all available stall reason data at least once to refer it during + * offline correlation. + * + * \retval CUPTI_UTIL_SUCCESS + * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid + * or if either of pSamplingData, pParams pointer is NULL or stall reason configuration details not provided + * or filename is empty. + * \retval CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE + * \retval CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE + * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED + */ +CUptiUtilResult CUPTIUTILAPI CuptiUtilPutPcSampData(CUptiUtil_PutPcSampDataParams *pParams); + +/** + * \brief Params for \ref CuptiUtilGetHeaderData + */ +typedef struct { + /** + * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * File handle. + */ + std::ifstream *fileHandler; + /** + * Header Info. + */ + Header headerInfo; + +} CUptiUtil_GetHeaderDataParams; +#define CUptiUtil_GetHeaderDataParamsSize CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetHeaderDataParams, headerInfo) + +/** + * \brief Get header data of file. + * + * This API must be called once initially while retrieving data from file. + * \ref Header structure, it gives info about total number + * of buffers present in the file. + * + * \retval CUPTI_UTIL_SUCCESS + * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParam or fileHandle is NULL or param struct size is incorrect. + * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file + * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file. + */ +CUptiUtilResult CUPTIUTILAPI CuptiUtilGetHeaderData(CUptiUtil_GetHeaderDataParams *pParams); + +/** + * \brief Params for \ref CuptiUtilGetBufferInfo + */ +typedef struct { + /** + * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * File handle. + */ + std::ifstream *fileHandler; + /** + * Buffer Info. + */ + BufferInfo bufferInfoData; +} CUptiUtil_GetBufferInfoParams; +#define CUptiUtil_GetBufferInfoParamsSize CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetBufferInfoParams, bufferInfoData) + +/** + * \brief Get buffer info data of file. + * + * This API must be called every time before calling CuptiUtilGetPcSampData API. + * \ref BufferInfo structure, it gives info about recordCount and stallReasonCount + * of every record in the buffer. This will help to allocate exact buffer to retrieve data into it. + * + * \retval CUPTI_UTIL_SUCCESS + * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParam or fileHandle is NULL or param struct size is incorrect. + * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file. + * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file. + */ +CUptiUtilResult CUPTIUTILAPI CuptiUtilGetBufferInfo(CUptiUtil_GetBufferInfoParams *pParams); + +/** + * \brief Params for \ref CuptiUtilGetPcSampData + */ +typedef struct { + /** + * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * File handle. + */ + std::ifstream *fileHandler; + /** + * Type of buffer to store in file + */ + PcSamplingBufferType bufferType; + /** + * Pointer to collected buffer info using \ref CuptiUtilGetBufferInfo + */ + BufferInfo *pBufferInfoData; + /** + * Pointer to allocated memory to store retrieved data from file. + */ + void *pSamplingData; + /** + * Number of configuration attributes + */ + size_t numAttributes; + /** + * Refer \ref CUpti_PCSamplingConfigurationInfo + */ + CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo; + /** + * Refer \ref PcSamplingStallReasons. + * For stallReasons field of \ref PcSamplingStallReasons it is expected to + * allocate memory for each string element of array. + */ + PcSamplingStallReasons *pPcSamplingStallReasons; +} CUptiUtil_GetPcSampDataParams; +#define CUptiUtil_GetPcSampDataParamsSize CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetPcSampDataParams, pPcSamplingStallReasons) + +/** + * \brief Retrieve PC sampling data from file into allocated buffer. + * + * This API must be called after CuptiUtilGetBufferInfo API. + * It will retrieve data from file into allocated buffer. + * + * \retval CUPTI_UTIL_SUCCESS + * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid + * or if either of pSampData, pParams is NULL. If pPcSamplingStallReasons is not NULL then + * error out if either of stallReasonIndex, stallReasons or stallReasons array element pointer is NULL. + * or filename is empty. + * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED + * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file. + */ +CUptiUtilResult CUPTIUTILAPI CuptiUtilGetPcSampData(CUptiUtil_GetPcSampDataParams *pParams); + +/** + * \brief Params for \ref CuptiUtilMergePcSampData + */ +typedef struct +{ + /** + * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize + * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are + * available in the structure. Used to preserve backward compatibility. + */ + size_t size; + /** + * Number of buffers to merge. + */ + size_t numberOfBuffers; + /** + * Pointer to array of buffers to merge + */ + CUpti_PCSamplingData *PcSampDataBuffer; + /** + * Pointer to array of merged buffers as per the range id. + */ + CUpti_PCSamplingData **MergedPcSampDataBuffers; + /** + * Number of merged buffers. + */ + size_t *numMergedBuffer; +} CUptiUtil_MergePcSampDataParams; +#define CUptiUtil_MergePcSampDataParamsSize CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_MergePcSampDataParams, numMergedBuffer) + +/** + * \brief Merge PC sampling data range id wise. + * + * This API merge PC sampling data range id wise. + * It allocates memory for merged data and fill data in it + * and provide buffer pointer in MergedPcSampDataBuffers field. + * It is expected from user to free merge data buffers after use. + * + * \retval CUPTI_UTIL_SUCCESS + * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if param struct size is invalid + * or count of buffers to merge is invalid i.e less than 1 + * or either of PcSampDataBuffer, MergedPcSampDataBuffers, numMergedBuffer is NULL + * \retval CUPTI_UTIL_ERROR_OUT_OF_MEMORY Unable to allocate memory for merged buffer. + */ +CUptiUtilResult CUPTIUTILAPI CuptiUtilMergePcSampData(CUptiUtil_MergePcSampDataParams *pParams); + +/** @} */ /* END CUPTI_PCSAMPLING_UTILITY */ + +} } + +#if defined(__GNUC__) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h new file mode 100644 index 0000000000000000000000000000000000000000..f2896451245f9ad325175330c6715b80bf639832 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h @@ -0,0 +1,328 @@ +/* + * Copyright 2010-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(_CUPTI_RESULT_H_) +#define _CUPTI_RESULT_H_ + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \defgroup CUPTI_RESULT_API CUPTI Result Codes + * Error and result codes returned by CUPTI functions. + * @{ + */ + +/** + * \brief CUPTI result codes. + * + * Error and result codes returned by CUPTI functions. + */ +typedef enum { + /** + * No error. + */ + CUPTI_SUCCESS = 0, + /** + * One or more of the parameters is invalid. + */ + CUPTI_ERROR_INVALID_PARAMETER = 1, + /** + * The device does not correspond to a valid CUDA device. + */ + CUPTI_ERROR_INVALID_DEVICE = 2, + /** + * The context is NULL or not valid. + */ + CUPTI_ERROR_INVALID_CONTEXT = 3, + /** + * The event domain id is invalid. + */ + CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID = 4, + /** + * The event id is invalid. + */ + CUPTI_ERROR_INVALID_EVENT_ID = 5, + /** + * The event name is invalid. + */ + CUPTI_ERROR_INVALID_EVENT_NAME = 6, + /** + * The current operation cannot be performed due to dependency on + * other factors. + */ + CUPTI_ERROR_INVALID_OPERATION = 7, + /** + * Unable to allocate enough memory to perform the requested + * operation. + */ + CUPTI_ERROR_OUT_OF_MEMORY = 8, + /** + * An error occurred on the performance monitoring hardware. + */ + CUPTI_ERROR_HARDWARE = 9, + /** + * The output buffer size is not sufficient to return all + * requested data. + */ + CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT = 10, + /** + * API is not implemented. + */ + CUPTI_ERROR_API_NOT_IMPLEMENTED = 11, + /** + * The maximum limit is reached. + */ + CUPTI_ERROR_MAX_LIMIT_REACHED = 12, + /** + * The object is not yet ready to perform the requested operation. + */ + CUPTI_ERROR_NOT_READY = 13, + /** + * The current operation is not compatible with the current state + * of the object + */ + CUPTI_ERROR_NOT_COMPATIBLE = 14, + /** + * CUPTI is unable to initialize its connection to the CUDA + * driver. + */ + CUPTI_ERROR_NOT_INITIALIZED = 15, + /** + * The metric id is invalid. + */ + CUPTI_ERROR_INVALID_METRIC_ID = 16, + /** + * The metric name is invalid. + */ + CUPTI_ERROR_INVALID_METRIC_NAME = 17, + /** + * The queue is empty. + */ + CUPTI_ERROR_QUEUE_EMPTY = 18, + /** + * Invalid handle (internal?). + */ + CUPTI_ERROR_INVALID_HANDLE = 19, + /** + * Invalid stream. + */ + CUPTI_ERROR_INVALID_STREAM = 20, + /** + * Invalid kind. + */ + CUPTI_ERROR_INVALID_KIND = 21, + /** + * Invalid event value. + */ + CUPTI_ERROR_INVALID_EVENT_VALUE = 22, + /** + * CUPTI is disabled due to conflicts with other enabled profilers + */ + CUPTI_ERROR_DISABLED = 23, + /** + * Invalid module. + */ + CUPTI_ERROR_INVALID_MODULE = 24, + /** + * Invalid metric value. + */ + CUPTI_ERROR_INVALID_METRIC_VALUE = 25, + /** + * The performance monitoring hardware is in use by other client. + */ + CUPTI_ERROR_HARDWARE_BUSY = 26, + /** + * The attempted operation is not supported on the current + * system or device. + */ + CUPTI_ERROR_NOT_SUPPORTED = 27, + /** + * Unified memory profiling is not supported on the system. + * Potential reason could be unsupported OS or architecture. + */ + CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED = 28, + /** + * Unified memory profiling is not supported on the device + */ + CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE = 29, + /** + * Unified memory profiling is not supported on a multi-GPU + * configuration without P2P support between any pair of devices + */ + CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES = 30, + /** + * Unified memory profiling is not supported under the + * Multi-Process Service (MPS) environment. CUDA 7.5 removes this + * restriction. + */ + CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_WITH_MPS = 31, + /** + * In CUDA 9.0, devices with compute capability 7.0 don't + * support CDP tracing + */ + CUPTI_ERROR_CDP_TRACING_NOT_SUPPORTED = 32, + /** + * Profiling on virtualized GPU is not supported. + */ + CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED = 33, + /** + * Profiling results might be incorrect for CUDA applications + * compiled with nvcc version older than 9.0 for devices with + * compute capability 6.0 and 6.1. + * Profiling session will continue and CUPTI will notify it using this error code. + * User is advised to recompile the application code with nvcc version 9.0 or later. + * Ignore this warning if code is already compiled with the recommended nvcc version. + */ + CUPTI_ERROR_CUDA_COMPILER_NOT_COMPATIBLE = 34, + /** + * User doesn't have sufficient privileges which are required to + * start the profiling session. + * One possible reason for this may be that the NVIDIA driver or your system + * administrator may have restricted access to the NVIDIA GPU performance counters. + * To learn how to resolve this issue and find more information, please visit + * https://developer.nvidia.com/CUPTI_ERROR_INSUFFICIENT_PRIVILEGES + */ + CUPTI_ERROR_INSUFFICIENT_PRIVILEGES = 35, + /** + * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and + * metric API from the header cupti_metrics.h are not compatible with the + * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API + * in the headers nvperf_host.h and nvperf_target.h. + */ + CUPTI_ERROR_OLD_PROFILER_API_INITIALIZED = 36, + /** + * Missing definition of the OpenACC API routine in the linked OpenACC library. + * + * One possible reason is that OpenACC library is linked statically in the + * user application, which might not have the definition of all the OpenACC + * API routines needed for the OpenACC profiling, as compiler might ignore + * definitions for the functions not used in the application. This issue + * can be mitigated by linking the OpenACC library dynamically. + */ + CUPTI_ERROR_OPENACC_UNDEFINED_ROUTINE = 37, + /** + * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and + * metric API from the header cupti_metrics.h are not supported on devices with + * compute capability 7.5 and higher (i.e. Turing and later GPU architectures). + * These API will be deprecated in a future CUDA release. These are replaced by + * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API + * in the headers nvperf_host.h and nvperf_target.h. + */ + CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED = 38, + /** + * CUPTI doesn't allow multiple callback subscribers. Only a single subscriber + * can be registered at a time. + * Same error code is used when application is launched using NVIDIA tools + * like nvprof, Visual Profiler, Nsight Systems, Nsight Compute, cuda-gdb and + * cuda-memcheck. + */ + CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED = 39, + /** + * Profiling on virtualized GPU is not allowed by hypervisor. + */ + CUPTI_ERROR_VIRTUALIZED_DEVICE_INSUFFICIENT_PRIVILEGES = 40, + /** + * Profiling and tracing are not allowed when confidential computing mode + * is enabled. + */ + CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED = 41, + /** + * CUPTI does not support NVIDIA Crypto Mining Processors (CMP). + * For more information, please visit https://developer.nvidia.com/ERR_NVCMPGPU + */ + CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED = 42, + /** + * An unknown internal error has occurred. + */ + CUPTI_ERROR_UNKNOWN = 999, + CUPTI_ERROR_FORCE_INT = 0x7fffffff +} CUptiResult; + +/** + * \brief Get the descriptive string for a CUptiResult. + * + * Return the descriptive string for a CUptiResult in \p *str. + * \note \b Thread-safety: this function is thread safe. + * + * \param result The result to get the string for + * \param str Returns the string + * + * \retval CUPTI_SUCCESS on success + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p + * result is not a valid CUptiResult + */ +CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result, const char **str); + +/** @} */ /* END CUPTI_RESULT_API */ + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_RESULT_H_*/ + + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h new file mode 100644 index 0000000000000000000000000000000000000000..dac73f5d586e4704bfaaccd7d75f6c857e91fff3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h @@ -0,0 +1,447 @@ + +// ************************************************************************* +// Definitions of indices for API functions, unique across entire API +// ************************************************************************* + +// This file is generated. Any changes you make will be lost during the next clean build. +// CUDA public interface, for type definitions and cu* function prototypes + +typedef enum CUpti_runtime_api_trace_cbid_enum { + CUPTI_RUNTIME_TRACE_CBID_INVALID = 0, + CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020 = 1, + CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020 = 2, + CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020 = 3, + CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020 = 4, + CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020 = 5, + CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020 = 6, + CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020 = 7, + CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020 = 8, + CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020 = 9, + CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 = 10, + CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020 = 11, + CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020 = 12, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 = 13, + CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020 = 14, + CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020 = 15, + CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 = 16, + CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 = 17, + CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020 = 18, + CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020 = 19, + CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020 = 20, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020 = 21, + CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020 = 22, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020 = 23, + CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020 = 24, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020 = 25, + CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020 = 26, + CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020 = 27, + CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020 = 28, + CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020 = 29, + CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020 = 30, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 = 31, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020 = 32, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020 = 33, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020 = 34, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020 = 35, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020 = 36, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020 = 37, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020 = 38, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020 = 39, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020 = 40, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020 = 41, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020 = 42, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020 = 43, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020 = 44, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020 = 45, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020 = 46, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020 = 47, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020 = 48, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020 = 49, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020 = 50, + CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020 = 51, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020 = 52, + CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020 = 53, + CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020 = 54, + CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020 = 55, + CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020 = 56, + CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020 = 57, + CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020 = 58, + CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020 = 59, + CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020 = 60, + CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020 = 61, + CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020 = 62, + CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020 = 63, + CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020 = 64, + CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020 = 65, + CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020 = 66, + CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020 = 67, + CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020 = 68, + CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020 = 69, + CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020 = 70, + CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020 = 71, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020 = 72, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020 = 73, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020 = 74, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020 = 75, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020 = 76, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020 = 77, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020 = 78, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020 = 79, + CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020 = 80, + CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020 = 81, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020 = 82, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020 = 83, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020 = 84, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020 = 85, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020 = 86, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020 = 87, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020 = 88, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020 = 89, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020 = 90, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020 = 91, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020 = 92, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020 = 93, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020 = 94, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020 = 95, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020 = 96, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020 = 97, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020 = 98, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020 = 99, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020 = 100, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020 = 101, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020 = 102, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020 = 103, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020 = 104, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020 = 105, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020 = 106, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020 = 107, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020 = 108, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020 = 109, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020 = 110, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020 = 111, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020 = 112, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020 = 113, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020 = 114, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020 = 115, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020 = 116, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020 = 117, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020 = 118, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020 = 119, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020 = 120, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020 = 121, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020 = 122, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020 = 123, + CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020 = 124, + CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020 = 125, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020 = 126, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020 = 127, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020 = 128, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020 = 129, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020 = 130, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020 = 131, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020 = 132, + CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 = 133, + CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 = 134, + CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 = 135, + CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 = 136, + CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020 = 137, + CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020 = 138, + CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020 = 139, + CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020 = 140, + CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020 = 141, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020 = 142, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020 = 143, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020 = 144, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020 = 145, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020 = 146, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020 = 147, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020 = 148, + CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020 = 149, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020 = 150, + CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000 = 151, + CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000 = 152, + CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000 = 153, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000 = 154, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000 = 155, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000 = 156, + CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000 = 157, + CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000 = 158, + CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000 = 159, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000 = 160, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000 = 161, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000 = 162, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000 = 163, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020 = 164, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020 = 165, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020 = 166, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020 = 167, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020 = 168, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020 = 169, + CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000 = 170, + CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000 = 171, + CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000 = 172, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010 = 173, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010 = 174, + CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010 = 175, + CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010 = 176, + CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010 = 177, + CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010 = 178, + CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010 = 179, + CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010 = 180, + CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010 = 181, + CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020 = 182, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020 = 183, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020 = 184, + CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000 = 185, + CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000 = 186, + CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000 = 187, + CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000 = 188, + CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000 = 189, + CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000 = 190, + CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000 = 191, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000 = 192, + CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000 = 193, + CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000 = 194, + CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000 = 195, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000 = 196, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000 = 197, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000 = 198, + CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000 = 199, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000 = 200, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050 = 201, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050 = 202, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050 = 203, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050 = 204, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050 = 205, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000 = 206, + CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000 = 207, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000 = 208, + CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050 = 209, + CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050 = 210, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 = 211, + CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000 = 212, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000 = 213, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000 = 214, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000 = 215, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000 = 216, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000 = 217, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000 = 218, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000 = 219, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000 = 220, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000 = 221, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000 = 222, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000 = 223, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000 = 224, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000 = 225, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000 = 226, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000 = 227, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000 = 228, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000 = 229, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000 = 230, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000 = 231, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000 = 232, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000 = 233, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000 = 234, + CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000 = 235, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000 = 236, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000 = 237, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000 = 238, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000 = 239, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000 = 240, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000 = 241, + CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000 = 242, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000 = 243, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000 = 244, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000 = 245, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000 = 246, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000 = 247, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000 = 248, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000 = 249, + CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000 = 250, + CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000 = 251, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000 = 252, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000 = 253, + CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000 = 254, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000 = 255, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000 = 256, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000 = 257, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000 = 258, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000 = 259, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000 = 260, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000 = 261, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000 = 262, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000 = 263, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000 = 264, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000 = 265, + CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000 = 266, + CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000 = 267, + CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000 = 268, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000 = 269, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000 = 270, + CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000 = 271, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000 = 272, + CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000 = 273, + CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000 = 274, + CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000 = 275, + CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000 = 276, + CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000 = 277, + CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000 = 278, + CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000 = 279, + CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000 = 280, + CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000 = 281, + CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000 = 282, + CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000 = 283, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000 = 284, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000 = 285, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000 = 286, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000 = 287, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000 = 288, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000 = 289, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000 = 290, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000 = 291, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000 = 292, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000 = 293, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000 = 294, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000 = 295, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000 = 296, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000 = 297, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000 = 298, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000 = 299, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000 = 300, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000 = 301, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000 = 302, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000 = 303, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000 = 304, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000 = 305, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000 = 306, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000 = 307, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000 = 308, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000 = 309, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000 = 310, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 = 311, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000 = 312, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000 = 313, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000 = 314, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000 = 315, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000 = 316, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000 = 317, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000 = 318, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000 = 319, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000 = 320, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000 = 321, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000 = 322, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000 = 323, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010 = 324, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010 = 325, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010 = 326, + CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010 = 327, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020 = 328, + CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200 = 329, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200 = 330, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200 = 331, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020 = 332, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020 = 333, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020 = 334, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020 = 335, + CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000 = 336, + CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000 = 337, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000 = 338, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000 = 339, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000 = 340, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000 = 341, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000 = 342, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000 = 343, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000 = 344, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000 = 345, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000 = 346, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010 = 347, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000 = 348, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000 = 349, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010 = 350, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010 = 351, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010 = 352, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010 = 353, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010 = 354, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010 = 355, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010 = 356, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010 = 357, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010 = 358, + CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010 = 359, + CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010 = 360, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010 = 361, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010 = 362, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010 = 363, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010 = 364, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010 = 365, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010 = 366, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010 = 367, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010 = 368, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010 = 369, + CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010 = 370, + CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010 = 371, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020 = 372, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020 = 373, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020 = 374, + CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020 = 375, + CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020 = 376, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020 = 377, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020 = 378, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020 = 379, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020 = 380, + CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020 = 381, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020 = 382, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020 = 383, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020 = 384, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020 = 385, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020 = 386, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020 = 387, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020 = 388, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020 = 389, + CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020 = 390, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020 = 391, + CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020 = 392, + CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020 = 393, + CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020 = 394, + CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020 = 395, + CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020 = 396, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020 = 397, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020 = 398, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020 = 399, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020 = 400, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020 = 401, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020 = 402, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020 = 403, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020 = 404, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030 = 405, + CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030 = 406, + CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030 = 407, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030 = 408, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030 = 409, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030 = 410, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030 = 411, + CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030 = 412, + CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030 = 413, + CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030 = 414, + CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030 = 415, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030 = 416, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030 = 417, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040 = 418, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040 = 419, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040 = 420, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040 = 421, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040 = 422, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040 = 423, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040 = 424, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040 = 425, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060 = 426, + CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060 = 427, + CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060 = 428, + CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060 = 429, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060 = 430, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060 = 431, + CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070 = 432, + CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070 = 433, + CUPTI_RUNTIME_TRACE_CBID_SIZE = 434, + CUPTI_RUNTIME_TRACE_CBID_FORCE_INT = 0x7fffffff +} CUpti_runtime_api_trace_cbid; + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h new file mode 100644 index 0000000000000000000000000000000000000000..e4b625d45c65288fa2ea7dc05819ee4dfc4cbdd3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h @@ -0,0 +1,43 @@ +#if !defined(_CUPTI_TARGET_H_) +#define _CUPTI_TARGET_H_ + +/* +CUPTI profiler target API's +This file contains the CUPTI profiling API's. +*/ +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +#ifndef CUPTI_PROFILER_STRUCT_SIZE +#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_) (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_)) +#endif + +typedef struct CUpti_Device_GetChipName_Params +{ + size_t structSize; //!< [in] + void* pPriv; //!< [in] assign to NULL + + size_t deviceIndex; //!< [in] + const char* pChipName; //!< [out] +} CUpti_Device_GetChipName_Params; + +#define CUpti_Device_GetChipName_Params_STRUCT_SIZE CUPTI_PROFILER_STRUCT_SIZE(CUpti_Device_GetChipName_Params, pChipName) +CUptiResult CUPTIAPI cuptiDeviceGetChipName(CUpti_Device_GetChipName_Params *pParams); + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h new file mode 100644 index 0000000000000000000000000000000000000000..0833be1c1894f53e4e9ded4585f485c4e96a3560 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h @@ -0,0 +1,130 @@ +/* + * Copyright 2010-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(_CUPTI_VERSION_H_) +#define _CUPTI_VERSION_H_ + +#include +#include + +#ifndef CUPTIAPI +#ifdef _WIN32 +#define CUPTIAPI __stdcall +#else +#define CUPTIAPI +#endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \defgroup CUPTI_VERSION_API CUPTI Version + * Function and macro to determine the CUPTI version. + * @{ + */ + +/** + * \brief The API version for this implementation of CUPTI. + * + * The API version for this implementation of CUPTI. This define along + * with \ref cuptiGetVersion can be used to dynamically detect if the + * version of CUPTI compiled against matches the version of the loaded + * CUPTI library. + * + * v1 : CUDAToolsSDK 4.0 + * v2 : CUDAToolsSDK 4.1 + * v3 : CUDA Toolkit 5.0 + * v4 : CUDA Toolkit 5.5 + * v5 : CUDA Toolkit 6.0 + * v6 : CUDA Toolkit 6.5 + * v7 : CUDA Toolkit 6.5(with sm_52 support) + * v8 : CUDA Toolkit 7.0 + * v9 : CUDA Toolkit 8.0 + * v10 : CUDA Toolkit 9.0 + * v11 : CUDA Toolkit 9.1 + * v12 : CUDA Toolkit 10.0, 10.1 and 10.2 + * v13 : CUDA Toolkit 11.0 + * v14 : CUDA Toolkit 11.1 + * v15 : CUDA Toolkit 11.2, 11.3 and 11.4 + * v16 : CUDA Toolkit 11.5 + * v17 : CUDA Toolkit 11.6 + * v18 : CUDA Toolkit 11.8 + */ +#define CUPTI_API_VERSION 18 + +/** + * \brief Get the CUPTI API version. + * + * Return the API version in \p *version. + * + * \param version Returns the version + * + * \retval CUPTI_SUCCESS on success + * \retval CUPTI_ERROR_INVALID_PARAMETER if \p version is NULL + * \sa CUPTI_API_VERSION + */ +CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version); + +/** @} */ /* END CUPTI_VERSION_API */ + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_VERSION_H_*/ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..aa6cf0390e339d33e543896395746685efa010c4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h @@ -0,0 +1,2941 @@ +// This file is generated. Any changes you make will be lost during the next clean build. + +// No dependent includes + +// CUDA public interface, for type definitions and cu* function prototypes +#include "cuda.h" + + +// ************************************************************************* +// Definitions of structs to hold parameters for each function +// ************************************************************************* + +typedef struct cuGetErrorString_params_st { + CUresult error; + const char **pStr; +} cuGetErrorString_params; + +typedef struct cuGetErrorName_params_st { + CUresult error; + const char **pStr; +} cuGetErrorName_params; + +typedef struct cuInit_params_st { + unsigned int Flags; +} cuInit_params; + +typedef struct cuDriverGetVersion_params_st { + int *driverVersion; +} cuDriverGetVersion_params; + +typedef struct cuDeviceGet_params_st { + CUdevice *device; + int ordinal; +} cuDeviceGet_params; + +typedef struct cuDeviceGetCount_params_st { + int *count; +} cuDeviceGetCount_params; + +typedef struct cuDeviceGetName_params_st { + char *name; + int len; + CUdevice dev; +} cuDeviceGetName_params; + +typedef struct cuDeviceGetUuid_params_st { + CUuuid *uuid; + CUdevice dev; +} cuDeviceGetUuid_params; + +typedef struct cuDeviceGetUuid_v2_params_st { + CUuuid *uuid; + CUdevice dev; +} cuDeviceGetUuid_v2_params; + +typedef struct cuDeviceGetLuid_params_st { + char *luid; + unsigned int *deviceNodeMask; + CUdevice dev; +} cuDeviceGetLuid_params; + +typedef struct cuDeviceTotalMem_v2_params_st { + size_t *bytes; + CUdevice dev; +} cuDeviceTotalMem_v2_params; + +typedef struct cuDeviceGetTexture1DLinearMaxWidth_params_st { + size_t *maxWidthInElements; + CUarray_format format; + unsigned numChannels; + CUdevice dev; +} cuDeviceGetTexture1DLinearMaxWidth_params; + +typedef struct cuDeviceGetAttribute_params_st { + int *pi; + CUdevice_attribute attrib; + CUdevice dev; +} cuDeviceGetAttribute_params; + +typedef struct cuDeviceGetNvSciSyncAttributes_params_st { + void *nvSciSyncAttrList; + CUdevice dev; + int flags; +} cuDeviceGetNvSciSyncAttributes_params; + +typedef struct cuDeviceSetMemPool_params_st { + CUdevice dev; + CUmemoryPool pool; +} cuDeviceSetMemPool_params; + +typedef struct cuDeviceGetMemPool_params_st { + CUmemoryPool *pool; + CUdevice dev; +} cuDeviceGetMemPool_params; + +typedef struct cuDeviceGetDefaultMemPool_params_st { + CUmemoryPool *pool_out; + CUdevice dev; +} cuDeviceGetDefaultMemPool_params; + +typedef struct cuFlushGPUDirectRDMAWrites_params_st { + CUflushGPUDirectRDMAWritesTarget target; + CUflushGPUDirectRDMAWritesScope scope; +} cuFlushGPUDirectRDMAWrites_params; + +typedef struct cuDeviceGetProperties_params_st { + CUdevprop *prop; + CUdevice dev; +} cuDeviceGetProperties_params; + +typedef struct cuDeviceComputeCapability_params_st { + int *major; + int *minor; + CUdevice dev; +} cuDeviceComputeCapability_params; + +typedef struct cuDevicePrimaryCtxRetain_params_st { + CUcontext *pctx; + CUdevice dev; +} cuDevicePrimaryCtxRetain_params; + +typedef struct cuDevicePrimaryCtxRelease_v2_params_st { + CUdevice dev; +} cuDevicePrimaryCtxRelease_v2_params; + +typedef struct cuDevicePrimaryCtxSetFlags_v2_params_st { + CUdevice dev; + unsigned int flags; +} cuDevicePrimaryCtxSetFlags_v2_params; + +typedef struct cuDevicePrimaryCtxGetState_params_st { + CUdevice dev; + unsigned int *flags; + int *active; +} cuDevicePrimaryCtxGetState_params; + +typedef struct cuDevicePrimaryCtxReset_v2_params_st { + CUdevice dev; +} cuDevicePrimaryCtxReset_v2_params; + +typedef struct cuDeviceGetExecAffinitySupport_params_st { + int *pi; + CUexecAffinityType type; + CUdevice dev; +} cuDeviceGetExecAffinitySupport_params; + +typedef struct cuCtxCreate_v2_params_st { + CUcontext *pctx; + unsigned int flags; + CUdevice dev; +} cuCtxCreate_v2_params; + +typedef struct cuCtxCreate_v3_params_st { + CUcontext *pctx; + CUexecAffinityParam *paramsArray; + int numParams; + unsigned int flags; + CUdevice dev; +} cuCtxCreate_v3_params; + +typedef struct cuCtxDestroy_v2_params_st { + CUcontext ctx; +} cuCtxDestroy_v2_params; + +typedef struct cuCtxPushCurrent_v2_params_st { + CUcontext ctx; +} cuCtxPushCurrent_v2_params; + +typedef struct cuCtxPopCurrent_v2_params_st { + CUcontext *pctx; +} cuCtxPopCurrent_v2_params; + +typedef struct cuCtxSetCurrent_params_st { + CUcontext ctx; +} cuCtxSetCurrent_params; + +typedef struct cuCtxGetCurrent_params_st { + CUcontext *pctx; +} cuCtxGetCurrent_params; + +typedef struct cuCtxGetDevice_params_st { + CUdevice *device; +} cuCtxGetDevice_params; + +typedef struct cuCtxGetFlags_params_st { + unsigned int *flags; +} cuCtxGetFlags_params; + +typedef struct cuCtxSetLimit_params_st { + CUlimit limit; + size_t value; +} cuCtxSetLimit_params; + +typedef struct cuCtxGetLimit_params_st { + size_t *pvalue; + CUlimit limit; +} cuCtxGetLimit_params; + +typedef struct cuCtxGetCacheConfig_params_st { + CUfunc_cache *pconfig; +} cuCtxGetCacheConfig_params; + +typedef struct cuCtxSetCacheConfig_params_st { + CUfunc_cache config; +} cuCtxSetCacheConfig_params; + +typedef struct cuCtxGetSharedMemConfig_params_st { + CUsharedconfig *pConfig; +} cuCtxGetSharedMemConfig_params; + +typedef struct cuCtxSetSharedMemConfig_params_st { + CUsharedconfig config; +} cuCtxSetSharedMemConfig_params; + +typedef struct cuCtxGetApiVersion_params_st { + CUcontext ctx; + unsigned int *version; +} cuCtxGetApiVersion_params; + +typedef struct cuCtxGetStreamPriorityRange_params_st { + int *leastPriority; + int *greatestPriority; +} cuCtxGetStreamPriorityRange_params; + +typedef struct cuCtxGetExecAffinity_params_st { + CUexecAffinityParam *pExecAffinity; + CUexecAffinityType type; +} cuCtxGetExecAffinity_params; + +typedef struct cuCtxAttach_params_st { + CUcontext *pctx; + unsigned int flags; +} cuCtxAttach_params; + +typedef struct cuCtxDetach_params_st { + CUcontext ctx; +} cuCtxDetach_params; + +typedef struct cuModuleLoad_params_st { + CUmodule *module; + const char *fname; +} cuModuleLoad_params; + +typedef struct cuModuleLoadData_params_st { + CUmodule *module; + const void *image; +} cuModuleLoadData_params; + +typedef struct cuModuleLoadDataEx_params_st { + CUmodule *module; + const void *image; + unsigned int numOptions; + CUjit_option *options; + void **optionValues; +} cuModuleLoadDataEx_params; + +typedef struct cuModuleLoadFatBinary_params_st { + CUmodule *module; + const void *fatCubin; +} cuModuleLoadFatBinary_params; + +typedef struct cuModuleUnload_params_st { + CUmodule hmod; +} cuModuleUnload_params; + +typedef struct cuModuleGetFunction_params_st { + CUfunction *hfunc; + CUmodule hmod; + const char *name; +} cuModuleGetFunction_params; + +typedef struct cuModuleGetGlobal_v2_params_st { + CUdeviceptr *dptr; + size_t *bytes; + CUmodule hmod; + const char *name; +} cuModuleGetGlobal_v2_params; + +typedef struct cuModuleGetTexRef_params_st { + CUtexref *pTexRef; + CUmodule hmod; + const char *name; +} cuModuleGetTexRef_params; + +typedef struct cuModuleGetSurfRef_params_st { + CUsurfref *pSurfRef; + CUmodule hmod; + const char *name; +} cuModuleGetSurfRef_params; + +typedef struct cuLinkCreate_v2_params_st { + unsigned int numOptions; + CUjit_option *options; + void **optionValues; + CUlinkState *stateOut; +} cuLinkCreate_v2_params; + +typedef struct cuLinkAddData_v2_params_st { + CUlinkState state; + CUjitInputType type; + void *data; + size_t size; + const char *name; + unsigned int numOptions; + CUjit_option *options; + void **optionValues; +} cuLinkAddData_v2_params; + +typedef struct cuLinkAddFile_v2_params_st { + CUlinkState state; + CUjitInputType type; + const char *path; + unsigned int numOptions; + CUjit_option *options; + void **optionValues; +} cuLinkAddFile_v2_params; + +typedef struct cuLinkComplete_params_st { + CUlinkState state; + void **cubinOut; + size_t *sizeOut; +} cuLinkComplete_params; + +typedef struct cuLinkDestroy_params_st { + CUlinkState state; +} cuLinkDestroy_params; + +typedef struct cuMemGetInfo_v2_params_st { + size_t *free; + size_t *total; +} cuMemGetInfo_v2_params; + +typedef struct cuMemAlloc_v2_params_st { + CUdeviceptr *dptr; + size_t bytesize; +} cuMemAlloc_v2_params; + +typedef struct cuMemAllocPitch_v2_params_st { + CUdeviceptr *dptr; + size_t *pPitch; + size_t WidthInBytes; + size_t Height; + unsigned int ElementSizeBytes; +} cuMemAllocPitch_v2_params; + +typedef struct cuMemFree_v2_params_st { + CUdeviceptr dptr; +} cuMemFree_v2_params; + +typedef struct cuMemGetAddressRange_v2_params_st { + CUdeviceptr *pbase; + size_t *psize; + CUdeviceptr dptr; +} cuMemGetAddressRange_v2_params; + +typedef struct cuMemAllocHost_v2_params_st { + void **pp; + size_t bytesize; +} cuMemAllocHost_v2_params; + +typedef struct cuMemFreeHost_params_st { + void *p; +} cuMemFreeHost_params; + +typedef struct cuMemHostAlloc_params_st { + void **pp; + size_t bytesize; + unsigned int Flags; +} cuMemHostAlloc_params; + +typedef struct cuMemHostGetDevicePointer_v2_params_st { + CUdeviceptr *pdptr; + void *p; + unsigned int Flags; +} cuMemHostGetDevicePointer_v2_params; + +typedef struct cuMemHostGetFlags_params_st { + unsigned int *pFlags; + void *p; +} cuMemHostGetFlags_params; + +typedef struct cuMemAllocManaged_params_st { + CUdeviceptr *dptr; + size_t bytesize; + unsigned int flags; +} cuMemAllocManaged_params; + +typedef struct cuDeviceGetByPCIBusId_params_st { + CUdevice *dev; + const char *pciBusId; +} cuDeviceGetByPCIBusId_params; + +typedef struct cuDeviceGetPCIBusId_params_st { + char *pciBusId; + int len; + CUdevice dev; +} cuDeviceGetPCIBusId_params; + +typedef struct cuIpcGetEventHandle_params_st { + CUipcEventHandle *pHandle; + CUevent event; +} cuIpcGetEventHandle_params; + +typedef struct cuIpcOpenEventHandle_params_st { + CUevent *phEvent; + CUipcEventHandle handle; +} cuIpcOpenEventHandle_params; + +typedef struct cuIpcGetMemHandle_params_st { + CUipcMemHandle *pHandle; + CUdeviceptr dptr; +} cuIpcGetMemHandle_params; + +typedef struct cuIpcOpenMemHandle_v2_params_st { + CUdeviceptr *pdptr; + CUipcMemHandle handle; + unsigned int Flags; +} cuIpcOpenMemHandle_v2_params; + +typedef struct cuIpcCloseMemHandle_params_st { + CUdeviceptr dptr; +} cuIpcCloseMemHandle_params; + +typedef struct cuMemHostRegister_v2_params_st { + void *p; + size_t bytesize; + unsigned int Flags; +} cuMemHostRegister_v2_params; + +typedef struct cuMemHostUnregister_params_st { + void *p; +} cuMemHostUnregister_params; + +typedef struct cuMemcpy_ptds_params_st { + CUdeviceptr dst; + CUdeviceptr src; + size_t ByteCount; +} cuMemcpy_ptds_params; + +typedef struct cuMemcpyPeer_ptds_params_st { + CUdeviceptr dstDevice; + CUcontext dstContext; + CUdeviceptr srcDevice; + CUcontext srcContext; + size_t ByteCount; +} cuMemcpyPeer_ptds_params; + +typedef struct cuMemcpyHtoD_v2_ptds_params_st { + CUdeviceptr dstDevice; + const void *srcHost; + size_t ByteCount; +} cuMemcpyHtoD_v2_ptds_params; + +typedef struct cuMemcpyDtoH_v2_ptds_params_st { + void *dstHost; + CUdeviceptr srcDevice; + size_t ByteCount; +} cuMemcpyDtoH_v2_ptds_params; + +typedef struct cuMemcpyDtoD_v2_ptds_params_st { + CUdeviceptr dstDevice; + CUdeviceptr srcDevice; + size_t ByteCount; +} cuMemcpyDtoD_v2_ptds_params; + +typedef struct cuMemcpyDtoA_v2_ptds_params_st { + CUarray dstArray; + size_t dstOffset; + CUdeviceptr srcDevice; + size_t ByteCount; +} cuMemcpyDtoA_v2_ptds_params; + +typedef struct cuMemcpyAtoD_v2_ptds_params_st { + CUdeviceptr dstDevice; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; +} cuMemcpyAtoD_v2_ptds_params; + +typedef struct cuMemcpyHtoA_v2_ptds_params_st { + CUarray dstArray; + size_t dstOffset; + const void *srcHost; + size_t ByteCount; +} cuMemcpyHtoA_v2_ptds_params; + +typedef struct cuMemcpyAtoH_v2_ptds_params_st { + void *dstHost; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; +} cuMemcpyAtoH_v2_ptds_params; + +typedef struct cuMemcpyAtoA_v2_ptds_params_st { + CUarray dstArray; + size_t dstOffset; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; +} cuMemcpyAtoA_v2_ptds_params; + +typedef struct cuMemcpy2D_v2_ptds_params_st { + const CUDA_MEMCPY2D *pCopy; +} cuMemcpy2D_v2_ptds_params; + +typedef struct cuMemcpy2DUnaligned_v2_ptds_params_st { + const CUDA_MEMCPY2D *pCopy; +} cuMemcpy2DUnaligned_v2_ptds_params; + +typedef struct cuMemcpy3D_v2_ptds_params_st { + const CUDA_MEMCPY3D *pCopy; +} cuMemcpy3D_v2_ptds_params; + +typedef struct cuMemcpy3DPeer_ptds_params_st { + const CUDA_MEMCPY3D_PEER *pCopy; +} cuMemcpy3DPeer_ptds_params; + +typedef struct cuMemcpyAsync_ptsz_params_st { + CUdeviceptr dst; + CUdeviceptr src; + size_t ByteCount; + CUstream hStream; +} cuMemcpyAsync_ptsz_params; + +typedef struct cuMemcpyPeerAsync_ptsz_params_st { + CUdeviceptr dstDevice; + CUcontext dstContext; + CUdeviceptr srcDevice; + CUcontext srcContext; + size_t ByteCount; + CUstream hStream; +} cuMemcpyPeerAsync_ptsz_params; + +typedef struct cuMemcpyHtoDAsync_v2_ptsz_params_st { + CUdeviceptr dstDevice; + const void *srcHost; + size_t ByteCount; + CUstream hStream; +} cuMemcpyHtoDAsync_v2_ptsz_params; + +typedef struct cuMemcpyDtoHAsync_v2_ptsz_params_st { + void *dstHost; + CUdeviceptr srcDevice; + size_t ByteCount; + CUstream hStream; +} cuMemcpyDtoHAsync_v2_ptsz_params; + +typedef struct cuMemcpyDtoDAsync_v2_ptsz_params_st { + CUdeviceptr dstDevice; + CUdeviceptr srcDevice; + size_t ByteCount; + CUstream hStream; +} cuMemcpyDtoDAsync_v2_ptsz_params; + +typedef struct cuMemcpyHtoAAsync_v2_ptsz_params_st { + CUarray dstArray; + size_t dstOffset; + const void *srcHost; + size_t ByteCount; + CUstream hStream; +} cuMemcpyHtoAAsync_v2_ptsz_params; + +typedef struct cuMemcpyAtoHAsync_v2_ptsz_params_st { + void *dstHost; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; + CUstream hStream; +} cuMemcpyAtoHAsync_v2_ptsz_params; + +typedef struct cuMemcpy2DAsync_v2_ptsz_params_st { + const CUDA_MEMCPY2D *pCopy; + CUstream hStream; +} cuMemcpy2DAsync_v2_ptsz_params; + +typedef struct cuMemcpy3DAsync_v2_ptsz_params_st { + const CUDA_MEMCPY3D *pCopy; + CUstream hStream; +} cuMemcpy3DAsync_v2_ptsz_params; + +typedef struct cuMemcpy3DPeerAsync_ptsz_params_st { + const CUDA_MEMCPY3D_PEER *pCopy; + CUstream hStream; +} cuMemcpy3DPeerAsync_ptsz_params; + +typedef struct cuMemsetD8_v2_ptds_params_st { + CUdeviceptr dstDevice; + unsigned char uc; + size_t N; +} cuMemsetD8_v2_ptds_params; + +typedef struct cuMemsetD16_v2_ptds_params_st { + CUdeviceptr dstDevice; + unsigned short us; + size_t N; +} cuMemsetD16_v2_ptds_params; + +typedef struct cuMemsetD32_v2_ptds_params_st { + CUdeviceptr dstDevice; + unsigned int ui; + size_t N; +} cuMemsetD32_v2_ptds_params; + +typedef struct cuMemsetD2D8_v2_ptds_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned char uc; + size_t Width; + size_t Height; +} cuMemsetD2D8_v2_ptds_params; + +typedef struct cuMemsetD2D16_v2_ptds_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned short us; + size_t Width; + size_t Height; +} cuMemsetD2D16_v2_ptds_params; + +typedef struct cuMemsetD2D32_v2_ptds_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned int ui; + size_t Width; + size_t Height; +} cuMemsetD2D32_v2_ptds_params; + +typedef struct cuMemsetD8Async_ptsz_params_st { + CUdeviceptr dstDevice; + unsigned char uc; + size_t N; + CUstream hStream; +} cuMemsetD8Async_ptsz_params; + +typedef struct cuMemsetD16Async_ptsz_params_st { + CUdeviceptr dstDevice; + unsigned short us; + size_t N; + CUstream hStream; +} cuMemsetD16Async_ptsz_params; + +typedef struct cuMemsetD32Async_ptsz_params_st { + CUdeviceptr dstDevice; + unsigned int ui; + size_t N; + CUstream hStream; +} cuMemsetD32Async_ptsz_params; + +typedef struct cuMemsetD2D8Async_ptsz_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned char uc; + size_t Width; + size_t Height; + CUstream hStream; +} cuMemsetD2D8Async_ptsz_params; + +typedef struct cuMemsetD2D16Async_ptsz_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned short us; + size_t Width; + size_t Height; + CUstream hStream; +} cuMemsetD2D16Async_ptsz_params; + +typedef struct cuMemsetD2D32Async_ptsz_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned int ui; + size_t Width; + size_t Height; + CUstream hStream; +} cuMemsetD2D32Async_ptsz_params; + +typedef struct cuArrayCreate_v2_params_st { + CUarray *pHandle; + const CUDA_ARRAY_DESCRIPTOR *pAllocateArray; +} cuArrayCreate_v2_params; + +typedef struct cuArrayGetDescriptor_v2_params_st { + CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor; + CUarray hArray; +} cuArrayGetDescriptor_v2_params; + +typedef struct cuArrayGetSparseProperties_params_st { + CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties; + CUarray array; +} cuArrayGetSparseProperties_params; + +typedef struct cuMipmappedArrayGetSparseProperties_params_st { + CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties; + CUmipmappedArray mipmap; +} cuMipmappedArrayGetSparseProperties_params; + +typedef struct cuArrayGetMemoryRequirements_params_st { + CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements; + CUarray array; + CUdevice device; +} cuArrayGetMemoryRequirements_params; + +typedef struct cuMipmappedArrayGetMemoryRequirements_params_st { + CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements; + CUmipmappedArray mipmap; + CUdevice device; +} cuMipmappedArrayGetMemoryRequirements_params; + +typedef struct cuArrayGetPlane_params_st { + CUarray *pPlaneArray; + CUarray hArray; + unsigned int planeIdx; +} cuArrayGetPlane_params; + +typedef struct cuArrayDestroy_params_st { + CUarray hArray; +} cuArrayDestroy_params; + +typedef struct cuArray3DCreate_v2_params_st { + CUarray *pHandle; + const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray; +} cuArray3DCreate_v2_params; + +typedef struct cuArray3DGetDescriptor_v2_params_st { + CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor; + CUarray hArray; +} cuArray3DGetDescriptor_v2_params; + +typedef struct cuMipmappedArrayCreate_params_st { + CUmipmappedArray *pHandle; + const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc; + unsigned int numMipmapLevels; +} cuMipmappedArrayCreate_params; + +typedef struct cuMipmappedArrayGetLevel_params_st { + CUarray *pLevelArray; + CUmipmappedArray hMipmappedArray; + unsigned int level; +} cuMipmappedArrayGetLevel_params; + +typedef struct cuMipmappedArrayDestroy_params_st { + CUmipmappedArray hMipmappedArray; +} cuMipmappedArrayDestroy_params; + +typedef struct cuMemAddressReserve_params_st { + CUdeviceptr *ptr; + size_t size; + size_t alignment; + CUdeviceptr addr; + unsigned long long flags; +} cuMemAddressReserve_params; + +typedef struct cuMemAddressFree_params_st { + CUdeviceptr ptr; + size_t size; +} cuMemAddressFree_params; + +typedef struct cuMemCreate_params_st { + CUmemGenericAllocationHandle *handle; + size_t size; + const CUmemAllocationProp *prop; + unsigned long long flags; +} cuMemCreate_params; + +typedef struct cuMemRelease_params_st { + CUmemGenericAllocationHandle handle; +} cuMemRelease_params; + +typedef struct cuMemMap_params_st { + CUdeviceptr ptr; + size_t size; + size_t offset; + CUmemGenericAllocationHandle handle; + unsigned long long flags; +} cuMemMap_params; + +typedef struct cuMemMapArrayAsync_ptsz_params_st { + CUarrayMapInfo *mapInfoList; + unsigned int count; + CUstream hStream; +} cuMemMapArrayAsync_ptsz_params; + +typedef struct cuMemUnmap_params_st { + CUdeviceptr ptr; + size_t size; +} cuMemUnmap_params; + +typedef struct cuMemSetAccess_params_st { + CUdeviceptr ptr; + size_t size; + const CUmemAccessDesc *desc; + size_t count; +} cuMemSetAccess_params; + +typedef struct cuMemGetAccess_params_st { + unsigned long long *flags; + const CUmemLocation *location; + CUdeviceptr ptr; +} cuMemGetAccess_params; + +typedef struct cuMemExportToShareableHandle_params_st { + void *shareableHandle; + CUmemGenericAllocationHandle handle; + CUmemAllocationHandleType handleType; + unsigned long long flags; +} cuMemExportToShareableHandle_params; + +typedef struct cuMemImportFromShareableHandle_params_st { + CUmemGenericAllocationHandle *handle; + void *osHandle; + CUmemAllocationHandleType shHandleType; +} cuMemImportFromShareableHandle_params; + +typedef struct cuMemGetAllocationGranularity_params_st { + size_t *granularity; + const CUmemAllocationProp *prop; + CUmemAllocationGranularity_flags option; +} cuMemGetAllocationGranularity_params; + +typedef struct cuMemGetAllocationPropertiesFromHandle_params_st { + CUmemAllocationProp *prop; + CUmemGenericAllocationHandle handle; +} cuMemGetAllocationPropertiesFromHandle_params; + +typedef struct cuMemRetainAllocationHandle_params_st { + CUmemGenericAllocationHandle *handle; + void *addr; +} cuMemRetainAllocationHandle_params; + +typedef struct cuMemFreeAsync_ptsz_params_st { + CUdeviceptr dptr; + CUstream hStream; +} cuMemFreeAsync_ptsz_params; + +typedef struct cuMemAllocAsync_ptsz_params_st { + CUdeviceptr *dptr; + size_t bytesize; + CUstream hStream; +} cuMemAllocAsync_ptsz_params; + +typedef struct cuMemPoolTrimTo_params_st { + CUmemoryPool pool; + size_t minBytesToKeep; +} cuMemPoolTrimTo_params; + +typedef struct cuMemPoolSetAttribute_params_st { + CUmemoryPool pool; + CUmemPool_attribute attr; + void *value; +} cuMemPoolSetAttribute_params; + +typedef struct cuMemPoolGetAttribute_params_st { + CUmemoryPool pool; + CUmemPool_attribute attr; + void *value; +} cuMemPoolGetAttribute_params; + +typedef struct cuMemPoolSetAccess_params_st { + CUmemoryPool pool; + const CUmemAccessDesc *map; + size_t count; +} cuMemPoolSetAccess_params; + +typedef struct cuMemPoolGetAccess_params_st { + CUmemAccess_flags *flags; + CUmemoryPool memPool; + CUmemLocation *location; +} cuMemPoolGetAccess_params; + +typedef struct cuMemPoolCreate_params_st { + CUmemoryPool *pool; + const CUmemPoolProps *poolProps; +} cuMemPoolCreate_params; + +typedef struct cuMemPoolDestroy_params_st { + CUmemoryPool pool; +} cuMemPoolDestroy_params; + +typedef struct cuMemAllocFromPoolAsync_ptsz_params_st { + CUdeviceptr *dptr; + size_t bytesize; + CUmemoryPool pool; + CUstream hStream; +} cuMemAllocFromPoolAsync_ptsz_params; + +typedef struct cuMemPoolExportToShareableHandle_params_st { + void *handle_out; + CUmemoryPool pool; + CUmemAllocationHandleType handleType; + unsigned long long flags; +} cuMemPoolExportToShareableHandle_params; + +typedef struct cuMemPoolImportFromShareableHandle_params_st { + CUmemoryPool *pool_out; + void *handle; + CUmemAllocationHandleType handleType; + unsigned long long flags; +} cuMemPoolImportFromShareableHandle_params; + +typedef struct cuMemPoolExportPointer_params_st { + CUmemPoolPtrExportData *shareData_out; + CUdeviceptr ptr; +} cuMemPoolExportPointer_params; + +typedef struct cuMemPoolImportPointer_params_st { + CUdeviceptr *ptr_out; + CUmemoryPool pool; + CUmemPoolPtrExportData *shareData; +} cuMemPoolImportPointer_params; + +typedef struct cuPointerGetAttribute_params_st { + void *data; + CUpointer_attribute attribute; + CUdeviceptr ptr; +} cuPointerGetAttribute_params; + +typedef struct cuMemPrefetchAsync_ptsz_params_st { + CUdeviceptr devPtr; + size_t count; + CUdevice dstDevice; + CUstream hStream; +} cuMemPrefetchAsync_ptsz_params; + +typedef struct cuMemAdvise_params_st { + CUdeviceptr devPtr; + size_t count; + CUmem_advise advice; + CUdevice device; +} cuMemAdvise_params; + +typedef struct cuMemRangeGetAttribute_params_st { + void *data; + size_t dataSize; + CUmem_range_attribute attribute; + CUdeviceptr devPtr; + size_t count; +} cuMemRangeGetAttribute_params; + +typedef struct cuMemRangeGetAttributes_params_st { + void **data; + size_t *dataSizes; + CUmem_range_attribute *attributes; + size_t numAttributes; + CUdeviceptr devPtr; + size_t count; +} cuMemRangeGetAttributes_params; + +typedef struct cuPointerSetAttribute_params_st { + const void *value; + CUpointer_attribute attribute; + CUdeviceptr ptr; +} cuPointerSetAttribute_params; + +typedef struct cuPointerGetAttributes_params_st { + unsigned int numAttributes; + CUpointer_attribute *attributes; + void **data; + CUdeviceptr ptr; +} cuPointerGetAttributes_params; + +typedef struct cuStreamCreate_params_st { + CUstream *phStream; + unsigned int Flags; +} cuStreamCreate_params; + +typedef struct cuStreamCreateWithPriority_params_st { + CUstream *phStream; + unsigned int flags; + int priority; +} cuStreamCreateWithPriority_params; + +typedef struct cuStreamGetPriority_ptsz_params_st { + CUstream hStream; + int *priority; +} cuStreamGetPriority_ptsz_params; + +typedef struct cuStreamGetFlags_ptsz_params_st { + CUstream hStream; + unsigned int *flags; +} cuStreamGetFlags_ptsz_params; + +typedef struct cuStreamGetCtx_ptsz_params_st { + CUstream hStream; + CUcontext *pctx; +} cuStreamGetCtx_ptsz_params; + +typedef struct cuStreamWaitEvent_ptsz_params_st { + CUstream hStream; + CUevent hEvent; + unsigned int Flags; +} cuStreamWaitEvent_ptsz_params; + +typedef struct cuStreamAddCallback_ptsz_params_st { + CUstream hStream; + CUstreamCallback callback; + void *userData; + unsigned int flags; +} cuStreamAddCallback_ptsz_params; + +typedef struct cuStreamBeginCapture_v2_ptsz_params_st { + CUstream hStream; + CUstreamCaptureMode mode; +} cuStreamBeginCapture_v2_ptsz_params; + +typedef struct cuThreadExchangeStreamCaptureMode_params_st { + CUstreamCaptureMode *mode; +} cuThreadExchangeStreamCaptureMode_params; + +typedef struct cuStreamEndCapture_ptsz_params_st { + CUstream hStream; + CUgraph *phGraph; +} cuStreamEndCapture_ptsz_params; + +typedef struct cuStreamIsCapturing_ptsz_params_st { + CUstream hStream; + CUstreamCaptureStatus *captureStatus; +} cuStreamIsCapturing_ptsz_params; + +typedef struct cuStreamGetCaptureInfo_ptsz_params_st { + CUstream hStream; + CUstreamCaptureStatus *captureStatus_out; + cuuint64_t *id_out; +} cuStreamGetCaptureInfo_ptsz_params; + +typedef struct cuStreamGetCaptureInfo_v2_ptsz_params_st { + CUstream hStream; + CUstreamCaptureStatus *captureStatus_out; + cuuint64_t *id_out; + CUgraph *graph_out; + const CUgraphNode **dependencies_out; + size_t *numDependencies_out; +} cuStreamGetCaptureInfo_v2_ptsz_params; + +typedef struct cuStreamUpdateCaptureDependencies_ptsz_params_st { + CUstream hStream; + CUgraphNode *dependencies; + size_t numDependencies; + unsigned int flags; +} cuStreamUpdateCaptureDependencies_ptsz_params; + +typedef struct cuStreamAttachMemAsync_ptsz_params_st { + CUstream hStream; + CUdeviceptr dptr; + size_t length; + unsigned int flags; +} cuStreamAttachMemAsync_ptsz_params; + +typedef struct cuStreamQuery_ptsz_params_st { + CUstream hStream; +} cuStreamQuery_ptsz_params; + +typedef struct cuStreamSynchronize_ptsz_params_st { + CUstream hStream; +} cuStreamSynchronize_ptsz_params; + +typedef struct cuStreamDestroy_v2_params_st { + CUstream hStream; +} cuStreamDestroy_v2_params; + +typedef struct cuStreamCopyAttributes_ptsz_params_st { + CUstream dst; + CUstream src; +} cuStreamCopyAttributes_ptsz_params; + +typedef struct cuStreamGetAttribute_ptsz_params_st { + CUstream hStream; + CUstreamAttrID attr; + CUstreamAttrValue *value_out; +} cuStreamGetAttribute_ptsz_params; + +typedef struct cuStreamSetAttribute_ptsz_params_st { + CUstream hStream; + CUstreamAttrID attr; + const CUstreamAttrValue *value; +} cuStreamSetAttribute_ptsz_params; + +typedef struct cuEventCreate_params_st { + CUevent *phEvent; + unsigned int Flags; +} cuEventCreate_params; + +typedef struct cuEventRecord_ptsz_params_st { + CUevent hEvent; + CUstream hStream; +} cuEventRecord_ptsz_params; + +typedef struct cuEventRecordWithFlags_ptsz_params_st { + CUevent hEvent; + CUstream hStream; + unsigned int flags; +} cuEventRecordWithFlags_ptsz_params; + +typedef struct cuEventQuery_params_st { + CUevent hEvent; +} cuEventQuery_params; + +typedef struct cuEventSynchronize_params_st { + CUevent hEvent; +} cuEventSynchronize_params; + +typedef struct cuEventDestroy_v2_params_st { + CUevent hEvent; +} cuEventDestroy_v2_params; + +typedef struct cuEventElapsedTime_params_st { + float *pMilliseconds; + CUevent hStart; + CUevent hEnd; +} cuEventElapsedTime_params; + +typedef struct cuImportExternalMemory_params_st { + CUexternalMemory *extMem_out; + const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc; +} cuImportExternalMemory_params; + +typedef struct cuExternalMemoryGetMappedBuffer_params_st { + CUdeviceptr *devPtr; + CUexternalMemory extMem; + const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc; +} cuExternalMemoryGetMappedBuffer_params; + +typedef struct cuExternalMemoryGetMappedMipmappedArray_params_st { + CUmipmappedArray *mipmap; + CUexternalMemory extMem; + const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc; +} cuExternalMemoryGetMappedMipmappedArray_params; + +typedef struct cuDestroyExternalMemory_params_st { + CUexternalMemory extMem; +} cuDestroyExternalMemory_params; + +typedef struct cuImportExternalSemaphore_params_st { + CUexternalSemaphore *extSem_out; + const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc; +} cuImportExternalSemaphore_params; + +typedef struct cuSignalExternalSemaphoresAsync_ptsz_params_st { + const CUexternalSemaphore *extSemArray; + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray; + unsigned int numExtSems; + CUstream stream; +} cuSignalExternalSemaphoresAsync_ptsz_params; + +typedef struct cuWaitExternalSemaphoresAsync_ptsz_params_st { + const CUexternalSemaphore *extSemArray; + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray; + unsigned int numExtSems; + CUstream stream; +} cuWaitExternalSemaphoresAsync_ptsz_params; + +typedef struct cuDestroyExternalSemaphore_params_st { + CUexternalSemaphore extSem; +} cuDestroyExternalSemaphore_params; + +typedef struct cuStreamWaitValue32_ptsz_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint32_t value; + unsigned int flags; +} cuStreamWaitValue32_ptsz_params; + +typedef struct cuStreamWaitValue64_ptsz_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint64_t value; + unsigned int flags; +} cuStreamWaitValue64_ptsz_params; + +typedef struct cuStreamWriteValue32_ptsz_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint32_t value; + unsigned int flags; +} cuStreamWriteValue32_ptsz_params; + +typedef struct cuStreamWriteValue64_ptsz_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint64_t value; + unsigned int flags; +} cuStreamWriteValue64_ptsz_params; + +typedef struct cuStreamBatchMemOp_ptsz_params_st { + CUstream stream; + unsigned int count; + CUstreamBatchMemOpParams *paramArray; + unsigned int flags; +} cuStreamBatchMemOp_ptsz_params; + +typedef struct cuFuncGetAttribute_params_st { + int *pi; + CUfunction_attribute attrib; + CUfunction hfunc; +} cuFuncGetAttribute_params; + +typedef struct cuFuncSetAttribute_params_st { + CUfunction hfunc; + CUfunction_attribute attrib; + int value; +} cuFuncSetAttribute_params; + +typedef struct cuFuncSetCacheConfig_params_st { + CUfunction hfunc; + CUfunc_cache config; +} cuFuncSetCacheConfig_params; + +typedef struct cuFuncSetSharedMemConfig_params_st { + CUfunction hfunc; + CUsharedconfig config; +} cuFuncSetSharedMemConfig_params; + +typedef struct cuFuncGetModule_params_st { + CUmodule *hmod; + CUfunction hfunc; +} cuFuncGetModule_params; + +typedef struct cuLaunchKernel_ptsz_params_st { + CUfunction f; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; + void **extra; +} cuLaunchKernel_ptsz_params; + +typedef struct cuLaunchKernelEx_ptsz_params_st { + const CUlaunchConfig *config; + CUfunction f; + void **kernelParams; + void **extra; +} cuLaunchKernelEx_ptsz_params; + +typedef struct cuLaunchCooperativeKernel_ptsz_params_st { + CUfunction f; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; +} cuLaunchCooperativeKernel_ptsz_params; + +typedef struct cuLaunchCooperativeKernelMultiDevice_params_st { + CUDA_LAUNCH_PARAMS *launchParamsList; + unsigned int numDevices; + unsigned int flags; +} cuLaunchCooperativeKernelMultiDevice_params; + +typedef struct cuLaunchHostFunc_ptsz_params_st { + CUstream hStream; + CUhostFn fn; + void *userData; +} cuLaunchHostFunc_ptsz_params; + +typedef struct cuFuncSetBlockShape_params_st { + CUfunction hfunc; + int x; + int y; + int z; +} cuFuncSetBlockShape_params; + +typedef struct cuFuncSetSharedSize_params_st { + CUfunction hfunc; + unsigned int bytes; +} cuFuncSetSharedSize_params; + +typedef struct cuParamSetSize_params_st { + CUfunction hfunc; + unsigned int numbytes; +} cuParamSetSize_params; + +typedef struct cuParamSeti_params_st { + CUfunction hfunc; + int offset; + unsigned int value; +} cuParamSeti_params; + +typedef struct cuParamSetf_params_st { + CUfunction hfunc; + int offset; + float value; +} cuParamSetf_params; + +typedef struct cuParamSetv_params_st { + CUfunction hfunc; + int offset; + void *ptr; + unsigned int numbytes; +} cuParamSetv_params; + +typedef struct cuLaunch_params_st { + CUfunction f; +} cuLaunch_params; + +typedef struct cuLaunchGrid_params_st { + CUfunction f; + int grid_width; + int grid_height; +} cuLaunchGrid_params; + +typedef struct cuLaunchGridAsync_params_st { + CUfunction f; + int grid_width; + int grid_height; + CUstream hStream; +} cuLaunchGridAsync_params; + +typedef struct cuParamSetTexRef_params_st { + CUfunction hfunc; + int texunit; + CUtexref hTexRef; +} cuParamSetTexRef_params; + +typedef struct cuGraphCreate_params_st { + CUgraph *phGraph; + unsigned int flags; +} cuGraphCreate_params; + +typedef struct cuGraphAddKernelNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + const CUDA_KERNEL_NODE_PARAMS *nodeParams; +} cuGraphAddKernelNode_params; + +typedef struct cuGraphKernelNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_KERNEL_NODE_PARAMS *nodeParams; +} cuGraphKernelNodeGetParams_params; + +typedef struct cuGraphKernelNodeSetParams_params_st { + CUgraphNode hNode; + const CUDA_KERNEL_NODE_PARAMS *nodeParams; +} cuGraphKernelNodeSetParams_params; + +typedef struct cuGraphAddMemcpyNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + const CUDA_MEMCPY3D *copyParams; + CUcontext ctx; +} cuGraphAddMemcpyNode_params; + +typedef struct cuGraphMemcpyNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_MEMCPY3D *nodeParams; +} cuGraphMemcpyNodeGetParams_params; + +typedef struct cuGraphMemcpyNodeSetParams_params_st { + CUgraphNode hNode; + const CUDA_MEMCPY3D *nodeParams; +} cuGraphMemcpyNodeSetParams_params; + +typedef struct cuGraphAddMemsetNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + const CUDA_MEMSET_NODE_PARAMS *memsetParams; + CUcontext ctx; +} cuGraphAddMemsetNode_params; + +typedef struct cuGraphMemsetNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_MEMSET_NODE_PARAMS *nodeParams; +} cuGraphMemsetNodeGetParams_params; + +typedef struct cuGraphMemsetNodeSetParams_params_st { + CUgraphNode hNode; + const CUDA_MEMSET_NODE_PARAMS *nodeParams; +} cuGraphMemsetNodeSetParams_params; + +typedef struct cuGraphAddHostNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + const CUDA_HOST_NODE_PARAMS *nodeParams; +} cuGraphAddHostNode_params; + +typedef struct cuGraphHostNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_HOST_NODE_PARAMS *nodeParams; +} cuGraphHostNodeGetParams_params; + +typedef struct cuGraphHostNodeSetParams_params_st { + CUgraphNode hNode; + const CUDA_HOST_NODE_PARAMS *nodeParams; +} cuGraphHostNodeSetParams_params; + +typedef struct cuGraphAddChildGraphNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + CUgraph childGraph; +} cuGraphAddChildGraphNode_params; + +typedef struct cuGraphChildGraphNodeGetGraph_params_st { + CUgraphNode hNode; + CUgraph *phGraph; +} cuGraphChildGraphNodeGetGraph_params; + +typedef struct cuGraphAddEmptyNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; +} cuGraphAddEmptyNode_params; + +typedef struct cuGraphAddEventRecordNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + CUevent event; +} cuGraphAddEventRecordNode_params; + +typedef struct cuGraphEventRecordNodeGetEvent_params_st { + CUgraphNode hNode; + CUevent *event_out; +} cuGraphEventRecordNodeGetEvent_params; + +typedef struct cuGraphEventRecordNodeSetEvent_params_st { + CUgraphNode hNode; + CUevent event; +} cuGraphEventRecordNodeSetEvent_params; + +typedef struct cuGraphAddEventWaitNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + CUevent event; +} cuGraphAddEventWaitNode_params; + +typedef struct cuGraphEventWaitNodeGetEvent_params_st { + CUgraphNode hNode; + CUevent *event_out; +} cuGraphEventWaitNodeGetEvent_params; + +typedef struct cuGraphEventWaitNodeSetEvent_params_st { + CUgraphNode hNode; + CUevent event; +} cuGraphEventWaitNodeSetEvent_params; + +typedef struct cuGraphAddExternalSemaphoresSignalNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams; +} cuGraphAddExternalSemaphoresSignalNode_params; + +typedef struct cuGraphExternalSemaphoresSignalNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out; +} cuGraphExternalSemaphoresSignalNodeGetParams_params; + +typedef struct cuGraphExternalSemaphoresSignalNodeSetParams_params_st { + CUgraphNode hNode; + const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams; +} cuGraphExternalSemaphoresSignalNodeSetParams_params; + +typedef struct cuGraphAddExternalSemaphoresWaitNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams; +} cuGraphAddExternalSemaphoresWaitNode_params; + +typedef struct cuGraphExternalSemaphoresWaitNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out; +} cuGraphExternalSemaphoresWaitNodeGetParams_params; + +typedef struct cuGraphExternalSemaphoresWaitNodeSetParams_params_st { + CUgraphNode hNode; + const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams; +} cuGraphExternalSemaphoresWaitNodeSetParams_params; + +typedef struct cuGraphAddMemAllocNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams; +} cuGraphAddMemAllocNode_params; + +typedef struct cuGraphMemAllocNodeGetParams_params_st { + CUgraphNode hNode; + CUDA_MEM_ALLOC_NODE_PARAMS *params_out; +} cuGraphMemAllocNodeGetParams_params; + +typedef struct cuGraphAddMemFreeNode_params_st { + CUgraphNode *phGraphNode; + CUgraph hGraph; + const CUgraphNode *dependencies; + size_t numDependencies; + CUdeviceptr dptr; +} cuGraphAddMemFreeNode_params; + +typedef struct cuGraphMemFreeNodeGetParams_params_st { + CUgraphNode hNode; + CUdeviceptr *dptr_out; +} cuGraphMemFreeNodeGetParams_params; + +typedef struct cuDeviceGraphMemTrim_params_st { + CUdevice device; +} cuDeviceGraphMemTrim_params; + +typedef struct cuDeviceGetGraphMemAttribute_params_st { + CUdevice device; + CUgraphMem_attribute attr; + void *value; +} cuDeviceGetGraphMemAttribute_params; + +typedef struct cuDeviceSetGraphMemAttribute_params_st { + CUdevice device; + CUgraphMem_attribute attr; + void *value; +} cuDeviceSetGraphMemAttribute_params; + +typedef struct cuGraphClone_params_st { + CUgraph *phGraphClone; + CUgraph originalGraph; +} cuGraphClone_params; + +typedef struct cuGraphNodeFindInClone_params_st { + CUgraphNode *phNode; + CUgraphNode hOriginalNode; + CUgraph hClonedGraph; +} cuGraphNodeFindInClone_params; + +typedef struct cuGraphNodeGetType_params_st { + CUgraphNode hNode; + CUgraphNodeType *type; +} cuGraphNodeGetType_params; + +typedef struct cuGraphGetNodes_params_st { + CUgraph hGraph; + CUgraphNode *nodes; + size_t *numNodes; +} cuGraphGetNodes_params; + +typedef struct cuGraphGetRootNodes_params_st { + CUgraph hGraph; + CUgraphNode *rootNodes; + size_t *numRootNodes; +} cuGraphGetRootNodes_params; + +typedef struct cuGraphGetEdges_params_st { + CUgraph hGraph; + CUgraphNode *from; + CUgraphNode *to; + size_t *numEdges; +} cuGraphGetEdges_params; + +typedef struct cuGraphNodeGetDependencies_params_st { + CUgraphNode hNode; + CUgraphNode *dependencies; + size_t *numDependencies; +} cuGraphNodeGetDependencies_params; + +typedef struct cuGraphNodeGetDependentNodes_params_st { + CUgraphNode hNode; + CUgraphNode *dependentNodes; + size_t *numDependentNodes; +} cuGraphNodeGetDependentNodes_params; + +typedef struct cuGraphAddDependencies_params_st { + CUgraph hGraph; + const CUgraphNode *from; + const CUgraphNode *to; + size_t numDependencies; +} cuGraphAddDependencies_params; + +typedef struct cuGraphRemoveDependencies_params_st { + CUgraph hGraph; + const CUgraphNode *from; + const CUgraphNode *to; + size_t numDependencies; +} cuGraphRemoveDependencies_params; + +typedef struct cuGraphDestroyNode_params_st { + CUgraphNode hNode; +} cuGraphDestroyNode_params; + +typedef struct cuGraphInstantiate_v2_params_st { + CUgraphExec *phGraphExec; + CUgraph hGraph; + CUgraphNode *phErrorNode; + char *logBuffer; + size_t bufferSize; +} cuGraphInstantiate_v2_params; + +typedef struct cuGraphInstantiateWithFlags_params_st { + CUgraphExec *phGraphExec; + CUgraph hGraph; + unsigned long long flags; +} cuGraphInstantiateWithFlags_params; + +typedef struct cuGraphExecKernelNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + const CUDA_KERNEL_NODE_PARAMS *nodeParams; +} cuGraphExecKernelNodeSetParams_params; + +typedef struct cuGraphExecMemcpyNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + const CUDA_MEMCPY3D *copyParams; + CUcontext ctx; +} cuGraphExecMemcpyNodeSetParams_params; + +typedef struct cuGraphExecMemsetNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + const CUDA_MEMSET_NODE_PARAMS *memsetParams; + CUcontext ctx; +} cuGraphExecMemsetNodeSetParams_params; + +typedef struct cuGraphExecHostNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + const CUDA_HOST_NODE_PARAMS *nodeParams; +} cuGraphExecHostNodeSetParams_params; + +typedef struct cuGraphExecChildGraphNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + CUgraph childGraph; +} cuGraphExecChildGraphNodeSetParams_params; + +typedef struct cuGraphExecEventRecordNodeSetEvent_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + CUevent event; +} cuGraphExecEventRecordNodeSetEvent_params; + +typedef struct cuGraphExecEventWaitNodeSetEvent_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + CUevent event; +} cuGraphExecEventWaitNodeSetEvent_params; + +typedef struct cuGraphExecExternalSemaphoresSignalNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams; +} cuGraphExecExternalSemaphoresSignalNodeSetParams_params; + +typedef struct cuGraphExecExternalSemaphoresWaitNodeSetParams_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams; +} cuGraphExecExternalSemaphoresWaitNodeSetParams_params; + +typedef struct cuGraphNodeSetEnabled_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + unsigned int isEnabled; +} cuGraphNodeSetEnabled_params; + +typedef struct cuGraphNodeGetEnabled_params_st { + CUgraphExec hGraphExec; + CUgraphNode hNode; + unsigned int *isEnabled; +} cuGraphNodeGetEnabled_params; + +typedef struct cuGraphUpload_ptsz_params_st { + CUgraphExec hGraphExec; + CUstream hStream; +} cuGraphUpload_ptsz_params; + +typedef struct cuGraphLaunch_ptsz_params_st { + CUgraphExec hGraphExec; + CUstream hStream; +} cuGraphLaunch_ptsz_params; + +typedef struct cuGraphExecDestroy_params_st { + CUgraphExec hGraphExec; +} cuGraphExecDestroy_params; + +typedef struct cuGraphDestroy_params_st { + CUgraph hGraph; +} cuGraphDestroy_params; + +typedef struct cuGraphExecUpdate_params_st { + CUgraphExec hGraphExec; + CUgraph hGraph; + CUgraphNode *hErrorNode_out; + CUgraphExecUpdateResult *updateResult_out; +} cuGraphExecUpdate_params; + +typedef struct cuGraphKernelNodeCopyAttributes_params_st { + CUgraphNode dst; + CUgraphNode src; +} cuGraphKernelNodeCopyAttributes_params; + +typedef struct cuGraphKernelNodeGetAttribute_params_st { + CUgraphNode hNode; + CUkernelNodeAttrID attr; + CUkernelNodeAttrValue *value_out; +} cuGraphKernelNodeGetAttribute_params; + +typedef struct cuGraphKernelNodeSetAttribute_params_st { + CUgraphNode hNode; + CUkernelNodeAttrID attr; + const CUkernelNodeAttrValue *value; +} cuGraphKernelNodeSetAttribute_params; + +typedef struct cuGraphDebugDotPrint_params_st { + CUgraph hGraph; + const char *path; + unsigned int flags; +} cuGraphDebugDotPrint_params; + +typedef struct cuUserObjectCreate_params_st { + CUuserObject *object_out; + void *ptr; + CUhostFn destroy; + unsigned int initialRefcount; + unsigned int flags; +} cuUserObjectCreate_params; + +typedef struct cuUserObjectRetain_params_st { + CUuserObject object; + unsigned int count; +} cuUserObjectRetain_params; + +typedef struct cuUserObjectRelease_params_st { + CUuserObject object; + unsigned int count; +} cuUserObjectRelease_params; + +typedef struct cuGraphRetainUserObject_params_st { + CUgraph graph; + CUuserObject object; + unsigned int count; + unsigned int flags; +} cuGraphRetainUserObject_params; + +typedef struct cuGraphReleaseUserObject_params_st { + CUgraph graph; + CUuserObject object; + unsigned int count; +} cuGraphReleaseUserObject_params; + +typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessor_params_st { + int *numBlocks; + CUfunction func; + int blockSize; + size_t dynamicSMemSize; +} cuOccupancyMaxActiveBlocksPerMultiprocessor_params; + +typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params_st { + int *numBlocks; + CUfunction func; + int blockSize; + size_t dynamicSMemSize; + unsigned int flags; +} cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params; + +typedef struct cuOccupancyMaxPotentialBlockSize_params_st { + int *minGridSize; + int *blockSize; + CUfunction func; + CUoccupancyB2DSize blockSizeToDynamicSMemSize; + size_t dynamicSMemSize; + int blockSizeLimit; +} cuOccupancyMaxPotentialBlockSize_params; + +typedef struct cuOccupancyMaxPotentialBlockSizeWithFlags_params_st { + int *minGridSize; + int *blockSize; + CUfunction func; + CUoccupancyB2DSize blockSizeToDynamicSMemSize; + size_t dynamicSMemSize; + int blockSizeLimit; + unsigned int flags; +} cuOccupancyMaxPotentialBlockSizeWithFlags_params; + +typedef struct cuOccupancyAvailableDynamicSMemPerBlock_params_st { + size_t *dynamicSmemSize; + CUfunction func; + int numBlocks; + int blockSize; +} cuOccupancyAvailableDynamicSMemPerBlock_params; + +typedef struct cuOccupancyMaxPotentialClusterSize_params_st { + int *clusterSize; + CUfunction func; + const CUlaunchConfig *config; +} cuOccupancyMaxPotentialClusterSize_params; + +typedef struct cuOccupancyMaxActiveClusters_params_st { + int *numClusters; + CUfunction func; + const CUlaunchConfig *config; +} cuOccupancyMaxActiveClusters_params; + +typedef struct cuTexRefSetArray_params_st { + CUtexref hTexRef; + CUarray hArray; + unsigned int Flags; +} cuTexRefSetArray_params; + +typedef struct cuTexRefSetMipmappedArray_params_st { + CUtexref hTexRef; + CUmipmappedArray hMipmappedArray; + unsigned int Flags; +} cuTexRefSetMipmappedArray_params; + +typedef struct cuTexRefSetAddress_v2_params_st { + size_t *ByteOffset; + CUtexref hTexRef; + CUdeviceptr dptr; + size_t bytes; +} cuTexRefSetAddress_v2_params; + +typedef struct cuTexRefSetAddress2D_v3_params_st { + CUtexref hTexRef; + const CUDA_ARRAY_DESCRIPTOR *desc; + CUdeviceptr dptr; + size_t Pitch; +} cuTexRefSetAddress2D_v3_params; + +typedef struct cuTexRefSetFormat_params_st { + CUtexref hTexRef; + CUarray_format fmt; + int NumPackedComponents; +} cuTexRefSetFormat_params; + +typedef struct cuTexRefSetAddressMode_params_st { + CUtexref hTexRef; + int dim; + CUaddress_mode am; +} cuTexRefSetAddressMode_params; + +typedef struct cuTexRefSetFilterMode_params_st { + CUtexref hTexRef; + CUfilter_mode fm; +} cuTexRefSetFilterMode_params; + +typedef struct cuTexRefSetMipmapFilterMode_params_st { + CUtexref hTexRef; + CUfilter_mode fm; +} cuTexRefSetMipmapFilterMode_params; + +typedef struct cuTexRefSetMipmapLevelBias_params_st { + CUtexref hTexRef; + float bias; +} cuTexRefSetMipmapLevelBias_params; + +typedef struct cuTexRefSetMipmapLevelClamp_params_st { + CUtexref hTexRef; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; +} cuTexRefSetMipmapLevelClamp_params; + +typedef struct cuTexRefSetMaxAnisotropy_params_st { + CUtexref hTexRef; + unsigned int maxAniso; +} cuTexRefSetMaxAnisotropy_params; + +typedef struct cuTexRefSetBorderColor_params_st { + CUtexref hTexRef; + float *pBorderColor; +} cuTexRefSetBorderColor_params; + +typedef struct cuTexRefSetFlags_params_st { + CUtexref hTexRef; + unsigned int Flags; +} cuTexRefSetFlags_params; + +typedef struct cuTexRefGetAddress_v2_params_st { + CUdeviceptr *pdptr; + CUtexref hTexRef; +} cuTexRefGetAddress_v2_params; + +typedef struct cuTexRefGetArray_params_st { + CUarray *phArray; + CUtexref hTexRef; +} cuTexRefGetArray_params; + +typedef struct cuTexRefGetMipmappedArray_params_st { + CUmipmappedArray *phMipmappedArray; + CUtexref hTexRef; +} cuTexRefGetMipmappedArray_params; + +typedef struct cuTexRefGetAddressMode_params_st { + CUaddress_mode *pam; + CUtexref hTexRef; + int dim; +} cuTexRefGetAddressMode_params; + +typedef struct cuTexRefGetFilterMode_params_st { + CUfilter_mode *pfm; + CUtexref hTexRef; +} cuTexRefGetFilterMode_params; + +typedef struct cuTexRefGetFormat_params_st { + CUarray_format *pFormat; + int *pNumChannels; + CUtexref hTexRef; +} cuTexRefGetFormat_params; + +typedef struct cuTexRefGetMipmapFilterMode_params_st { + CUfilter_mode *pfm; + CUtexref hTexRef; +} cuTexRefGetMipmapFilterMode_params; + +typedef struct cuTexRefGetMipmapLevelBias_params_st { + float *pbias; + CUtexref hTexRef; +} cuTexRefGetMipmapLevelBias_params; + +typedef struct cuTexRefGetMipmapLevelClamp_params_st { + float *pminMipmapLevelClamp; + float *pmaxMipmapLevelClamp; + CUtexref hTexRef; +} cuTexRefGetMipmapLevelClamp_params; + +typedef struct cuTexRefGetMaxAnisotropy_params_st { + int *pmaxAniso; + CUtexref hTexRef; +} cuTexRefGetMaxAnisotropy_params; + +typedef struct cuTexRefGetBorderColor_params_st { + float *pBorderColor; + CUtexref hTexRef; +} cuTexRefGetBorderColor_params; + +typedef struct cuTexRefGetFlags_params_st { + unsigned int *pFlags; + CUtexref hTexRef; +} cuTexRefGetFlags_params; + +typedef struct cuTexRefCreate_params_st { + CUtexref *pTexRef; +} cuTexRefCreate_params; + +typedef struct cuTexRefDestroy_params_st { + CUtexref hTexRef; +} cuTexRefDestroy_params; + +typedef struct cuSurfRefSetArray_params_st { + CUsurfref hSurfRef; + CUarray hArray; + unsigned int Flags; +} cuSurfRefSetArray_params; + +typedef struct cuSurfRefGetArray_params_st { + CUarray *phArray; + CUsurfref hSurfRef; +} cuSurfRefGetArray_params; + +typedef struct cuTexObjectCreate_params_st { + CUtexObject *pTexObject; + const CUDA_RESOURCE_DESC *pResDesc; + const CUDA_TEXTURE_DESC *pTexDesc; + const CUDA_RESOURCE_VIEW_DESC *pResViewDesc; +} cuTexObjectCreate_params; + +typedef struct cuTexObjectDestroy_params_st { + CUtexObject texObject; +} cuTexObjectDestroy_params; + +typedef struct cuTexObjectGetResourceDesc_params_st { + CUDA_RESOURCE_DESC *pResDesc; + CUtexObject texObject; +} cuTexObjectGetResourceDesc_params; + +typedef struct cuTexObjectGetTextureDesc_params_st { + CUDA_TEXTURE_DESC *pTexDesc; + CUtexObject texObject; +} cuTexObjectGetTextureDesc_params; + +typedef struct cuTexObjectGetResourceViewDesc_params_st { + CUDA_RESOURCE_VIEW_DESC *pResViewDesc; + CUtexObject texObject; +} cuTexObjectGetResourceViewDesc_params; + +typedef struct cuSurfObjectCreate_params_st { + CUsurfObject *pSurfObject; + const CUDA_RESOURCE_DESC *pResDesc; +} cuSurfObjectCreate_params; + +typedef struct cuSurfObjectDestroy_params_st { + CUsurfObject surfObject; +} cuSurfObjectDestroy_params; + +typedef struct cuSurfObjectGetResourceDesc_params_st { + CUDA_RESOURCE_DESC *pResDesc; + CUsurfObject surfObject; +} cuSurfObjectGetResourceDesc_params; + +typedef struct cuDeviceCanAccessPeer_params_st { + int *canAccessPeer; + CUdevice dev; + CUdevice peerDev; +} cuDeviceCanAccessPeer_params; + +typedef struct cuCtxEnablePeerAccess_params_st { + CUcontext peerContext; + unsigned int Flags; +} cuCtxEnablePeerAccess_params; + +typedef struct cuCtxDisablePeerAccess_params_st { + CUcontext peerContext; +} cuCtxDisablePeerAccess_params; + +typedef struct cuDeviceGetP2PAttribute_params_st { + int *value; + CUdevice_P2PAttribute attrib; + CUdevice srcDevice; + CUdevice dstDevice; +} cuDeviceGetP2PAttribute_params; + +typedef struct cuGraphicsUnregisterResource_params_st { + CUgraphicsResource resource; +} cuGraphicsUnregisterResource_params; + +typedef struct cuGraphicsSubResourceGetMappedArray_params_st { + CUarray *pArray; + CUgraphicsResource resource; + unsigned int arrayIndex; + unsigned int mipLevel; +} cuGraphicsSubResourceGetMappedArray_params; + +typedef struct cuGraphicsResourceGetMappedMipmappedArray_params_st { + CUmipmappedArray *pMipmappedArray; + CUgraphicsResource resource; +} cuGraphicsResourceGetMappedMipmappedArray_params; + +typedef struct cuGraphicsResourceGetMappedPointer_v2_params_st { + CUdeviceptr *pDevPtr; + size_t *pSize; + CUgraphicsResource resource; +} cuGraphicsResourceGetMappedPointer_v2_params; + +typedef struct cuGraphicsResourceSetMapFlags_v2_params_st { + CUgraphicsResource resource; + unsigned int flags; +} cuGraphicsResourceSetMapFlags_v2_params; + +typedef struct cuGraphicsMapResources_ptsz_params_st { + unsigned int count; + CUgraphicsResource *resources; + CUstream hStream; +} cuGraphicsMapResources_ptsz_params; + +typedef struct cuGraphicsUnmapResources_ptsz_params_st { + unsigned int count; + CUgraphicsResource *resources; + CUstream hStream; +} cuGraphicsUnmapResources_ptsz_params; + +typedef struct cuGetProcAddress_params_st { + const char *symbol; + void **pfn; + int cudaVersion; + cuuint64_t flags; +} cuGetProcAddress_params; + +typedef struct cuModuleGetLoadingMode_params_st { + CUmoduleLoadingMode *mode; +} cuModuleGetLoadingMode_params; + +typedef struct cuMemGetHandleForAddressRange_params_st { + void *handle; + CUdeviceptr dptr; + size_t size; + CUmemRangeHandleType handleType; + unsigned long long flags; +} cuMemGetHandleForAddressRange_params; + +typedef struct cuGetExportTable_params_st { + const void **ppExportTable; + const CUuuid *pExportTableId; +} cuGetExportTable_params; + +typedef struct cuMemHostRegister_params_st { + void *p; + size_t bytesize; + unsigned int Flags; +} cuMemHostRegister_params; + +typedef struct cuGraphicsResourceSetMapFlags_params_st { + CUgraphicsResource resource; + unsigned int flags; +} cuGraphicsResourceSetMapFlags_params; + +typedef struct cuLinkCreate_params_st { + unsigned int numOptions; + CUjit_option *options; + void **optionValues; + CUlinkState *stateOut; +} cuLinkCreate_params; + +typedef struct cuLinkAddData_params_st { + CUlinkState state; + CUjitInputType type; + void *data; + size_t size; + const char *name; + unsigned int numOptions; + CUjit_option *options; + void **optionValues; +} cuLinkAddData_params; + +typedef struct cuLinkAddFile_params_st { + CUlinkState state; + CUjitInputType type; + const char *path; + unsigned int numOptions; + CUjit_option *options; + void **optionValues; +} cuLinkAddFile_params; + +typedef struct cuTexRefSetAddress2D_v2_params_st { + CUtexref hTexRef; + const CUDA_ARRAY_DESCRIPTOR *desc; + CUdeviceptr dptr; + size_t Pitch; +} cuTexRefSetAddress2D_v2_params; + +typedef struct cuDeviceTotalMem_params_st { + unsigned int *bytes; + CUdevice dev; +} cuDeviceTotalMem_params; + +typedef struct cuCtxCreate_params_st { + CUcontext *pctx; + unsigned int flags; + CUdevice dev; +} cuCtxCreate_params; + +typedef struct cuModuleGetGlobal_params_st { + CUdeviceptr_v1 *dptr; + unsigned int *bytes; + CUmodule hmod; + const char *name; +} cuModuleGetGlobal_params; + +typedef struct cuMemGetInfo_params_st { + unsigned int *free; + unsigned int *total; +} cuMemGetInfo_params; + +typedef struct cuMemAlloc_params_st { + CUdeviceptr_v1 *dptr; + unsigned int bytesize; +} cuMemAlloc_params; + +typedef struct cuMemAllocPitch_params_st { + CUdeviceptr_v1 *dptr; + unsigned int *pPitch; + unsigned int WidthInBytes; + unsigned int Height; + unsigned int ElementSizeBytes; +} cuMemAllocPitch_params; + +typedef struct cuMemFree_params_st { + CUdeviceptr_v1 dptr; +} cuMemFree_params; + +typedef struct cuMemGetAddressRange_params_st { + CUdeviceptr_v1 *pbase; + unsigned int *psize; + CUdeviceptr_v1 dptr; +} cuMemGetAddressRange_params; + +typedef struct cuMemAllocHost_params_st { + void **pp; + unsigned int bytesize; +} cuMemAllocHost_params; + +typedef struct cuMemHostGetDevicePointer_params_st { + CUdeviceptr_v1 *pdptr; + void *p; + unsigned int Flags; +} cuMemHostGetDevicePointer_params; + +typedef struct cuMemcpyHtoD_params_st { + CUdeviceptr_v1 dstDevice; + const void *srcHost; + unsigned int ByteCount; +} cuMemcpyHtoD_params; + +typedef struct cuMemcpyDtoH_params_st { + void *dstHost; + CUdeviceptr_v1 srcDevice; + unsigned int ByteCount; +} cuMemcpyDtoH_params; + +typedef struct cuMemcpyDtoD_params_st { + CUdeviceptr_v1 dstDevice; + CUdeviceptr_v1 srcDevice; + unsigned int ByteCount; +} cuMemcpyDtoD_params; + +typedef struct cuMemcpyDtoA_params_st { + CUarray dstArray; + unsigned int dstOffset; + CUdeviceptr_v1 srcDevice; + unsigned int ByteCount; +} cuMemcpyDtoA_params; + +typedef struct cuMemcpyAtoD_params_st { + CUdeviceptr_v1 dstDevice; + CUarray srcArray; + unsigned int srcOffset; + unsigned int ByteCount; +} cuMemcpyAtoD_params; + +typedef struct cuMemcpyHtoA_params_st { + CUarray dstArray; + unsigned int dstOffset; + const void *srcHost; + unsigned int ByteCount; +} cuMemcpyHtoA_params; + +typedef struct cuMemcpyAtoH_params_st { + void *dstHost; + CUarray srcArray; + unsigned int srcOffset; + unsigned int ByteCount; +} cuMemcpyAtoH_params; + +typedef struct cuMemcpyAtoA_params_st { + CUarray dstArray; + unsigned int dstOffset; + CUarray srcArray; + unsigned int srcOffset; + unsigned int ByteCount; +} cuMemcpyAtoA_params; + +typedef struct cuMemcpyHtoAAsync_params_st { + CUarray dstArray; + unsigned int dstOffset; + const void *srcHost; + unsigned int ByteCount; + CUstream hStream; +} cuMemcpyHtoAAsync_params; + +typedef struct cuMemcpyAtoHAsync_params_st { + void *dstHost; + CUarray srcArray; + unsigned int srcOffset; + unsigned int ByteCount; + CUstream hStream; +} cuMemcpyAtoHAsync_params; + +typedef struct cuMemcpy2D_params_st { + const CUDA_MEMCPY2D_v1 *pCopy; +} cuMemcpy2D_params; + +typedef struct cuMemcpy2DUnaligned_params_st { + const CUDA_MEMCPY2D_v1 *pCopy; +} cuMemcpy2DUnaligned_params; + +typedef struct cuMemcpy3D_params_st { + const CUDA_MEMCPY3D_v1 *pCopy; +} cuMemcpy3D_params; + +typedef struct cuMemcpyHtoDAsync_params_st { + CUdeviceptr_v1 dstDevice; + const void *srcHost; + unsigned int ByteCount; + CUstream hStream; +} cuMemcpyHtoDAsync_params; + +typedef struct cuMemcpyDtoHAsync_params_st { + void *dstHost; + CUdeviceptr_v1 srcDevice; + unsigned int ByteCount; + CUstream hStream; +} cuMemcpyDtoHAsync_params; + +typedef struct cuMemcpyDtoDAsync_params_st { + CUdeviceptr_v1 dstDevice; + CUdeviceptr_v1 srcDevice; + unsigned int ByteCount; + CUstream hStream; +} cuMemcpyDtoDAsync_params; + +typedef struct cuMemcpy2DAsync_params_st { + const CUDA_MEMCPY2D_v1 *pCopy; + CUstream hStream; +} cuMemcpy2DAsync_params; + +typedef struct cuMemcpy3DAsync_params_st { + const CUDA_MEMCPY3D_v1 *pCopy; + CUstream hStream; +} cuMemcpy3DAsync_params; + +typedef struct cuMemsetD8_params_st { + CUdeviceptr_v1 dstDevice; + unsigned char uc; + unsigned int N; +} cuMemsetD8_params; + +typedef struct cuMemsetD16_params_st { + CUdeviceptr_v1 dstDevice; + unsigned short us; + unsigned int N; +} cuMemsetD16_params; + +typedef struct cuMemsetD32_params_st { + CUdeviceptr_v1 dstDevice; + unsigned int ui; + unsigned int N; +} cuMemsetD32_params; + +typedef struct cuMemsetD2D8_params_st { + CUdeviceptr_v1 dstDevice; + unsigned int dstPitch; + unsigned char uc; + unsigned int Width; + unsigned int Height; +} cuMemsetD2D8_params; + +typedef struct cuMemsetD2D16_params_st { + CUdeviceptr_v1 dstDevice; + unsigned int dstPitch; + unsigned short us; + unsigned int Width; + unsigned int Height; +} cuMemsetD2D16_params; + +typedef struct cuMemsetD2D32_params_st { + CUdeviceptr_v1 dstDevice; + unsigned int dstPitch; + unsigned int ui; + unsigned int Width; + unsigned int Height; +} cuMemsetD2D32_params; + +typedef struct cuArrayCreate_params_st { + CUarray *pHandle; + const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray; +} cuArrayCreate_params; + +typedef struct cuArrayGetDescriptor_params_st { + CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor; + CUarray hArray; +} cuArrayGetDescriptor_params; + +typedef struct cuArray3DCreate_params_st { + CUarray *pHandle; + const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray; +} cuArray3DCreate_params; + +typedef struct cuArray3DGetDescriptor_params_st { + CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor; + CUarray hArray; +} cuArray3DGetDescriptor_params; + +typedef struct cuTexRefSetAddress_params_st { + unsigned int *ByteOffset; + CUtexref hTexRef; + CUdeviceptr_v1 dptr; + unsigned int bytes; +} cuTexRefSetAddress_params; + +typedef struct cuTexRefSetAddress2D_params_st { + CUtexref hTexRef; + const CUDA_ARRAY_DESCRIPTOR_v1 *desc; + CUdeviceptr_v1 dptr; + unsigned int Pitch; +} cuTexRefSetAddress2D_params; + +typedef struct cuTexRefGetAddress_params_st { + CUdeviceptr_v1 *pdptr; + CUtexref hTexRef; +} cuTexRefGetAddress_params; + +typedef struct cuGraphicsResourceGetMappedPointer_params_st { + CUdeviceptr_v1 *pDevPtr; + unsigned int *pSize; + CUgraphicsResource resource; +} cuGraphicsResourceGetMappedPointer_params; + +typedef struct cuCtxDestroy_params_st { + CUcontext ctx; +} cuCtxDestroy_params; + +typedef struct cuCtxPopCurrent_params_st { + CUcontext *pctx; +} cuCtxPopCurrent_params; + +typedef struct cuCtxPushCurrent_params_st { + CUcontext ctx; +} cuCtxPushCurrent_params; + +typedef struct cuStreamDestroy_params_st { + CUstream hStream; +} cuStreamDestroy_params; + +typedef struct cuEventDestroy_params_st { + CUevent hEvent; +} cuEventDestroy_params; + +typedef struct cuDevicePrimaryCtxRelease_params_st { + CUdevice dev; +} cuDevicePrimaryCtxRelease_params; + +typedef struct cuDevicePrimaryCtxReset_params_st { + CUdevice dev; +} cuDevicePrimaryCtxReset_params; + +typedef struct cuDevicePrimaryCtxSetFlags_params_st { + CUdevice dev; + unsigned int flags; +} cuDevicePrimaryCtxSetFlags_params; + +typedef struct cuMemcpyHtoD_v2_params_st { + CUdeviceptr dstDevice; + const void *srcHost; + size_t ByteCount; +} cuMemcpyHtoD_v2_params; + +typedef struct cuMemcpyDtoH_v2_params_st { + void *dstHost; + CUdeviceptr srcDevice; + size_t ByteCount; +} cuMemcpyDtoH_v2_params; + +typedef struct cuMemcpyDtoD_v2_params_st { + CUdeviceptr dstDevice; + CUdeviceptr srcDevice; + size_t ByteCount; +} cuMemcpyDtoD_v2_params; + +typedef struct cuMemcpyDtoA_v2_params_st { + CUarray dstArray; + size_t dstOffset; + CUdeviceptr srcDevice; + size_t ByteCount; +} cuMemcpyDtoA_v2_params; + +typedef struct cuMemcpyAtoD_v2_params_st { + CUdeviceptr dstDevice; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; +} cuMemcpyAtoD_v2_params; + +typedef struct cuMemcpyHtoA_v2_params_st { + CUarray dstArray; + size_t dstOffset; + const void *srcHost; + size_t ByteCount; +} cuMemcpyHtoA_v2_params; + +typedef struct cuMemcpyAtoH_v2_params_st { + void *dstHost; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; +} cuMemcpyAtoH_v2_params; + +typedef struct cuMemcpyAtoA_v2_params_st { + CUarray dstArray; + size_t dstOffset; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; +} cuMemcpyAtoA_v2_params; + +typedef struct cuMemcpyHtoAAsync_v2_params_st { + CUarray dstArray; + size_t dstOffset; + const void *srcHost; + size_t ByteCount; + CUstream hStream; +} cuMemcpyHtoAAsync_v2_params; + +typedef struct cuMemcpyAtoHAsync_v2_params_st { + void *dstHost; + CUarray srcArray; + size_t srcOffset; + size_t ByteCount; + CUstream hStream; +} cuMemcpyAtoHAsync_v2_params; + +typedef struct cuMemcpy2D_v2_params_st { + const CUDA_MEMCPY2D *pCopy; +} cuMemcpy2D_v2_params; + +typedef struct cuMemcpy2DUnaligned_v2_params_st { + const CUDA_MEMCPY2D *pCopy; +} cuMemcpy2DUnaligned_v2_params; + +typedef struct cuMemcpy3D_v2_params_st { + const CUDA_MEMCPY3D *pCopy; +} cuMemcpy3D_v2_params; + +typedef struct cuMemcpyHtoDAsync_v2_params_st { + CUdeviceptr dstDevice; + const void *srcHost; + size_t ByteCount; + CUstream hStream; +} cuMemcpyHtoDAsync_v2_params; + +typedef struct cuMemcpyDtoHAsync_v2_params_st { + void *dstHost; + CUdeviceptr srcDevice; + size_t ByteCount; + CUstream hStream; +} cuMemcpyDtoHAsync_v2_params; + +typedef struct cuMemcpyDtoDAsync_v2_params_st { + CUdeviceptr dstDevice; + CUdeviceptr srcDevice; + size_t ByteCount; + CUstream hStream; +} cuMemcpyDtoDAsync_v2_params; + +typedef struct cuMemcpy2DAsync_v2_params_st { + const CUDA_MEMCPY2D *pCopy; + CUstream hStream; +} cuMemcpy2DAsync_v2_params; + +typedef struct cuMemcpy3DAsync_v2_params_st { + const CUDA_MEMCPY3D *pCopy; + CUstream hStream; +} cuMemcpy3DAsync_v2_params; + +typedef struct cuMemsetD8_v2_params_st { + CUdeviceptr dstDevice; + unsigned char uc; + size_t N; +} cuMemsetD8_v2_params; + +typedef struct cuMemsetD16_v2_params_st { + CUdeviceptr dstDevice; + unsigned short us; + size_t N; +} cuMemsetD16_v2_params; + +typedef struct cuMemsetD32_v2_params_st { + CUdeviceptr dstDevice; + unsigned int ui; + size_t N; +} cuMemsetD32_v2_params; + +typedef struct cuMemsetD2D8_v2_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned char uc; + size_t Width; + size_t Height; +} cuMemsetD2D8_v2_params; + +typedef struct cuMemsetD2D16_v2_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned short us; + size_t Width; + size_t Height; +} cuMemsetD2D16_v2_params; + +typedef struct cuMemsetD2D32_v2_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned int ui; + size_t Width; + size_t Height; +} cuMemsetD2D32_v2_params; + +typedef struct cuMemcpy_params_st { + CUdeviceptr dst; + CUdeviceptr src; + size_t ByteCount; +} cuMemcpy_params; + +typedef struct cuMemcpyAsync_params_st { + CUdeviceptr dst; + CUdeviceptr src; + size_t ByteCount; + CUstream hStream; +} cuMemcpyAsync_params; + +typedef struct cuMemcpyPeer_params_st { + CUdeviceptr dstDevice; + CUcontext dstContext; + CUdeviceptr srcDevice; + CUcontext srcContext; + size_t ByteCount; +} cuMemcpyPeer_params; + +typedef struct cuMemcpyPeerAsync_params_st { + CUdeviceptr dstDevice; + CUcontext dstContext; + CUdeviceptr srcDevice; + CUcontext srcContext; + size_t ByteCount; + CUstream hStream; +} cuMemcpyPeerAsync_params; + +typedef struct cuMemcpy3DPeer_params_st { + const CUDA_MEMCPY3D_PEER *pCopy; +} cuMemcpy3DPeer_params; + +typedef struct cuMemcpy3DPeerAsync_params_st { + const CUDA_MEMCPY3D_PEER *pCopy; + CUstream hStream; +} cuMemcpy3DPeerAsync_params; + +typedef struct cuMemsetD8Async_params_st { + CUdeviceptr dstDevice; + unsigned char uc; + size_t N; + CUstream hStream; +} cuMemsetD8Async_params; + +typedef struct cuMemsetD16Async_params_st { + CUdeviceptr dstDevice; + unsigned short us; + size_t N; + CUstream hStream; +} cuMemsetD16Async_params; + +typedef struct cuMemsetD32Async_params_st { + CUdeviceptr dstDevice; + unsigned int ui; + size_t N; + CUstream hStream; +} cuMemsetD32Async_params; + +typedef struct cuMemsetD2D8Async_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned char uc; + size_t Width; + size_t Height; + CUstream hStream; +} cuMemsetD2D8Async_params; + +typedef struct cuMemsetD2D16Async_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned short us; + size_t Width; + size_t Height; + CUstream hStream; +} cuMemsetD2D16Async_params; + +typedef struct cuMemsetD2D32Async_params_st { + CUdeviceptr dstDevice; + size_t dstPitch; + unsigned int ui; + size_t Width; + size_t Height; + CUstream hStream; +} cuMemsetD2D32Async_params; + +typedef struct cuStreamGetPriority_params_st { + CUstream hStream; + int *priority; +} cuStreamGetPriority_params; + +typedef struct cuStreamGetFlags_params_st { + CUstream hStream; + unsigned int *flags; +} cuStreamGetFlags_params; + +typedef struct cuStreamGetCtx_params_st { + CUstream hStream; + CUcontext *pctx; +} cuStreamGetCtx_params; + +typedef struct cuStreamWaitEvent_params_st { + CUstream hStream; + CUevent hEvent; + unsigned int Flags; +} cuStreamWaitEvent_params; + +typedef struct cuStreamAddCallback_params_st { + CUstream hStream; + CUstreamCallback callback; + void *userData; + unsigned int flags; +} cuStreamAddCallback_params; + +typedef struct cuStreamAttachMemAsync_params_st { + CUstream hStream; + CUdeviceptr dptr; + size_t length; + unsigned int flags; +} cuStreamAttachMemAsync_params; + +typedef struct cuStreamQuery_params_st { + CUstream hStream; +} cuStreamQuery_params; + +typedef struct cuStreamSynchronize_params_st { + CUstream hStream; +} cuStreamSynchronize_params; + +typedef struct cuEventRecord_params_st { + CUevent hEvent; + CUstream hStream; +} cuEventRecord_params; + +typedef struct cuEventRecordWithFlags_params_st { + CUevent hEvent; + CUstream hStream; + unsigned int flags; +} cuEventRecordWithFlags_params; + +typedef struct cuLaunchKernel_params_st { + CUfunction f; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; + void **extra; +} cuLaunchKernel_params; + +typedef struct cuLaunchKernelEx_params_st { + const CUlaunchConfig *config; + CUfunction f; + void **kernelParams; + void **extra; +} cuLaunchKernelEx_params; + +typedef struct cuLaunchHostFunc_params_st { + CUstream hStream; + CUhostFn fn; + void *userData; +} cuLaunchHostFunc_params; + +typedef struct cuGraphicsMapResources_params_st { + unsigned int count; + CUgraphicsResource *resources; + CUstream hStream; +} cuGraphicsMapResources_params; + +typedef struct cuGraphicsUnmapResources_params_st { + unsigned int count; + CUgraphicsResource *resources; + CUstream hStream; +} cuGraphicsUnmapResources_params; + +typedef struct cuStreamWriteValue32_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint32_t value; + unsigned int flags; +} cuStreamWriteValue32_params; + +typedef struct cuStreamWaitValue32_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint32_t value; + unsigned int flags; +} cuStreamWaitValue32_params; + +typedef struct cuStreamWriteValue64_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint64_t value; + unsigned int flags; +} cuStreamWriteValue64_params; + +typedef struct cuStreamWaitValue64_params_st { + CUstream stream; + CUdeviceptr addr; + cuuint64_t value; + unsigned int flags; +} cuStreamWaitValue64_params; + +typedef struct cuStreamBatchMemOp_params_st { + CUstream stream; + unsigned int count; + CUstreamBatchMemOpParams *paramArray; + unsigned int flags; +} cuStreamBatchMemOp_params; + +typedef struct cuMemPrefetchAsync_params_st { + CUdeviceptr devPtr; + size_t count; + CUdevice dstDevice; + CUstream hStream; +} cuMemPrefetchAsync_params; + +typedef struct cuLaunchCooperativeKernel_params_st { + CUfunction f; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; +} cuLaunchCooperativeKernel_params; + +typedef struct cuSignalExternalSemaphoresAsync_params_st { + const CUexternalSemaphore *extSemArray; + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray; + unsigned int numExtSems; + CUstream stream; +} cuSignalExternalSemaphoresAsync_params; + +typedef struct cuWaitExternalSemaphoresAsync_params_st { + const CUexternalSemaphore *extSemArray; + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray; + unsigned int numExtSems; + CUstream stream; +} cuWaitExternalSemaphoresAsync_params; + +typedef struct cuStreamBeginCapture_params_st { + CUstream hStream; +} cuStreamBeginCapture_params; + +typedef struct cuStreamBeginCapture_ptsz_params_st { + CUstream hStream; +} cuStreamBeginCapture_ptsz_params; + +typedef struct cuStreamBeginCapture_v2_params_st { + CUstream hStream; + CUstreamCaptureMode mode; +} cuStreamBeginCapture_v2_params; + +typedef struct cuStreamEndCapture_params_st { + CUstream hStream; + CUgraph *phGraph; +} cuStreamEndCapture_params; + +typedef struct cuStreamIsCapturing_params_st { + CUstream hStream; + CUstreamCaptureStatus *captureStatus; +} cuStreamIsCapturing_params; + +typedef struct cuStreamGetCaptureInfo_params_st { + CUstream hStream; + CUstreamCaptureStatus *captureStatus_out; + cuuint64_t *id_out; +} cuStreamGetCaptureInfo_params; + +typedef struct cuStreamGetCaptureInfo_v2_params_st { + CUstream hStream; + CUstreamCaptureStatus *captureStatus_out; + cuuint64_t *id_out; + CUgraph *graph_out; + const CUgraphNode **dependencies_out; + size_t *numDependencies_out; +} cuStreamGetCaptureInfo_v2_params; + +typedef struct cuGraphUpload_params_st { + CUgraphExec hGraph; + CUstream hStream; +} cuGraphUpload_params; + +typedef struct cuGraphLaunch_params_st { + CUgraphExec hGraph; + CUstream hStream; +} cuGraphLaunch_params; + +typedef struct cuStreamCopyAttributes_params_st { + CUstream dstStream; + CUstream srcStream; +} cuStreamCopyAttributes_params; + +typedef struct cuStreamGetAttribute_params_st { + CUstream hStream; + CUstreamAttrID attr; + CUstreamAttrValue *value; +} cuStreamGetAttribute_params; + +typedef struct cuStreamSetAttribute_params_st { + CUstream hStream; + CUstreamAttrID attr; + const CUstreamAttrValue *param; +} cuStreamSetAttribute_params; + +typedef struct cuIpcOpenMemHandle_params_st { + CUdeviceptr *pdptr; + CUipcMemHandle handle; + unsigned int Flags; +} cuIpcOpenMemHandle_params; + +typedef struct cuGraphInstantiate_params_st { + CUgraphExec *phGraphExec; + CUgraph hGraph; + CUgraphNode *phErrorNode; + char *logBuffer; + size_t bufferSize; +} cuGraphInstantiate_params; + +typedef struct cuMemMapArrayAsync_params_st { + CUarrayMapInfo *mapInfoList; + unsigned int count; + CUstream hStream; +} cuMemMapArrayAsync_params; + +typedef struct cuMemFreeAsync_params_st { + CUdeviceptr dptr; + CUstream hStream; +} cuMemFreeAsync_params; + +typedef struct cuMemAllocAsync_params_st { + CUdeviceptr *dptr; + size_t bytesize; + CUstream hStream; +} cuMemAllocAsync_params; + +typedef struct cuMemAllocFromPoolAsync_params_st { + CUdeviceptr *dptr; + size_t bytesize; + CUmemoryPool pool; + CUstream hStream; +} cuMemAllocFromPoolAsync_params; + +typedef struct cuStreamUpdateCaptureDependencies_params_st { + CUstream hStream; + CUgraphNode *dependencies; + size_t numDependencies; + unsigned int flags; +} cuStreamUpdateCaptureDependencies_params; diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..69cea3735f87455a668bae914f86b06deb157106 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h @@ -0,0 +1,2139 @@ +// This file is generated. Any changes you make will be lost during the next clean build. + +// CUDA public interface, for type definitions and api function prototypes +#include "cuda_runtime_api.h" + +// ************************************************************************* +// Definitions of structs to hold parameters for each function +// ************************************************************************* + +// Currently used parameter trace structures +typedef struct cudaDeviceSetLimit_v3020_params_st { + enum cudaLimit limit; + size_t value; +} cudaDeviceSetLimit_v3020_params; + +typedef struct cudaDeviceGetLimit_v3020_params_st { + size_t *pValue; + enum cudaLimit limit; +} cudaDeviceGetLimit_v3020_params; + +typedef struct cudaDeviceGetTexture1DLinearMaxWidth_v11010_params_st { + size_t *maxWidthInElements; + const struct cudaChannelFormatDesc *fmtDesc; + int device; +} cudaDeviceGetTexture1DLinearMaxWidth_v11010_params; + +typedef struct cudaDeviceGetCacheConfig_v3020_params_st { + enum cudaFuncCache *pCacheConfig; +} cudaDeviceGetCacheConfig_v3020_params; + +typedef struct cudaDeviceGetStreamPriorityRange_v5050_params_st { + int *leastPriority; + int *greatestPriority; +} cudaDeviceGetStreamPriorityRange_v5050_params; + +typedef struct cudaDeviceSetCacheConfig_v3020_params_st { + enum cudaFuncCache cacheConfig; +} cudaDeviceSetCacheConfig_v3020_params; + +typedef struct cudaDeviceGetSharedMemConfig_v4020_params_st { + enum cudaSharedMemConfig *pConfig; +} cudaDeviceGetSharedMemConfig_v4020_params; + +typedef struct cudaDeviceSetSharedMemConfig_v4020_params_st { + enum cudaSharedMemConfig config; +} cudaDeviceSetSharedMemConfig_v4020_params; + +typedef struct cudaDeviceGetByPCIBusId_v4010_params_st { + int *device; + const char *pciBusId; +} cudaDeviceGetByPCIBusId_v4010_params; + +typedef struct cudaDeviceGetPCIBusId_v4010_params_st { + char *pciBusId; + int len; + int device; +} cudaDeviceGetPCIBusId_v4010_params; + +typedef struct cudaIpcGetEventHandle_v4010_params_st { + cudaIpcEventHandle_t *handle; + cudaEvent_t event; +} cudaIpcGetEventHandle_v4010_params; + +typedef struct cudaIpcOpenEventHandle_v4010_params_st { + cudaEvent_t *event; + cudaIpcEventHandle_t handle; +} cudaIpcOpenEventHandle_v4010_params; + +typedef struct cudaIpcGetMemHandle_v4010_params_st { + cudaIpcMemHandle_t *handle; + void *devPtr; +} cudaIpcGetMemHandle_v4010_params; + +typedef struct cudaIpcOpenMemHandle_v4010_params_st { + void **devPtr; + cudaIpcMemHandle_t handle; + unsigned int flags; +} cudaIpcOpenMemHandle_v4010_params; + +typedef struct cudaIpcCloseMemHandle_v4010_params_st { + void *devPtr; +} cudaIpcCloseMemHandle_v4010_params; + +typedef struct cudaDeviceFlushGPUDirectRDMAWrites_v11030_params_st { + enum cudaFlushGPUDirectRDMAWritesTarget target; + enum cudaFlushGPUDirectRDMAWritesScope scope; +} cudaDeviceFlushGPUDirectRDMAWrites_v11030_params; + +typedef struct cudaGetErrorName_v6050_params_st { + cudaError_t error; +} cudaGetErrorName_v6050_params; + +typedef struct cudaGetErrorString_v3020_params_st { + cudaError_t error; +} cudaGetErrorString_v3020_params; + +typedef struct cudaGetDeviceCount_v3020_params_st { + int *count; +} cudaGetDeviceCount_v3020_params; + +typedef struct cudaGetDeviceProperties_v3020_params_st { + struct cudaDeviceProp *prop; + int device; +} cudaGetDeviceProperties_v3020_params; + +typedef struct cudaDeviceGetAttribute_v5000_params_st { + int *value; + enum cudaDeviceAttr attr; + int device; +} cudaDeviceGetAttribute_v5000_params; + +typedef struct cudaDeviceGetDefaultMemPool_v11020_params_st { + cudaMemPool_t *memPool; + int device; +} cudaDeviceGetDefaultMemPool_v11020_params; + +typedef struct cudaDeviceSetMemPool_v11020_params_st { + int device; + cudaMemPool_t memPool; +} cudaDeviceSetMemPool_v11020_params; + +typedef struct cudaDeviceGetMemPool_v11020_params_st { + cudaMemPool_t *memPool; + int device; +} cudaDeviceGetMemPool_v11020_params; + +typedef struct cudaDeviceGetNvSciSyncAttributes_v10020_params_st { + void *nvSciSyncAttrList; + int device; + int flags; +} cudaDeviceGetNvSciSyncAttributes_v10020_params; + +typedef struct cudaDeviceGetP2PAttribute_v8000_params_st { + int *value; + enum cudaDeviceP2PAttr attr; + int srcDevice; + int dstDevice; +} cudaDeviceGetP2PAttribute_v8000_params; + +typedef struct cudaChooseDevice_v3020_params_st { + int *device; + const struct cudaDeviceProp *prop; +} cudaChooseDevice_v3020_params; + +typedef struct cudaSetDevice_v3020_params_st { + int device; +} cudaSetDevice_v3020_params; + +typedef struct cudaGetDevice_v3020_params_st { + int *device; +} cudaGetDevice_v3020_params; + +typedef struct cudaSetValidDevices_v3020_params_st { + int *device_arr; + int len; +} cudaSetValidDevices_v3020_params; + +typedef struct cudaSetDeviceFlags_v3020_params_st { + unsigned int flags; +} cudaSetDeviceFlags_v3020_params; + +typedef struct cudaGetDeviceFlags_v7000_params_st { + unsigned int *flags; +} cudaGetDeviceFlags_v7000_params; + +typedef struct cudaStreamCreate_v3020_params_st { + cudaStream_t *pStream; +} cudaStreamCreate_v3020_params; + +typedef struct cudaStreamCreateWithFlags_v5000_params_st { + cudaStream_t *pStream; + unsigned int flags; +} cudaStreamCreateWithFlags_v5000_params; + +typedef struct cudaStreamCreateWithPriority_v5050_params_st { + cudaStream_t *pStream; + unsigned int flags; + int priority; +} cudaStreamCreateWithPriority_v5050_params; + +typedef struct cudaStreamGetPriority_ptsz_v7000_params_st { + cudaStream_t hStream; + int *priority; +} cudaStreamGetPriority_ptsz_v7000_params; + +typedef struct cudaStreamGetFlags_ptsz_v7000_params_st { + cudaStream_t hStream; + unsigned int *flags; +} cudaStreamGetFlags_ptsz_v7000_params; + +typedef struct cudaStreamCopyAttributes_ptsz_v11000_params_st { + cudaStream_t dst; + cudaStream_t src; +} cudaStreamCopyAttributes_ptsz_v11000_params; + +typedef struct cudaStreamGetAttribute_ptsz_v11000_params_st { + cudaStream_t hStream; + cudaStreamAttrID attr; + cudaStreamAttrValue *value_out; +} cudaStreamGetAttribute_ptsz_v11000_params; + +typedef struct cudaStreamSetAttribute_ptsz_v11000_params_st { + cudaStream_t hStream; + cudaStreamAttrID attr; + const cudaStreamAttrValue *value; +} cudaStreamSetAttribute_ptsz_v11000_params; + +typedef struct cudaStreamDestroy_v5050_params_st { + cudaStream_t stream; +} cudaStreamDestroy_v5050_params; + +typedef struct cudaStreamWaitEvent_ptsz_v7000_params_st { + cudaStream_t stream; + cudaEvent_t event; + unsigned int flags; +} cudaStreamWaitEvent_ptsz_v7000_params; + +typedef struct cudaStreamAddCallback_ptsz_v7000_params_st { + cudaStream_t stream; + cudaStreamCallback_t callback; + void *userData; + unsigned int flags; +} cudaStreamAddCallback_ptsz_v7000_params; + +typedef struct cudaStreamSynchronize_ptsz_v7000_params_st { + cudaStream_t stream; +} cudaStreamSynchronize_ptsz_v7000_params; + +typedef struct cudaStreamQuery_ptsz_v7000_params_st { + cudaStream_t stream; +} cudaStreamQuery_ptsz_v7000_params; + +typedef struct cudaStreamAttachMemAsync_ptsz_v7000_params_st { + cudaStream_t stream; + void *devPtr; + size_t length; + unsigned int flags; +} cudaStreamAttachMemAsync_ptsz_v7000_params; + +typedef struct cudaStreamBeginCapture_ptsz_v10000_params_st { + cudaStream_t stream; + enum cudaStreamCaptureMode mode; +} cudaStreamBeginCapture_ptsz_v10000_params; + +typedef struct cudaThreadExchangeStreamCaptureMode_v10010_params_st { + enum cudaStreamCaptureMode *mode; +} cudaThreadExchangeStreamCaptureMode_v10010_params; + +typedef struct cudaStreamEndCapture_ptsz_v10000_params_st { + cudaStream_t stream; + cudaGraph_t *pGraph; +} cudaStreamEndCapture_ptsz_v10000_params; + +typedef struct cudaStreamIsCapturing_ptsz_v10000_params_st { + cudaStream_t stream; + enum cudaStreamCaptureStatus *pCaptureStatus; +} cudaStreamIsCapturing_ptsz_v10000_params; + +typedef struct cudaStreamGetCaptureInfo_ptsz_v10010_params_st { + cudaStream_t stream; + enum cudaStreamCaptureStatus *pCaptureStatus; + unsigned long long *pId; +} cudaStreamGetCaptureInfo_ptsz_v10010_params; + +typedef struct cudaStreamGetCaptureInfo_v2_ptsz_v11030_params_st { + cudaStream_t stream; + enum cudaStreamCaptureStatus *captureStatus_out; + unsigned long long *id_out; + cudaGraph_t *graph_out; + const cudaGraphNode_t **dependencies_out; + size_t *numDependencies_out; +} cudaStreamGetCaptureInfo_v2_ptsz_v11030_params; + +typedef struct cudaStreamUpdateCaptureDependencies_v11030_params_st { + cudaStream_t stream; + cudaGraphNode_t *dependencies; + size_t numDependencies; + unsigned int flags; +} cudaStreamUpdateCaptureDependencies_v11030_params; + +typedef struct cudaEventCreate_v3020_params_st { + cudaEvent_t *event; +} cudaEventCreate_v3020_params; + +typedef struct cudaEventCreateWithFlags_v3020_params_st { + cudaEvent_t *event; + unsigned int flags; +} cudaEventCreateWithFlags_v3020_params; + +typedef struct cudaEventRecord_ptsz_v7000_params_st { + cudaEvent_t event; + cudaStream_t stream; +} cudaEventRecord_ptsz_v7000_params; + +typedef struct cudaEventRecordWithFlags_ptsz_v11010_params_st { + cudaEvent_t event; + cudaStream_t stream; + unsigned int flags; +} cudaEventRecordWithFlags_ptsz_v11010_params; + +typedef struct cudaEventQuery_v3020_params_st { + cudaEvent_t event; +} cudaEventQuery_v3020_params; + +typedef struct cudaEventSynchronize_v3020_params_st { + cudaEvent_t event; +} cudaEventSynchronize_v3020_params; + +typedef struct cudaEventDestroy_v3020_params_st { + cudaEvent_t event; +} cudaEventDestroy_v3020_params; + +typedef struct cudaEventElapsedTime_v3020_params_st { + float *ms; + cudaEvent_t start; + cudaEvent_t end; +} cudaEventElapsedTime_v3020_params; + +typedef struct cudaImportExternalMemory_v10000_params_st { + cudaExternalMemory_t *extMem_out; + const struct cudaExternalMemoryHandleDesc *memHandleDesc; +} cudaImportExternalMemory_v10000_params; + +typedef struct cudaExternalMemoryGetMappedBuffer_v10000_params_st { + void **devPtr; + cudaExternalMemory_t extMem; + const struct cudaExternalMemoryBufferDesc *bufferDesc; +} cudaExternalMemoryGetMappedBuffer_v10000_params; + +typedef struct cudaExternalMemoryGetMappedMipmappedArray_v10000_params_st { + cudaMipmappedArray_t *mipmap; + cudaExternalMemory_t extMem; + const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc; +} cudaExternalMemoryGetMappedMipmappedArray_v10000_params; + +typedef struct cudaDestroyExternalMemory_v10000_params_st { + cudaExternalMemory_t extMem; +} cudaDestroyExternalMemory_v10000_params; + +typedef struct cudaImportExternalSemaphore_v10000_params_st { + cudaExternalSemaphore_t *extSem_out; + const struct cudaExternalSemaphoreHandleDesc *semHandleDesc; +} cudaImportExternalSemaphore_v10000_params; + +typedef struct cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreSignalParams *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params; + +typedef struct cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreWaitParams *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params; + +typedef struct cudaDestroyExternalSemaphore_v10000_params_st { + cudaExternalSemaphore_t extSem; +} cudaDestroyExternalSemaphore_v10000_params; + +typedef struct cudaLaunchKernel_ptsz_v7000_params_st { + const void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +} cudaLaunchKernel_ptsz_v7000_params; + +typedef struct cudaLaunchKernelExC_ptsz_v11060_params_st { + const cudaLaunchConfig_t *config; + const void *func; + void **args; +} cudaLaunchKernelExC_ptsz_v11060_params; + +typedef struct cudaLaunchCooperativeKernel_ptsz_v9000_params_st { + const void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +} cudaLaunchCooperativeKernel_ptsz_v9000_params; + +typedef struct cudaLaunchCooperativeKernelMultiDevice_v9000_params_st { + struct cudaLaunchParams *launchParamsList; + unsigned int numDevices; + unsigned int flags; +} cudaLaunchCooperativeKernelMultiDevice_v9000_params; + +typedef struct cudaFuncSetCacheConfig_v3020_params_st { + const void *func; + enum cudaFuncCache cacheConfig; +} cudaFuncSetCacheConfig_v3020_params; + +typedef struct cudaFuncSetSharedMemConfig_v4020_params_st { + const void *func; + enum cudaSharedMemConfig config; +} cudaFuncSetSharedMemConfig_v4020_params; + +typedef struct cudaFuncGetAttributes_v3020_params_st { + struct cudaFuncAttributes *attr; + const void *func; +} cudaFuncGetAttributes_v3020_params; + +typedef struct cudaFuncSetAttribute_v9000_params_st { + const void *func; + enum cudaFuncAttribute attr; + int value; +} cudaFuncSetAttribute_v9000_params; + +typedef struct cudaLaunchHostFunc_ptsz_v10000_params_st { + cudaStream_t stream; + cudaHostFn_t fn; + void *userData; +} cudaLaunchHostFunc_ptsz_v10000_params; + +typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params_st { + int *numBlocks; + const void *func; + int blockSize; + size_t dynamicSMemSize; +} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params; + +typedef struct cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params_st { + size_t *dynamicSmemSize; + const void *func; + int numBlocks; + int blockSize; +} cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params; + +typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params_st { + int *numBlocks; + const void *func; + int blockSize; + size_t dynamicSMemSize; + unsigned int flags; +} cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params; + +typedef struct cudaOccupancyMaxPotentialClusterSize_v11070_params_st { + int *clusterSize; + const void *func; + const cudaLaunchConfig_t *launchConfig; +} cudaOccupancyMaxPotentialClusterSize_v11070_params; + +typedef struct cudaOccupancyMaxActiveClusters_v11070_params_st { + int *numClusters; + const void *func; + const cudaLaunchConfig_t *launchConfig; +} cudaOccupancyMaxActiveClusters_v11070_params; + +typedef struct cudaMallocManaged_v6000_params_st { + void **devPtr; + size_t size; + unsigned int flags; +} cudaMallocManaged_v6000_params; + +typedef struct cudaMalloc_v3020_params_st { + void **devPtr; + size_t size; +} cudaMalloc_v3020_params; + +typedef struct cudaMallocHost_v3020_params_st { + void **ptr; + size_t size; +} cudaMallocHost_v3020_params; + +typedef struct cudaMallocPitch_v3020_params_st { + void **devPtr; + size_t *pitch; + size_t width; + size_t height; +} cudaMallocPitch_v3020_params; + +typedef struct cudaMallocArray_v3020_params_st { + cudaArray_t *array; + const struct cudaChannelFormatDesc *desc; + size_t width; + size_t height; + unsigned int flags; +} cudaMallocArray_v3020_params; + +typedef struct cudaFree_v3020_params_st { + void *devPtr; +} cudaFree_v3020_params; + +typedef struct cudaFreeHost_v3020_params_st { + void *ptr; +} cudaFreeHost_v3020_params; + +typedef struct cudaFreeArray_v3020_params_st { + cudaArray_t array; +} cudaFreeArray_v3020_params; + +typedef struct cudaFreeMipmappedArray_v5000_params_st { + cudaMipmappedArray_t mipmappedArray; +} cudaFreeMipmappedArray_v5000_params; + +typedef struct cudaHostAlloc_v3020_params_st { + void **pHost; + size_t size; + unsigned int flags; +} cudaHostAlloc_v3020_params; + +typedef struct cudaHostRegister_v4000_params_st { + void *ptr; + size_t size; + unsigned int flags; +} cudaHostRegister_v4000_params; + +typedef struct cudaHostUnregister_v4000_params_st { + void *ptr; +} cudaHostUnregister_v4000_params; + +typedef struct cudaHostGetDevicePointer_v3020_params_st { + void **pDevice; + void *pHost; + unsigned int flags; +} cudaHostGetDevicePointer_v3020_params; + +typedef struct cudaHostGetFlags_v3020_params_st { + unsigned int *pFlags; + void *pHost; +} cudaHostGetFlags_v3020_params; + +typedef struct cudaMalloc3D_v3020_params_st { + struct cudaPitchedPtr *pitchedDevPtr; + struct cudaExtent extent; +} cudaMalloc3D_v3020_params; + +typedef struct cudaMalloc3DArray_v3020_params_st { + cudaArray_t *array; + const struct cudaChannelFormatDesc *desc; + struct cudaExtent extent; + unsigned int flags; +} cudaMalloc3DArray_v3020_params; + +typedef struct cudaMallocMipmappedArray_v5000_params_st { + cudaMipmappedArray_t *mipmappedArray; + const struct cudaChannelFormatDesc *desc; + struct cudaExtent extent; + unsigned int numLevels; + unsigned int flags; +} cudaMallocMipmappedArray_v5000_params; + +typedef struct cudaGetMipmappedArrayLevel_v5000_params_st { + cudaArray_t *levelArray; + cudaMipmappedArray_const_t mipmappedArray; + unsigned int level; +} cudaGetMipmappedArrayLevel_v5000_params; + +typedef struct cudaMemcpy3D_ptds_v7000_params_st { + const struct cudaMemcpy3DParms *p; +} cudaMemcpy3D_ptds_v7000_params; + +typedef struct cudaMemcpy3DPeer_ptds_v7000_params_st { + const struct cudaMemcpy3DPeerParms *p; +} cudaMemcpy3DPeer_ptds_v7000_params; + +typedef struct cudaMemcpy3DAsync_ptsz_v7000_params_st { + const struct cudaMemcpy3DParms *p; + cudaStream_t stream; +} cudaMemcpy3DAsync_ptsz_v7000_params; + +typedef struct cudaMemcpy3DPeerAsync_ptsz_v7000_params_st { + const struct cudaMemcpy3DPeerParms *p; + cudaStream_t stream; +} cudaMemcpy3DPeerAsync_ptsz_v7000_params; + +typedef struct cudaMemGetInfo_v3020_params_st { + size_t *free; + size_t *total; +} cudaMemGetInfo_v3020_params; + +typedef struct cudaArrayGetInfo_v4010_params_st { + struct cudaChannelFormatDesc *desc; + struct cudaExtent *extent; + unsigned int *flags; + cudaArray_t array; +} cudaArrayGetInfo_v4010_params; + +typedef struct cudaArrayGetPlane_v11020_params_st { + cudaArray_t *pPlaneArray; + cudaArray_t hArray; + unsigned int planeIdx; +} cudaArrayGetPlane_v11020_params; + +typedef struct cudaArrayGetMemoryRequirements_v11060_params_st { + struct cudaArrayMemoryRequirements *memoryRequirements; + cudaArray_t array; + int device; +} cudaArrayGetMemoryRequirements_v11060_params; + +typedef struct cudaMipmappedArrayGetMemoryRequirements_v11060_params_st { + struct cudaArrayMemoryRequirements *memoryRequirements; + cudaMipmappedArray_t mipmap; + int device; +} cudaMipmappedArrayGetMemoryRequirements_v11060_params; + +typedef struct cudaArrayGetSparseProperties_v11010_params_st { + struct cudaArraySparseProperties *sparseProperties; + cudaArray_t array; +} cudaArrayGetSparseProperties_v11010_params; + +typedef struct cudaMipmappedArrayGetSparseProperties_v11010_params_st { + struct cudaArraySparseProperties *sparseProperties; + cudaMipmappedArray_t mipmap; +} cudaMipmappedArrayGetSparseProperties_v11010_params; + +typedef struct cudaMemcpy_ptds_v7000_params_st { + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpy_ptds_v7000_params; + +typedef struct cudaMemcpyPeer_v4000_params_st { + void *dst; + int dstDevice; + const void *src; + int srcDevice; + size_t count; +} cudaMemcpyPeer_v4000_params; + +typedef struct cudaMemcpy2D_ptds_v7000_params_st { + void *dst; + size_t dpitch; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2D_ptds_v7000_params; + +typedef struct cudaMemcpy2DToArray_ptds_v7000_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2DToArray_ptds_v7000_params; + +typedef struct cudaMemcpy2DFromArray_ptds_v7000_params_st { + void *dst; + size_t dpitch; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2DFromArray_ptds_v7000_params; + +typedef struct cudaMemcpy2DArrayToArray_ptds_v7000_params_st { + cudaArray_t dst; + size_t wOffsetDst; + size_t hOffsetDst; + cudaArray_const_t src; + size_t wOffsetSrc; + size_t hOffsetSrc; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2DArrayToArray_ptds_v7000_params; + +typedef struct cudaMemcpyToSymbol_ptds_v7000_params_st { + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaMemcpyToSymbol_ptds_v7000_params; + +typedef struct cudaMemcpyFromSymbol_ptds_v7000_params_st { + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaMemcpyFromSymbol_ptds_v7000_params; + +typedef struct cudaMemcpyAsync_ptsz_v7000_params_st { + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyAsync_ptsz_v7000_params; + +typedef struct cudaMemcpyPeerAsync_v4000_params_st { + void *dst; + int dstDevice; + const void *src; + int srcDevice; + size_t count; + cudaStream_t stream; +} cudaMemcpyPeerAsync_v4000_params; + +typedef struct cudaMemcpy2DAsync_ptsz_v7000_params_st { + void *dst; + size_t dpitch; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpy2DAsync_ptsz_v7000_params; + +typedef struct cudaMemcpy2DToArrayAsync_ptsz_v7000_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpy2DToArrayAsync_ptsz_v7000_params; + +typedef struct cudaMemcpy2DFromArrayAsync_ptsz_v7000_params_st { + void *dst; + size_t dpitch; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t width; + size_t height; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpy2DFromArrayAsync_ptsz_v7000_params; + +typedef struct cudaMemcpyToSymbolAsync_ptsz_v7000_params_st { + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyToSymbolAsync_ptsz_v7000_params; + +typedef struct cudaMemcpyFromSymbolAsync_ptsz_v7000_params_st { + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyFromSymbolAsync_ptsz_v7000_params; + +typedef struct cudaMemset_ptds_v7000_params_st { + void *devPtr; + int value; + size_t count; +} cudaMemset_ptds_v7000_params; + +typedef struct cudaMemset2D_ptds_v7000_params_st { + void *devPtr; + size_t pitch; + int value; + size_t width; + size_t height; +} cudaMemset2D_ptds_v7000_params; + +typedef struct cudaMemset3D_ptds_v7000_params_st { + struct cudaPitchedPtr pitchedDevPtr; + int value; + struct cudaExtent extent; +} cudaMemset3D_ptds_v7000_params; + +typedef struct cudaMemsetAsync_ptsz_v7000_params_st { + void *devPtr; + int value; + size_t count; + cudaStream_t stream; +} cudaMemsetAsync_ptsz_v7000_params; + +typedef struct cudaMemset2DAsync_ptsz_v7000_params_st { + void *devPtr; + size_t pitch; + int value; + size_t width; + size_t height; + cudaStream_t stream; +} cudaMemset2DAsync_ptsz_v7000_params; + +typedef struct cudaMemset3DAsync_ptsz_v7000_params_st { + struct cudaPitchedPtr pitchedDevPtr; + int value; + struct cudaExtent extent; + cudaStream_t stream; +} cudaMemset3DAsync_ptsz_v7000_params; + +typedef struct cudaGetSymbolAddress_v3020_params_st { + void **devPtr; + const void *symbol; +} cudaGetSymbolAddress_v3020_params; + +typedef struct cudaGetSymbolSize_v3020_params_st { + size_t *size; + const void *symbol; +} cudaGetSymbolSize_v3020_params; + +typedef struct cudaMemPrefetchAsync_ptsz_v8000_params_st { + const void *devPtr; + size_t count; + int dstDevice; + cudaStream_t stream; +} cudaMemPrefetchAsync_ptsz_v8000_params; + +typedef struct cudaMemAdvise_v8000_params_st { + const void *devPtr; + size_t count; + enum cudaMemoryAdvise advice; + int device; +} cudaMemAdvise_v8000_params; + +typedef struct cudaMemRangeGetAttribute_v8000_params_st { + void *data; + size_t dataSize; + enum cudaMemRangeAttribute attribute; + const void *devPtr; + size_t count; +} cudaMemRangeGetAttribute_v8000_params; + +typedef struct cudaMemRangeGetAttributes_v8000_params_st { + void **data; + size_t *dataSizes; + enum cudaMemRangeAttribute *attributes; + size_t numAttributes; + const void *devPtr; + size_t count; +} cudaMemRangeGetAttributes_v8000_params; + +typedef struct cudaMemcpyToArray_ptds_v7000_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpyToArray_ptds_v7000_params; + +typedef struct cudaMemcpyFromArray_ptds_v7000_params_st { + void *dst; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpyFromArray_ptds_v7000_params; + +typedef struct cudaMemcpyArrayToArray_ptds_v7000_params_st { + cudaArray_t dst; + size_t wOffsetDst; + size_t hOffsetDst; + cudaArray_const_t src; + size_t wOffsetSrc; + size_t hOffsetSrc; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpyArrayToArray_ptds_v7000_params; + +typedef struct cudaMemcpyToArrayAsync_ptsz_v7000_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t count; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyToArrayAsync_ptsz_v7000_params; + +typedef struct cudaMemcpyFromArrayAsync_ptsz_v7000_params_st { + void *dst; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t count; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyFromArrayAsync_ptsz_v7000_params; + +typedef struct cudaMallocAsync_ptsz_v11020_params_st { + void **devPtr; + size_t size; + cudaStream_t hStream; +} cudaMallocAsync_ptsz_v11020_params; + +typedef struct cudaFreeAsync_ptsz_v11020_params_st { + void *devPtr; + cudaStream_t hStream; +} cudaFreeAsync_ptsz_v11020_params; + +typedef struct cudaMemPoolTrimTo_v11020_params_st { + cudaMemPool_t memPool; + size_t minBytesToKeep; +} cudaMemPoolTrimTo_v11020_params; + +typedef struct cudaMemPoolSetAttribute_v11020_params_st { + cudaMemPool_t memPool; + enum cudaMemPoolAttr attr; + void *value; +} cudaMemPoolSetAttribute_v11020_params; + +typedef struct cudaMemPoolGetAttribute_v11020_params_st { + cudaMemPool_t memPool; + enum cudaMemPoolAttr attr; + void *value; +} cudaMemPoolGetAttribute_v11020_params; + +typedef struct cudaMemPoolSetAccess_v11020_params_st { + cudaMemPool_t memPool; + const struct cudaMemAccessDesc *descList; + size_t count; +} cudaMemPoolSetAccess_v11020_params; + +typedef struct cudaMemPoolGetAccess_v11020_params_st { + enum cudaMemAccessFlags *flags; + cudaMemPool_t memPool; + struct cudaMemLocation *location; +} cudaMemPoolGetAccess_v11020_params; + +typedef struct cudaMemPoolCreate_v11020_params_st { + cudaMemPool_t *memPool; + const struct cudaMemPoolProps *poolProps; +} cudaMemPoolCreate_v11020_params; + +typedef struct cudaMemPoolDestroy_v11020_params_st { + cudaMemPool_t memPool; +} cudaMemPoolDestroy_v11020_params; + +typedef struct cudaMallocFromPoolAsync_ptsz_v11020_params_st { + void **ptr; + size_t size; + cudaMemPool_t memPool; + cudaStream_t stream; +} cudaMallocFromPoolAsync_ptsz_v11020_params; + +typedef struct cudaMemPoolExportToShareableHandle_v11020_params_st { + void *shareableHandle; + cudaMemPool_t memPool; + enum cudaMemAllocationHandleType handleType; + unsigned int flags; +} cudaMemPoolExportToShareableHandle_v11020_params; + +typedef struct cudaMemPoolImportFromShareableHandle_v11020_params_st { + cudaMemPool_t *memPool; + void *shareableHandle; + enum cudaMemAllocationHandleType handleType; + unsigned int flags; +} cudaMemPoolImportFromShareableHandle_v11020_params; + +typedef struct cudaMemPoolExportPointer_v11020_params_st { + struct cudaMemPoolPtrExportData *exportData; + void *ptr; +} cudaMemPoolExportPointer_v11020_params; + +typedef struct cudaMemPoolImportPointer_v11020_params_st { + void **ptr; + cudaMemPool_t memPool; + struct cudaMemPoolPtrExportData *exportData; +} cudaMemPoolImportPointer_v11020_params; + +typedef struct cudaPointerGetAttributes_v4000_params_st { + struct cudaPointerAttributes *attributes; + const void *ptr; +} cudaPointerGetAttributes_v4000_params; + +typedef struct cudaDeviceCanAccessPeer_v4000_params_st { + int *canAccessPeer; + int device; + int peerDevice; +} cudaDeviceCanAccessPeer_v4000_params; + +typedef struct cudaDeviceEnablePeerAccess_v4000_params_st { + int peerDevice; + unsigned int flags; +} cudaDeviceEnablePeerAccess_v4000_params; + +typedef struct cudaDeviceDisablePeerAccess_v4000_params_st { + int peerDevice; +} cudaDeviceDisablePeerAccess_v4000_params; + +typedef struct cudaGraphicsUnregisterResource_v3020_params_st { + cudaGraphicsResource_t resource; +} cudaGraphicsUnregisterResource_v3020_params; + +typedef struct cudaGraphicsResourceSetMapFlags_v3020_params_st { + cudaGraphicsResource_t resource; + unsigned int flags; +} cudaGraphicsResourceSetMapFlags_v3020_params; + +typedef struct cudaGraphicsMapResources_v3020_params_st { + int count; + cudaGraphicsResource_t *resources; + cudaStream_t stream; +} cudaGraphicsMapResources_v3020_params; + +typedef struct cudaGraphicsUnmapResources_v3020_params_st { + int count; + cudaGraphicsResource_t *resources; + cudaStream_t stream; +} cudaGraphicsUnmapResources_v3020_params; + +typedef struct cudaGraphicsResourceGetMappedPointer_v3020_params_st { + void **devPtr; + size_t *size; + cudaGraphicsResource_t resource; +} cudaGraphicsResourceGetMappedPointer_v3020_params; + +typedef struct cudaGraphicsSubResourceGetMappedArray_v3020_params_st { + cudaArray_t *array; + cudaGraphicsResource_t resource; + unsigned int arrayIndex; + unsigned int mipLevel; +} cudaGraphicsSubResourceGetMappedArray_v3020_params; + +typedef struct cudaGraphicsResourceGetMappedMipmappedArray_v5000_params_st { + cudaMipmappedArray_t *mipmappedArray; + cudaGraphicsResource_t resource; +} cudaGraphicsResourceGetMappedMipmappedArray_v5000_params; + +typedef struct cudaBindTexture_v3020_params_st { + size_t *offset; + const struct textureReference *texref; + const void *devPtr; + const struct cudaChannelFormatDesc *desc; + size_t size; +} cudaBindTexture_v3020_params; + +typedef struct cudaBindTexture2D_v3020_params_st { + size_t *offset; + const struct textureReference *texref; + const void *devPtr; + const struct cudaChannelFormatDesc *desc; + size_t width; + size_t height; + size_t pitch; +} cudaBindTexture2D_v3020_params; + +typedef struct cudaBindTextureToArray_v3020_params_st { + const struct textureReference *texref; + cudaArray_const_t array; + const struct cudaChannelFormatDesc *desc; +} cudaBindTextureToArray_v3020_params; + +typedef struct cudaBindTextureToMipmappedArray_v5000_params_st { + const struct textureReference *texref; + cudaMipmappedArray_const_t mipmappedArray; + const struct cudaChannelFormatDesc *desc; +} cudaBindTextureToMipmappedArray_v5000_params; + +typedef struct cudaUnbindTexture_v3020_params_st { + const struct textureReference *texref; +} cudaUnbindTexture_v3020_params; + +typedef struct cudaGetTextureAlignmentOffset_v3020_params_st { + size_t *offset; + const struct textureReference *texref; +} cudaGetTextureAlignmentOffset_v3020_params; + +typedef struct cudaGetTextureReference_v3020_params_st { + const struct textureReference **texref; + const void *symbol; +} cudaGetTextureReference_v3020_params; + +typedef struct cudaBindSurfaceToArray_v3020_params_st { + const struct surfaceReference *surfref; + cudaArray_const_t array; + const struct cudaChannelFormatDesc *desc; +} cudaBindSurfaceToArray_v3020_params; + +typedef struct cudaGetSurfaceReference_v3020_params_st { + const struct surfaceReference **surfref; + const void *symbol; +} cudaGetSurfaceReference_v3020_params; + +typedef struct cudaGetChannelDesc_v3020_params_st { + struct cudaChannelFormatDesc *desc; + cudaArray_const_t array; +} cudaGetChannelDesc_v3020_params; + +typedef struct cudaCreateChannelDesc_v3020_params_st { + int x; + int y; + int z; + int w; + enum cudaChannelFormatKind f; +} cudaCreateChannelDesc_v3020_params; + +typedef struct cudaCreateTextureObject_v5000_params_st { + cudaTextureObject_t *pTexObject; + const struct cudaResourceDesc *pResDesc; + const struct cudaTextureDesc *pTexDesc; + const struct cudaResourceViewDesc *pResViewDesc; +} cudaCreateTextureObject_v5000_params; + +typedef struct cudaDestroyTextureObject_v5000_params_st { + cudaTextureObject_t texObject; +} cudaDestroyTextureObject_v5000_params; + +typedef struct cudaGetTextureObjectResourceDesc_v5000_params_st { + struct cudaResourceDesc *pResDesc; + cudaTextureObject_t texObject; +} cudaGetTextureObjectResourceDesc_v5000_params; + +typedef struct cudaGetTextureObjectTextureDesc_v5000_params_st { + struct cudaTextureDesc *pTexDesc; + cudaTextureObject_t texObject; +} cudaGetTextureObjectTextureDesc_v5000_params; + +typedef struct cudaGetTextureObjectResourceViewDesc_v5000_params_st { + struct cudaResourceViewDesc *pResViewDesc; + cudaTextureObject_t texObject; +} cudaGetTextureObjectResourceViewDesc_v5000_params; + +typedef struct cudaCreateSurfaceObject_v5000_params_st { + cudaSurfaceObject_t *pSurfObject; + const struct cudaResourceDesc *pResDesc; +} cudaCreateSurfaceObject_v5000_params; + +typedef struct cudaDestroySurfaceObject_v5000_params_st { + cudaSurfaceObject_t surfObject; +} cudaDestroySurfaceObject_v5000_params; + +typedef struct cudaGetSurfaceObjectResourceDesc_v5000_params_st { + struct cudaResourceDesc *pResDesc; + cudaSurfaceObject_t surfObject; +} cudaGetSurfaceObjectResourceDesc_v5000_params; + +typedef struct cudaDriverGetVersion_v3020_params_st { + int *driverVersion; +} cudaDriverGetVersion_v3020_params; + +typedef struct cudaRuntimeGetVersion_v3020_params_st { + int *runtimeVersion; +} cudaRuntimeGetVersion_v3020_params; + +typedef struct cudaGraphCreate_v10000_params_st { + cudaGraph_t *pGraph; + unsigned int flags; +} cudaGraphCreate_v10000_params; + +typedef struct cudaGraphAddKernelNode_v10000_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const struct cudaKernelNodeParams *pNodeParams; +} cudaGraphAddKernelNode_v10000_params; + +typedef struct cudaGraphKernelNodeGetParams_v10000_params_st { + cudaGraphNode_t node; + struct cudaKernelNodeParams *pNodeParams; +} cudaGraphKernelNodeGetParams_v10000_params; + +typedef struct cudaGraphKernelNodeSetParams_v10000_params_st { + cudaGraphNode_t node; + const struct cudaKernelNodeParams *pNodeParams; +} cudaGraphKernelNodeSetParams_v10000_params; + +typedef struct cudaGraphKernelNodeCopyAttributes_v11000_params_st { + cudaGraphNode_t hSrc; + cudaGraphNode_t hDst; +} cudaGraphKernelNodeCopyAttributes_v11000_params; + +typedef struct cudaGraphKernelNodeGetAttribute_v11000_params_st { + cudaGraphNode_t hNode; + cudaKernelNodeAttrID attr; + cudaKernelNodeAttrValue *value_out; +} cudaGraphKernelNodeGetAttribute_v11000_params; + +typedef struct cudaGraphKernelNodeSetAttribute_v11000_params_st { + cudaGraphNode_t hNode; + cudaKernelNodeAttrID attr; + const cudaKernelNodeAttrValue *value; +} cudaGraphKernelNodeSetAttribute_v11000_params; + +typedef struct cudaGraphAddMemcpyNode_v10000_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const struct cudaMemcpy3DParms *pCopyParams; +} cudaGraphAddMemcpyNode_v10000_params; + +typedef struct cudaGraphAddMemcpyNodeToSymbol_v11010_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaGraphAddMemcpyNodeToSymbol_v11010_params; + +typedef struct cudaGraphAddMemcpyNodeFromSymbol_v11010_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaGraphAddMemcpyNodeFromSymbol_v11010_params; + +typedef struct cudaGraphAddMemcpyNode1D_v11010_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaGraphAddMemcpyNode1D_v11010_params; + +typedef struct cudaGraphMemcpyNodeGetParams_v10000_params_st { + cudaGraphNode_t node; + struct cudaMemcpy3DParms *pNodeParams; +} cudaGraphMemcpyNodeGetParams_v10000_params; + +typedef struct cudaGraphMemcpyNodeSetParams_v10000_params_st { + cudaGraphNode_t node; + const struct cudaMemcpy3DParms *pNodeParams; +} cudaGraphMemcpyNodeSetParams_v10000_params; + +typedef struct cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params_st { + cudaGraphNode_t node; + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params; + +typedef struct cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params_st { + cudaGraphNode_t node; + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params; + +typedef struct cudaGraphMemcpyNodeSetParams1D_v11010_params_st { + cudaGraphNode_t node; + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaGraphMemcpyNodeSetParams1D_v11010_params; + +typedef struct cudaGraphAddMemsetNode_v10000_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const struct cudaMemsetParams *pMemsetParams; +} cudaGraphAddMemsetNode_v10000_params; + +typedef struct cudaGraphMemsetNodeGetParams_v10000_params_st { + cudaGraphNode_t node; + struct cudaMemsetParams *pNodeParams; +} cudaGraphMemsetNodeGetParams_v10000_params; + +typedef struct cudaGraphMemsetNodeSetParams_v10000_params_st { + cudaGraphNode_t node; + const struct cudaMemsetParams *pNodeParams; +} cudaGraphMemsetNodeSetParams_v10000_params; + +typedef struct cudaGraphAddHostNode_v10000_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const struct cudaHostNodeParams *pNodeParams; +} cudaGraphAddHostNode_v10000_params; + +typedef struct cudaGraphHostNodeGetParams_v10000_params_st { + cudaGraphNode_t node; + struct cudaHostNodeParams *pNodeParams; +} cudaGraphHostNodeGetParams_v10000_params; + +typedef struct cudaGraphHostNodeSetParams_v10000_params_st { + cudaGraphNode_t node; + const struct cudaHostNodeParams *pNodeParams; +} cudaGraphHostNodeSetParams_v10000_params; + +typedef struct cudaGraphAddChildGraphNode_v10000_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + cudaGraph_t childGraph; +} cudaGraphAddChildGraphNode_v10000_params; + +typedef struct cudaGraphChildGraphNodeGetGraph_v10000_params_st { + cudaGraphNode_t node; + cudaGraph_t *pGraph; +} cudaGraphChildGraphNodeGetGraph_v10000_params; + +typedef struct cudaGraphAddEmptyNode_v10000_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; +} cudaGraphAddEmptyNode_v10000_params; + +typedef struct cudaGraphAddEventRecordNode_v11010_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + cudaEvent_t event; +} cudaGraphAddEventRecordNode_v11010_params; + +typedef struct cudaGraphEventRecordNodeGetEvent_v11010_params_st { + cudaGraphNode_t node; + cudaEvent_t *event_out; +} cudaGraphEventRecordNodeGetEvent_v11010_params; + +typedef struct cudaGraphEventRecordNodeSetEvent_v11010_params_st { + cudaGraphNode_t node; + cudaEvent_t event; +} cudaGraphEventRecordNodeSetEvent_v11010_params; + +typedef struct cudaGraphAddEventWaitNode_v11010_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + cudaEvent_t event; +} cudaGraphAddEventWaitNode_v11010_params; + +typedef struct cudaGraphEventWaitNodeGetEvent_v11010_params_st { + cudaGraphNode_t node; + cudaEvent_t *event_out; +} cudaGraphEventWaitNodeGetEvent_v11010_params; + +typedef struct cudaGraphEventWaitNodeSetEvent_v11010_params_st { + cudaGraphNode_t node; + cudaEvent_t event; +} cudaGraphEventWaitNodeSetEvent_v11010_params; + +typedef struct cudaGraphAddExternalSemaphoresSignalNode_v11020_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const struct cudaExternalSemaphoreSignalNodeParams *nodeParams; +} cudaGraphAddExternalSemaphoresSignalNode_v11020_params; + +typedef struct cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params_st { + cudaGraphNode_t hNode; + struct cudaExternalSemaphoreSignalNodeParams *params_out; +} cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params; + +typedef struct cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params_st { + cudaGraphNode_t hNode; + const struct cudaExternalSemaphoreSignalNodeParams *nodeParams; +} cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params; + +typedef struct cudaGraphAddExternalSemaphoresWaitNode_v11020_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + const struct cudaExternalSemaphoreWaitNodeParams *nodeParams; +} cudaGraphAddExternalSemaphoresWaitNode_v11020_params; + +typedef struct cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params_st { + cudaGraphNode_t hNode; + struct cudaExternalSemaphoreWaitNodeParams *params_out; +} cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params; + +typedef struct cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params_st { + cudaGraphNode_t hNode; + const struct cudaExternalSemaphoreWaitNodeParams *nodeParams; +} cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params; + +typedef struct cudaGraphAddMemAllocNode_v11040_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + struct cudaMemAllocNodeParams *nodeParams; +} cudaGraphAddMemAllocNode_v11040_params; + +typedef struct cudaGraphMemAllocNodeGetParams_v11040_params_st { + cudaGraphNode_t node; + struct cudaMemAllocNodeParams *params_out; +} cudaGraphMemAllocNodeGetParams_v11040_params; + +typedef struct cudaGraphAddMemFreeNode_v11040_params_st { + cudaGraphNode_t *pGraphNode; + cudaGraph_t graph; + const cudaGraphNode_t *pDependencies; + size_t numDependencies; + void *dptr; +} cudaGraphAddMemFreeNode_v11040_params; + +typedef struct cudaGraphMemFreeNodeGetParams_v11040_params_st { + cudaGraphNode_t node; + void *dptr_out; +} cudaGraphMemFreeNodeGetParams_v11040_params; + +typedef struct cudaDeviceGraphMemTrim_v11040_params_st { + int device; +} cudaDeviceGraphMemTrim_v11040_params; + +typedef struct cudaDeviceGetGraphMemAttribute_v11040_params_st { + int device; + enum cudaGraphMemAttributeType attr; + void *value; +} cudaDeviceGetGraphMemAttribute_v11040_params; + +typedef struct cudaDeviceSetGraphMemAttribute_v11040_params_st { + int device; + enum cudaGraphMemAttributeType attr; + void *value; +} cudaDeviceSetGraphMemAttribute_v11040_params; + +typedef struct cudaGraphClone_v10000_params_st { + cudaGraph_t *pGraphClone; + cudaGraph_t originalGraph; +} cudaGraphClone_v10000_params; + +typedef struct cudaGraphNodeFindInClone_v10000_params_st { + cudaGraphNode_t *pNode; + cudaGraphNode_t originalNode; + cudaGraph_t clonedGraph; +} cudaGraphNodeFindInClone_v10000_params; + +typedef struct cudaGraphNodeGetType_v10000_params_st { + cudaGraphNode_t node; + enum cudaGraphNodeType *pType; +} cudaGraphNodeGetType_v10000_params; + +typedef struct cudaGraphGetNodes_v10000_params_st { + cudaGraph_t graph; + cudaGraphNode_t *nodes; + size_t *numNodes; +} cudaGraphGetNodes_v10000_params; + +typedef struct cudaGraphGetRootNodes_v10000_params_st { + cudaGraph_t graph; + cudaGraphNode_t *pRootNodes; + size_t *pNumRootNodes; +} cudaGraphGetRootNodes_v10000_params; + +typedef struct cudaGraphGetEdges_v10000_params_st { + cudaGraph_t graph; + cudaGraphNode_t *from; + cudaGraphNode_t *to; + size_t *numEdges; +} cudaGraphGetEdges_v10000_params; + +typedef struct cudaGraphNodeGetDependencies_v10000_params_st { + cudaGraphNode_t node; + cudaGraphNode_t *pDependencies; + size_t *pNumDependencies; +} cudaGraphNodeGetDependencies_v10000_params; + +typedef struct cudaGraphNodeGetDependentNodes_v10000_params_st { + cudaGraphNode_t node; + cudaGraphNode_t *pDependentNodes; + size_t *pNumDependentNodes; +} cudaGraphNodeGetDependentNodes_v10000_params; + +typedef struct cudaGraphAddDependencies_v10000_params_st { + cudaGraph_t graph; + const cudaGraphNode_t *from; + const cudaGraphNode_t *to; + size_t numDependencies; +} cudaGraphAddDependencies_v10000_params; + +typedef struct cudaGraphRemoveDependencies_v10000_params_st { + cudaGraph_t graph; + const cudaGraphNode_t *from; + const cudaGraphNode_t *to; + size_t numDependencies; +} cudaGraphRemoveDependencies_v10000_params; + +typedef struct cudaGraphDestroyNode_v10000_params_st { + cudaGraphNode_t node; +} cudaGraphDestroyNode_v10000_params; + +typedef struct cudaGraphInstantiate_v10000_params_st { + cudaGraphExec_t *pGraphExec; + cudaGraph_t graph; + cudaGraphNode_t *pErrorNode; + char *pLogBuffer; + size_t bufferSize; +} cudaGraphInstantiate_v10000_params; + +typedef struct cudaGraphInstantiateWithFlags_v11040_params_st { + cudaGraphExec_t *pGraphExec; + cudaGraph_t graph; + unsigned long long flags; +} cudaGraphInstantiateWithFlags_v11040_params; + +typedef struct cudaGraphExecKernelNodeSetParams_v10010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + const struct cudaKernelNodeParams *pNodeParams; +} cudaGraphExecKernelNodeSetParams_v10010_params; + +typedef struct cudaGraphExecMemcpyNodeSetParams_v10020_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + const struct cudaMemcpy3DParms *pNodeParams; +} cudaGraphExecMemcpyNodeSetParams_v10020_params; + +typedef struct cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params; + +typedef struct cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params; + +typedef struct cudaGraphExecMemcpyNodeSetParams1D_v11010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaGraphExecMemcpyNodeSetParams1D_v11010_params; + +typedef struct cudaGraphExecMemsetNodeSetParams_v10020_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + const struct cudaMemsetParams *pNodeParams; +} cudaGraphExecMemsetNodeSetParams_v10020_params; + +typedef struct cudaGraphExecHostNodeSetParams_v10020_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + const struct cudaHostNodeParams *pNodeParams; +} cudaGraphExecHostNodeSetParams_v10020_params; + +typedef struct cudaGraphExecChildGraphNodeSetParams_v11010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t node; + cudaGraph_t childGraph; +} cudaGraphExecChildGraphNodeSetParams_v11010_params; + +typedef struct cudaGraphExecEventRecordNodeSetEvent_v11010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t hNode; + cudaEvent_t event; +} cudaGraphExecEventRecordNodeSetEvent_v11010_params; + +typedef struct cudaGraphExecEventWaitNodeSetEvent_v11010_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t hNode; + cudaEvent_t event; +} cudaGraphExecEventWaitNodeSetEvent_v11010_params; + +typedef struct cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t hNode; + const struct cudaExternalSemaphoreSignalNodeParams *nodeParams; +} cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params; + +typedef struct cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t hNode; + const struct cudaExternalSemaphoreWaitNodeParams *nodeParams; +} cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params; + +typedef struct cudaGraphNodeSetEnabled_v11060_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t hNode; + unsigned int isEnabled; +} cudaGraphNodeSetEnabled_v11060_params; + +typedef struct cudaGraphNodeGetEnabled_v11060_params_st { + cudaGraphExec_t hGraphExec; + cudaGraphNode_t hNode; + unsigned int *isEnabled; +} cudaGraphNodeGetEnabled_v11060_params; + +typedef struct cudaGraphExecUpdate_v10020_params_st { + cudaGraphExec_t hGraphExec; + cudaGraph_t hGraph; + cudaGraphNode_t *hErrorNode_out; + enum cudaGraphExecUpdateResult *updateResult_out; +} cudaGraphExecUpdate_v10020_params; + +typedef struct cudaGraphUpload_ptsz_v10000_params_st { + cudaGraphExec_t graphExec; + cudaStream_t stream; +} cudaGraphUpload_ptsz_v10000_params; + +typedef struct cudaGraphLaunch_ptsz_v10000_params_st { + cudaGraphExec_t graphExec; + cudaStream_t stream; +} cudaGraphLaunch_ptsz_v10000_params; + +typedef struct cudaGraphExecDestroy_v10000_params_st { + cudaGraphExec_t graphExec; +} cudaGraphExecDestroy_v10000_params; + +typedef struct cudaGraphDestroy_v10000_params_st { + cudaGraph_t graph; +} cudaGraphDestroy_v10000_params; + +typedef struct cudaGraphDebugDotPrint_v11030_params_st { + cudaGraph_t graph; + const char *path; + unsigned int flags; +} cudaGraphDebugDotPrint_v11030_params; + +typedef struct cudaUserObjectCreate_v11030_params_st { + cudaUserObject_t *object_out; + void *ptr; + cudaHostFn_t destroy; + unsigned int initialRefcount; + unsigned int flags; +} cudaUserObjectCreate_v11030_params; + +typedef struct cudaUserObjectRetain_v11030_params_st { + cudaUserObject_t object; + unsigned int count; +} cudaUserObjectRetain_v11030_params; + +typedef struct cudaUserObjectRelease_v11030_params_st { + cudaUserObject_t object; + unsigned int count; +} cudaUserObjectRelease_v11030_params; + +typedef struct cudaGraphRetainUserObject_v11030_params_st { + cudaGraph_t graph; + cudaUserObject_t object; + unsigned int count; + unsigned int flags; +} cudaGraphRetainUserObject_v11030_params; + +typedef struct cudaGraphReleaseUserObject_v11030_params_st { + cudaGraph_t graph; + cudaUserObject_t object; + unsigned int count; +} cudaGraphReleaseUserObject_v11030_params; + +typedef struct cudaGetDriverEntryPoint_ptsz_v11030_params_st { + const char *symbol; + void **funcPtr; + unsigned long long flags; +} cudaGetDriverEntryPoint_ptsz_v11030_params; + +typedef struct cudaGetFuncBySymbol_v11000_params_st { + cudaFunction_t *functionPtr; + const void *symbolPtr; +} cudaGetFuncBySymbol_v11000_params; + +typedef struct cudaMemcpy_v3020_params_st { + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpy_v3020_params; + +typedef struct cudaMemcpyToSymbol_v3020_params_st { + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaMemcpyToSymbol_v3020_params; + +typedef struct cudaMemcpyFromSymbol_v3020_params_st { + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; +} cudaMemcpyFromSymbol_v3020_params; + +typedef struct cudaMemcpy2D_v3020_params_st { + void *dst; + size_t dpitch; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2D_v3020_params; + +typedef struct cudaMemcpyToArray_v3020_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpyToArray_v3020_params; + +typedef struct cudaMemcpy2DToArray_v3020_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2DToArray_v3020_params; + +typedef struct cudaMemcpyFromArray_v3020_params_st { + void *dst; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpyFromArray_v3020_params; + +typedef struct cudaMemcpy2DFromArray_v3020_params_st { + void *dst; + size_t dpitch; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2DFromArray_v3020_params; + +typedef struct cudaMemcpyArrayToArray_v3020_params_st { + cudaArray_t dst; + size_t wOffsetDst; + size_t hOffsetDst; + cudaArray_const_t src; + size_t wOffsetSrc; + size_t hOffsetSrc; + size_t count; + enum cudaMemcpyKind kind; +} cudaMemcpyArrayToArray_v3020_params; + +typedef struct cudaMemcpy2DArrayToArray_v3020_params_st { + cudaArray_t dst; + size_t wOffsetDst; + size_t hOffsetDst; + cudaArray_const_t src; + size_t wOffsetSrc; + size_t hOffsetSrc; + size_t width; + size_t height; + enum cudaMemcpyKind kind; +} cudaMemcpy2DArrayToArray_v3020_params; + +typedef struct cudaMemcpy3D_v3020_params_st { + const struct cudaMemcpy3DParms *p; +} cudaMemcpy3D_v3020_params; + +typedef struct cudaMemcpy3DPeer_v4000_params_st { + const struct cudaMemcpy3DPeerParms *p; +} cudaMemcpy3DPeer_v4000_params; + +typedef struct cudaMemset_v3020_params_st { + void *devPtr; + int value; + size_t count; +} cudaMemset_v3020_params; + +typedef struct cudaMemset2D_v3020_params_st { + void *devPtr; + size_t pitch; + int value; + size_t width; + size_t height; +} cudaMemset2D_v3020_params; + +typedef struct cudaMemset3D_v3020_params_st { + struct cudaPitchedPtr pitchedDevPtr; + int value; + struct cudaExtent extent; +} cudaMemset3D_v3020_params; + +typedef struct cudaMemcpyAsync_v3020_params_st { + void *dst; + const void *src; + size_t count; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyAsync_v3020_params; + +typedef struct cudaMemcpyToSymbolAsync_v3020_params_st { + const void *symbol; + const void *src; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyToSymbolAsync_v3020_params; + +typedef struct cudaMemcpyFromSymbolAsync_v3020_params_st { + void *dst; + const void *symbol; + size_t count; + size_t offset; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyFromSymbolAsync_v3020_params; + +typedef struct cudaMemcpy2DAsync_v3020_params_st { + void *dst; + size_t dpitch; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpy2DAsync_v3020_params; + +typedef struct cudaMemcpyToArrayAsync_v3020_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t count; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyToArrayAsync_v3020_params; + +typedef struct cudaMemcpy2DToArrayAsync_v3020_params_st { + cudaArray_t dst; + size_t wOffset; + size_t hOffset; + const void *src; + size_t spitch; + size_t width; + size_t height; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpy2DToArrayAsync_v3020_params; + +typedef struct cudaMemcpyFromArrayAsync_v3020_params_st { + void *dst; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t count; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpyFromArrayAsync_v3020_params; + +typedef struct cudaMemcpy2DFromArrayAsync_v3020_params_st { + void *dst; + size_t dpitch; + cudaArray_const_t src; + size_t wOffset; + size_t hOffset; + size_t width; + size_t height; + enum cudaMemcpyKind kind; + cudaStream_t stream; +} cudaMemcpy2DFromArrayAsync_v3020_params; + +typedef struct cudaMemcpy3DAsync_v3020_params_st { + const struct cudaMemcpy3DParms *p; + cudaStream_t stream; +} cudaMemcpy3DAsync_v3020_params; + +typedef struct cudaMemcpy3DPeerAsync_v4000_params_st { + const struct cudaMemcpy3DPeerParms *p; + cudaStream_t stream; +} cudaMemcpy3DPeerAsync_v4000_params; + +typedef struct cudaMemsetAsync_v3020_params_st { + void *devPtr; + int value; + size_t count; + cudaStream_t stream; +} cudaMemsetAsync_v3020_params; + +typedef struct cudaMemset2DAsync_v3020_params_st { + void *devPtr; + size_t pitch; + int value; + size_t width; + size_t height; + cudaStream_t stream; +} cudaMemset2DAsync_v3020_params; + +typedef struct cudaMemset3DAsync_v3020_params_st { + struct cudaPitchedPtr pitchedDevPtr; + int value; + struct cudaExtent extent; + cudaStream_t stream; +} cudaMemset3DAsync_v3020_params; + +typedef struct cudaStreamQuery_v3020_params_st { + cudaStream_t stream; +} cudaStreamQuery_v3020_params; + +typedef struct cudaStreamGetFlags_v5050_params_st { + cudaStream_t hStream; + unsigned int *flags; +} cudaStreamGetFlags_v5050_params; + +typedef struct cudaStreamGetPriority_v5050_params_st { + cudaStream_t hStream; + int *priority; +} cudaStreamGetPriority_v5050_params; + +typedef struct cudaEventRecord_v3020_params_st { + cudaEvent_t event; + cudaStream_t stream; +} cudaEventRecord_v3020_params; + +typedef struct cudaEventRecordWithFlags_v11010_params_st { + cudaEvent_t event; + cudaStream_t stream; + unsigned int flags; +} cudaEventRecordWithFlags_v11010_params; + +typedef struct cudaStreamWaitEvent_v3020_params_st { + cudaStream_t stream; + cudaEvent_t event; + unsigned int flags; +} cudaStreamWaitEvent_v3020_params; + +typedef struct cudaStreamAddCallback_v5000_params_st { + cudaStream_t stream; + cudaStreamCallback_t callback; + void *userData; + unsigned int flags; +} cudaStreamAddCallback_v5000_params; + +typedef struct cudaStreamAttachMemAsync_v6000_params_st { + cudaStream_t stream; + void *devPtr; + size_t length; + unsigned int flags; +} cudaStreamAttachMemAsync_v6000_params; + +typedef struct cudaStreamSynchronize_v3020_params_st { + cudaStream_t stream; +} cudaStreamSynchronize_v3020_params; + +typedef struct cudaLaunchKernel_v7000_params_st { + const void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +} cudaLaunchKernel_v7000_params; + +typedef struct cudaLaunchKernelExC_v11060_params_st { + const cudaLaunchConfig_t *config; + const void *func; + void **args; +} cudaLaunchKernelExC_v11060_params; + +typedef struct cudaLaunchCooperativeKernel_v9000_params_st { + const void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +} cudaLaunchCooperativeKernel_v9000_params; + +typedef struct cudaLaunchHostFunc_v10000_params_st { + cudaStream_t stream; + cudaHostFn_t fn; + void *userData; +} cudaLaunchHostFunc_v10000_params; + +typedef struct cudaMemPrefetchAsync_v8000_params_st { + const void *devPtr; + size_t count; + int dstDevice; + cudaStream_t stream; +} cudaMemPrefetchAsync_v8000_params; + +typedef struct cudaSignalExternalSemaphoresAsync_v10000_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaSignalExternalSemaphoresAsync_v10000_params; + +typedef struct cudaSignalExternalSemaphoresAsync_ptsz_v10000_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaSignalExternalSemaphoresAsync_ptsz_v10000_params; + +typedef struct cudaSignalExternalSemaphoresAsync_v2_v11020_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreSignalParams *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaSignalExternalSemaphoresAsync_v2_v11020_params; + +typedef struct cudaWaitExternalSemaphoresAsync_v10000_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaWaitExternalSemaphoresAsync_v10000_params; + +typedef struct cudaWaitExternalSemaphoresAsync_ptsz_v10000_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaWaitExternalSemaphoresAsync_ptsz_v10000_params; + +typedef struct cudaWaitExternalSemaphoresAsync_v2_v11020_params_st { + const cudaExternalSemaphore_t *extSemArray; + const struct cudaExternalSemaphoreWaitParams *paramsArray; + unsigned int numExtSems; + cudaStream_t stream; +} cudaWaitExternalSemaphoresAsync_v2_v11020_params; + +typedef struct cudaGraphUpload_v10000_params_st { + cudaGraphExec_t graphExec; + cudaStream_t stream; +} cudaGraphUpload_v10000_params; + +typedef struct cudaGraphLaunch_v10000_params_st { + cudaGraphExec_t graphExec; + cudaStream_t stream; +} cudaGraphLaunch_v10000_params; + +typedef struct cudaStreamBeginCapture_v10000_params_st { + cudaStream_t stream; + enum cudaStreamCaptureMode mode; +} cudaStreamBeginCapture_v10000_params; + +typedef struct cudaStreamEndCapture_v10000_params_st { + cudaStream_t stream; + cudaGraph_t *pGraph; +} cudaStreamEndCapture_v10000_params; + +typedef struct cudaStreamIsCapturing_v10000_params_st { + cudaStream_t stream; + enum cudaStreamCaptureStatus *pCaptureStatus; +} cudaStreamIsCapturing_v10000_params; + +typedef struct cudaStreamGetCaptureInfo_v10010_params_st { + cudaStream_t stream; + enum cudaStreamCaptureStatus *captureStatus_out; + unsigned long long *id_out; +} cudaStreamGetCaptureInfo_v10010_params; + +typedef struct cudaStreamGetCaptureInfo_v2_v11030_params_st { + cudaStream_t stream; + enum cudaStreamCaptureStatus *captureStatus_out; + unsigned long long *id_out; + cudaGraph_t *graph_out; + const cudaGraphNode_t **dependencies_out; + size_t *numDependencies_out; +} cudaStreamGetCaptureInfo_v2_v11030_params; + +typedef struct cudaStreamUpdateCaptureDependencies_ptsz_v11030_params_st { + cudaStream_t stream; + cudaGraphNode_t *dependencies; + size_t numDependencies; + unsigned int flags; +} cudaStreamUpdateCaptureDependencies_ptsz_v11030_params; + +typedef struct cudaStreamCopyAttributes_v11000_params_st { + cudaStream_t dstStream; + cudaStream_t srcStream; +} cudaStreamCopyAttributes_v11000_params; + +typedef struct cudaStreamGetAttribute_v11000_params_st { + cudaStream_t stream; + cudaStreamAttrID attr; + cudaStreamAttrValue *value; +} cudaStreamGetAttribute_v11000_params; + +typedef struct cudaStreamSetAttribute_v11000_params_st { + cudaStream_t stream; + cudaStreamAttrID attr; + const cudaStreamAttrValue *param; +} cudaStreamSetAttribute_v11000_params; + +typedef struct cudaMallocAsync_v11020_params_st { + void **devPtr; + size_t size; + cudaStream_t hStream; +} cudaMallocAsync_v11020_params; + +typedef struct cudaFreeAsync_v11020_params_st { + void *devPtr; + cudaStream_t hStream; +} cudaFreeAsync_v11020_params; + +typedef struct cudaMallocFromPoolAsync_v11020_params_st { + void **ptr; + size_t size; + cudaMemPool_t memPool; + cudaStream_t stream; +} cudaMallocFromPoolAsync_v11020_params; + +typedef struct cudaGetDriverEntryPoint_v11030_params_st { + const char *symbol; + void **funcPtr; + unsigned long long flags; +} cudaGetDriverEntryPoint_v11030_params; + +// Parameter trace structures for removed functions + + +// End of parameter trace structures diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..88e79d1957925c4bbacd381e9461d5072de88f24 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h @@ -0,0 +1,38 @@ +// This file is generated. Any changes you make will be lost during the next clean build. + +// CUDA public interface, for type definitions and api function prototypes +#include "cuda_vdpau_interop.h" + +// ************************************************************************* +// Definitions of structs to hold parameters for each function +// ************************************************************************* + +// Currently used parameter trace structures +typedef struct cudaVDPAUGetDevice_v3020_params_st { + int *device; + VdpDevice vdpDevice; + VdpGetProcAddress *vdpGetProcAddress; +} cudaVDPAUGetDevice_v3020_params; + +typedef struct cudaVDPAUSetVDPAUDevice_v3020_params_st { + int device; + VdpDevice vdpDevice; + VdpGetProcAddress *vdpGetProcAddress; +} cudaVDPAUSetVDPAUDevice_v3020_params; + +typedef struct cudaGraphicsVDPAURegisterVideoSurface_v3020_params_st { + struct cudaGraphicsResource **resource; + VdpVideoSurface vdpSurface; + unsigned int flags; +} cudaGraphicsVDPAURegisterVideoSurface_v3020_params; + +typedef struct cudaGraphicsVDPAURegisterOutputSurface_v3020_params_st { + struct cudaGraphicsResource **resource; + VdpOutputSurface vdpSurface; + unsigned int flags; +} cudaGraphicsVDPAURegisterOutputSurface_v3020_params; + +// Parameter trace structures for removed functions + + +// End of parameter trace structures diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer.h new file mode 100644 index 0000000000000000000000000000000000000000..e24cfcbba4d93b57f15a4bd60fbe60a99b493f66 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_cnn_infer.h @@ -0,0 +1,571 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_cnn_infer : cuDNN's basic definitions and inference CNN functions. + */ + +#if !defined(CUDNN_CNN_INFER_H_) +#define CUDNN_CNN_INFER_H_ + +#pragma once +#include +#include + +#include "cudnn_version.h" +#include "cudnn_ops_infer.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_CNN_INFER_MAJOR 8 +#define CUDNN_CNN_INFER_MINOR 7 +#define CUDNN_CNN_INFER_PATCH 0 + +#if (CUDNN_CNN_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_INFER_MINOR != CUDNN_MINOR) || \ + (CUDNN_CNN_INFER_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN CNN INFER!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t; + +/* + * convolution mode + */ +typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t; + +/* + * CUDNN Reorder + */ +typedef enum { + CUDNN_DEFAULT_REORDER = 0, + CUDNN_NO_REORDER = 1, +} cudnnReorderType_t; + +typedef struct cudnnConvolutionFwdAlgoPerfStruct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t; + +/* Create an instance of convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc); + +/* Destroy an instance of convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); /* convolution data type */ + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); /* convolution data type */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]); + +/* helper function to provide the convolution forward algo that fit best the requirement */ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer); + +cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData); + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes); + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* helper function to provide the convolution backward data algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdDataAlgoPerfStruct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t; + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Helper function to calculate folding descriptors for dgrad */ +cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc); + +/* cudnnFusedOps... */ +struct cudnnFusedOpsConstParamStruct; +typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t; + +struct cudnnFusedOpsVariantParamStruct; +typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t; + +struct cudnnFusedOpsPlanStruct; +typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t; + +typedef enum { + /* each op in [ ] can be disabled by passing NULL ptr */ + /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0, + /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1, + /* utility for BN training in BN-conv fusion */ + /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */ + /* optionally update running stats and generate saved stats */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2, + /* utility for BN inference in BN-conv fusion */ + /* computes the equivalent scale and bias from learned running stats and learned scale, bias */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3, + /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */ + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4, + /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */ + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5, + /* reserved for future use */ + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6, +} cudnnFusedOps_t; + +typedef enum { + /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get XDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_XDESC = 0, + /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_XDATA_PLACEHOLDER = 1, + /* set/get BN_MODE: pass cudnnBatchNormMode_t* */ + CUDNN_PARAM_BN_MODE = 2, + /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3, + /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4, + /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5, + /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */ + /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */ + CUDNN_PARAM_ACTIVATION_DESC = 6, + /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */ + /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */ + CUDNN_PARAM_CONV_DESC = 7, + /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get WDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_WDESC = 8, + /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_WDATA_PLACEHOLDER = 9, + /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get DWDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_DWDESC = 10, + /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11, + /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YDESC = 12, + /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YDATA_PLACEHOLDER = 13, + /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DYDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DYDESC = 14, + /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15, + /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YSTATS_DESC = 16, + /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSUM_PLACEHOLDER = 17, + /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18, + /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19, + /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20, + /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21, + /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22, + /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23, + /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24, + /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25, + + /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ZDESC = 26, + /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27, + /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28, + /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29, + /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30, + + /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31, + /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32, + + /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DXDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DXDESC = 33, + /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34, + /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DZDESC = 35, + /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36, + /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37, + /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38, +} cudnnFusedOpsConstParamLabel_t; + +typedef enum { + CUDNN_PTR_NULL = 0, + CUDNN_PTR_ELEM_ALIGNED = 1, + CUDNN_PTR_16B_ALIGNED = 2, +} cudnnFusedOpsPointerPlaceHolder_t; + +typedef enum { + /* set: pass void* pointing to dev memory */ + /* get: pass void** pointing to host memory */ + CUDNN_PTR_XDATA = 0, + CUDNN_PTR_BN_EQSCALE = 1, + CUDNN_PTR_BN_EQBIAS = 2, + CUDNN_PTR_WDATA = 3, + CUDNN_PTR_DWDATA = 4, + CUDNN_PTR_YDATA = 5, + CUDNN_PTR_DYDATA = 6, + CUDNN_PTR_YSUM = 7, + CUDNN_PTR_YSQSUM = 8, + CUDNN_PTR_WORKSPACE = 9, + CUDNN_PTR_BN_SCALE = 10, + CUDNN_PTR_BN_BIAS = 11, + CUDNN_PTR_BN_SAVED_MEAN = 12, + CUDNN_PTR_BN_SAVED_INVSTD = 13, + CUDNN_PTR_BN_RUNNING_MEAN = 14, + CUDNN_PTR_BN_RUNNING_VAR = 15, + CUDNN_PTR_ZDATA = 16, + CUDNN_PTR_BN_Z_EQSCALE = 17, + CUDNN_PTR_BN_Z_EQBIAS = 18, + CUDNN_PTR_ACTIVATION_BITMASK = 19, + CUDNN_PTR_DXDATA = 20, + CUDNN_PTR_DZDATA = 21, + CUDNN_PTR_BN_DSCALE = 22, + CUDNN_PTR_BN_DBIAS = 23, + + /* set/get: pass size_t* pointing to host memory */ + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100, + /* set/get: pass int64_t* pointing to host memory */ + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103, +} cudnnFusedOpsVariantParamLabel_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCnnInferVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_CNN_INFER_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..274be325726f20b1c7a306c6ef7184475a28a07b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h new file mode 100644 index 0000000000000000000000000000000000000000..6f12b4e1ea68c5a186d73b5d943d2cba0218312f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h @@ -0,0 +1,454 @@ + + /* Copyright 2005-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/*! +* \file cufftw.h +* \brief Public header file for the NVIDIA CUDA FFTW library (CUFFTW) +*/ + +#ifndef _CUFFTW_H_ +#define _CUFFTW_H_ + + +#include +#include "cufft.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// transform direction +#define FFTW_FORWARD -1 +#define FFTW_INVERSE 1 +#define FFTW_BACKWARD 1 + +// Planner flags + +#define FFTW_ESTIMATE 0x01 +#define FFTW_MEASURE 0x02 +#define FFTW_PATIENT 0x03 +#define FFTW_EXHAUSTIVE 0x04 +#define FFTW_WISDOM_ONLY 0x05 + +//Algorithm restriction flags + +#define FFTW_DESTROY_INPUT 0x08 +#define FFTW_PRESERVE_INPUT 0x0C +#define FFTW_UNALIGNED 0x10 + +// CUFFTW defines and supports the following data types + +// note if complex.h has been included we use the C99 complex types +#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined (complex) + typedef double _Complex fftw_complex; + typedef float _Complex fftwf_complex; +#else + typedef double fftw_complex[2]; + typedef float fftwf_complex[2]; +#endif + +typedef void *fftw_plan; + +typedef void *fftwf_plan; + +typedef struct { + int n; + int is; + int os; +} fftw_iodim; + +typedef fftw_iodim fftwf_iodim; + +typedef struct { + ptrdiff_t n; + ptrdiff_t is; + ptrdiff_t os; +} fftw_iodim64; + +typedef fftw_iodim64 fftwf_iodim64; + + +// CUFFTW defines and supports the following double precision APIs + + +fftw_plan CUFFTAPI fftw_plan_dft_1d(int n, + fftw_complex *in, + fftw_complex *out, + int sign, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_2d(int n0, + int n1, + fftw_complex *in, + fftw_complex *out, + int sign, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_3d(int n0, + int n1, + int n2, + fftw_complex *in, + fftw_complex *out, + int sign, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft(int rank, + const int *n, + fftw_complex *in, + fftw_complex *out, + int sign, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_r2c_1d(int n, + double *in, + fftw_complex *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_r2c_2d(int n0, + int n1, + double *in, + fftw_complex *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_r2c_3d(int n0, + int n1, + int n2, + double *in, + fftw_complex *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_r2c(int rank, + const int *n, + double *in, + fftw_complex *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_c2r_1d(int n, + fftw_complex *in, + double *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_c2r_2d(int n0, + int n1, + fftw_complex *in, + double *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_c2r_3d(int n0, + int n1, + int n2, + fftw_complex *in, + double *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_dft_c2r(int rank, + const int *n, + fftw_complex *in, + double *out, + unsigned flags); + + +fftw_plan CUFFTAPI fftw_plan_many_dft(int rank, + const int *n, + int batch, + fftw_complex *in, + const int *inembed, int istride, int idist, + fftw_complex *out, + const int *onembed, int ostride, int odist, + int sign, unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_many_dft_r2c(int rank, + const int *n, + int batch, + double *in, + const int *inembed, int istride, int idist, + fftw_complex *out, + const int *onembed, int ostride, int odist, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_many_dft_c2r(int rank, + const int *n, + int batch, + fftw_complex *in, + const int *inembed, int istride, int idist, + double *out, + const int *onembed, int ostride, int odist, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_guru_dft(int rank, const fftw_iodim *dims, + int batch_rank, const fftw_iodim *batch_dims, + fftw_complex *in, fftw_complex *out, + int sign, unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_guru_dft_r2c(int rank, const fftw_iodim *dims, + int batch_rank, const fftw_iodim *batch_dims, + double *in, fftw_complex *out, + unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_guru_dft_c2r(int rank, const fftw_iodim *dims, + int batch_rank, const fftw_iodim *batch_dims, + fftw_complex *in, double *out, + unsigned flags); + +void CUFFTAPI fftw_execute(const fftw_plan plan); + +void CUFFTAPI fftw_execute_dft(const fftw_plan plan, + fftw_complex *idata, + fftw_complex *odata); + +void CUFFTAPI fftw_execute_dft_r2c(const fftw_plan plan, + double *idata, + fftw_complex *odata); + +void CUFFTAPI fftw_execute_dft_c2r(const fftw_plan plan, + fftw_complex *idata, + double *odata); + + +// CUFFTW defines and supports the following single precision APIs + +fftwf_plan CUFFTAPI fftwf_plan_dft_1d(int n, + fftwf_complex *in, + fftwf_complex *out, + int sign, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_2d(int n0, + int n1, + fftwf_complex *in, + fftwf_complex *out, + int sign, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_3d(int n0, + int n1, + int n2, + fftwf_complex *in, + fftwf_complex *out, + int sign, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft(int rank, + const int *n, + fftwf_complex *in, + fftwf_complex *out, + int sign, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_1d(int n, + float *in, + fftwf_complex *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_2d(int n0, + int n1, + float *in, + fftwf_complex *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_3d(int n0, + int n1, + int n2, + float *in, + fftwf_complex *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_r2c(int rank, + const int *n, + float *in, + fftwf_complex *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_1d(int n, + fftwf_complex *in, + float *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_2d(int n0, + int n1, + fftwf_complex *in, + float *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_3d(int n0, + int n1, + int n2, + fftwf_complex *in, + float *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_dft_c2r(int rank, + const int *n, + fftwf_complex *in, + float *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_many_dft(int rank, + const int *n, + int batch, + fftwf_complex *in, + const int *inembed, int istride, int idist, + fftwf_complex *out, + const int *onembed, int ostride, int odist, + int sign, unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_many_dft_r2c(int rank, + const int *n, + int batch, + float *in, + const int *inembed, int istride, int idist, + fftwf_complex *out, + const int *onembed, int ostride, int odist, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_many_dft_c2r(int rank, + const int *n, + int batch, + fftwf_complex *in, + const int *inembed, int istride, int idist, + float *out, + const int *onembed, int ostride, int odist, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_guru_dft(int rank, const fftwf_iodim *dims, + int batch_rank, const fftwf_iodim *batch_dims, + fftwf_complex *in, fftwf_complex *out, + int sign, unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_guru_dft_r2c(int rank, const fftwf_iodim *dims, + int batch_rank, const fftwf_iodim *batch_dims, + float *in, fftwf_complex *out, + unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_guru_dft_c2r(int rank, const fftwf_iodim *dims, + int batch_rank, const fftwf_iodim *batch_dims, + fftwf_complex *in, float *out, + unsigned flags); + +void CUFFTAPI fftwf_execute(const fftw_plan plan); + +void CUFFTAPI fftwf_execute_dft(const fftwf_plan plan, + fftwf_complex *idata, + fftwf_complex *odata); + +void CUFFTAPI fftwf_execute_dft_r2c(const fftwf_plan plan, + float *idata, + fftwf_complex *odata); + +void CUFFTAPI fftwf_execute_dft_c2r(const fftwf_plan plan, + fftwf_complex *idata, + float *odata); + +/// CUFFTW 64-bit Guru Interface +/// dp +fftw_plan CUFFTAPI fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, fftw_complex* out, int sign, unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_guru64_dft_r2c(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, double* in, fftw_complex* out, unsigned flags); + +fftw_plan CUFFTAPI fftw_plan_guru64_dft_c2r(int rank, const fftw_iodim64* dims, int batch_rank, const fftw_iodim64* batch_dims, fftw_complex* in, double* out, unsigned flags); + +/// sp +fftwf_plan CUFFTAPI fftwf_plan_guru64_dft(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, fftwf_complex* out, int sign, unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_r2c(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, float* in, fftwf_complex* out, unsigned flags); + +fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_c2r(int rank, const fftwf_iodim64* dims, int batch_rank, const fftwf_iodim64* batch_dims, fftwf_complex* in, float* out, unsigned flags); + +#ifdef _WIN32 +#define _CUFFTAPI(T) T CUFFTAPI +#else +#define _CUFFTAPI(T) CUFFTAPI T +#endif + +// CUFFTW defines and supports the following support APIs +_CUFFTAPI(void *) fftw_malloc(size_t n); + +_CUFFTAPI(void *) fftwf_malloc(size_t n); + +void CUFFTAPI fftw_free(void *pointer); + +void CUFFTAPI fftwf_free(void *pointer); + +void CUFFTAPI fftw_export_wisdom_to_file(FILE * output_file); + +void CUFFTAPI fftwf_export_wisdom_to_file(FILE * output_file); + +void CUFFTAPI fftw_import_wisdom_from_file(FILE * input_file); + +void CUFFTAPI fftwf_import_wisdom_from_file(FILE * input_file); + +void CUFFTAPI fftw_print_plan(const fftw_plan plan); + +void CUFFTAPI fftwf_print_plan(const fftwf_plan plan); + +void CUFFTAPI fftw_set_timelimit(double seconds); + +void CUFFTAPI fftwf_set_timelimit(double seconds); + +double CUFFTAPI fftw_cost(const fftw_plan plan); + +double CUFFTAPI fftwf_cost(const fftw_plan plan); + +void CUFFTAPI fftw_flops(const fftw_plan plan, double *add, double *mul, double *fma); + +void CUFFTAPI fftwf_flops(const fftw_plan plan, double *add, double *mul, double *fma); + +void CUFFTAPI fftw_destroy_plan(fftw_plan plan); + +void CUFFTAPI fftwf_destroy_plan(fftwf_plan plan); + +void CUFFTAPI fftw_cleanup(void); + +void CUFFTAPI fftwf_cleanup(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _CUFFTW_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..299cf7e68206fd59da47e38d1f11dca84e3ed274 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d68a69db6781ab705c49a7a45ced26275869457 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/idna/__pycache__/idnadata.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e0c01fe1a0e5738b15b6952c63eebb273a28c12beefd13f01594da265a1b156 +size 101565 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/LICENSE b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e466b0dfda14f3a7c8ece512937eb99c8b7b6d68 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2016 Wenzel Jakob , All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Please also refer to the file .github/CONTRIBUTING.md, which clarifies licensing of +external contributions to this project including patches, pull requests, etc. diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..d8caf213c456fbebc99e75e8743e41cd50a6832f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/RECORD @@ -0,0 +1,65 @@ +../../../bin/pybind11-config,sha256=KwKhJwrv86OeAvCUq7sBopc-kDZzCJdnh_4RZIF8T-c,265 +pybind11-2.13.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +pybind11-2.13.6.dist-info/LICENSE,sha256=g5ZbhDuY9nDTqFvQQe1LNyyOxQ17SlmVqDrGl7pnXcs,1684 +pybind11-2.13.6.dist-info/METADATA,sha256=Gg_aZ0f3aFFDF3bQvgzR9kwVT_jogjVEc74kDVldlq0,9513 +pybind11-2.13.6.dist-info/RECORD,, +pybind11-2.13.6.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pybind11-2.13.6.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91 +pybind11-2.13.6.dist-info/entry_points.txt,sha256=Q_kAwEJBDz8wHD0V50hY3AvchDk3Pfyeox2YHrAcWZ0,105 +pybind11-2.13.6.dist-info/top_level.txt,sha256=d1mqwSpUlmlZhXDQ9Y57eNlXc3dVDM1toKmfC1kJbvU,9 +pybind11/__init__.py,sha256=9vt06pvuwvdKW0YwYQKOTxBEgmQ0kb5ZUOJrgtGhdKs,459 +pybind11/__main__.py,sha256=p8vZ4btnkb_TaF03R1ac7qHmp-Eut86gCSUcVP8F3i4,2526 +pybind11/__pycache__/__init__.cpython-311.pyc,, +pybind11/__pycache__/__main__.cpython-311.pyc,, +pybind11/__pycache__/_version.cpython-311.pyc,, +pybind11/__pycache__/commands.cpython-311.pyc,, +pybind11/__pycache__/setup_helpers.cpython-311.pyc,, +pybind11/_version.py,sha256=XUUceDIbc3kdRixyEVMy5v0LcGF36QUxMG9rJHlT6P4,232 +pybind11/commands.py,sha256=V43hKb7VE_abYZvaO-TpJLOU65n6W3ZrdYHGF3G3qUs,1243 +pybind11/include/pybind11/attr.h,sha256=QPjH7BfhL8QFwHHkrDak8gNOLMlb1itAO5fobjdoLp8,24334 +pybind11/include/pybind11/buffer_info.h,sha256=_FcQisqdpphfWXKeCGNv3Gq5ivy1z-qF3d1Noeteaok,7778 +pybind11/include/pybind11/cast.h,sha256=8gJ4Y4nc83dyq12CuU7ircAvAV1HoEZEVr0UyfeLQNA,71696 +pybind11/include/pybind11/chrono.h,sha256=A23naeloqn-1NKVAABOsJtHU9Vz8lfvrAICuLk-7qBM,8458 +pybind11/include/pybind11/common.h,sha256=ATg9Bt1pwF8qnNuI086fprM4CUTdrZdk_g2HXE1Sf6A,120 +pybind11/include/pybind11/complex.h,sha256=AaDZ-rEmK4tFaue-K9P5y3TxxnaQF6JwZ_6LAzkdLQI,2096 +pybind11/include/pybind11/detail/class.h,sha256=Bjk3K6xAMgwxPNTKfik7SC5Y24wgKs8Oz5VjvFdy0kA,29026 +pybind11/include/pybind11/detail/common.h,sha256=uxFMVYKW87YPbUz8Mo70xoVrpK2D1NzhKSwlDpwrJxo,54708 +pybind11/include/pybind11/detail/cpp_conduit.h,sha256=Bbx5728XzvyCL2gfW7kG6vgDltS5-V5gtkNQFPFevXg,2589 +pybind11/include/pybind11/detail/descr.h,sha256=D63pIHsF3luO_g51CjbJU8Wl9VOihciEXQhXvfRg-Rk,6035 +pybind11/include/pybind11/detail/exception_translation.h,sha256=fM1J19z00AuDlozHt0srpCJr-1uWW4kj_fLdSJDbdY8,2600 +pybind11/include/pybind11/detail/init.h,sha256=Sb1UkPecC5l9xj5naYLdUM7qIRLVpe614H9Frvyg8xg,17983 +pybind11/include/pybind11/detail/internals.h,sha256=xs-I7JdJACxx7gJf12HBLjL007jRXcAffPDsd0oTrq4,31985 +pybind11/include/pybind11/detail/type_caster_base.h,sha256=mdgZ-FIkxdSShMPPe69EXxjvd1eQDDBVX835B7XqCNo,48938 +pybind11/include/pybind11/detail/typeid.h,sha256=jw5pr9m72vkDsloT8vxl9wj17VJGcEdXDyziBlt89Js,1625 +pybind11/include/pybind11/detail/value_and_holder.h,sha256=hwNYlqxjUhlUqihwMjr6s3LhhKlZiTLaWREtQrgOAkQ,2814 +pybind11/include/pybind11/eigen.h,sha256=-HmSA1kgwCQ-GHUt7PHtTEc-vxqw9xARpF8PHWJip28,316 +pybind11/include/pybind11/eigen/common.h,sha256=dIeqmK7IzW5K4k2larPnA1A863rDp38U9YbNIwiIyYk,378 +pybind11/include/pybind11/eigen/matrix.h,sha256=VjCfx8M2AcD3m8THUbIEYidJyIClaNw9jMbd_Fzfo1s,32142 +pybind11/include/pybind11/eigen/tensor.h,sha256=csE3_N9yy-9k0SWQPJuAxmv8Jp_-lFrrPdVOyMV8-gc,18384 +pybind11/include/pybind11/embed.h,sha256=F3JQiOWnLGSuZ0NuEyBWFhHyVdczD8D_67kriU4QfsY,13362 +pybind11/include/pybind11/eval.h,sha256=7re-O2Eor1yD0Q_KgFkHIjKD17ejzII687Yszl9_KfE,4731 +pybind11/include/pybind11/functional.h,sha256=iOyYuNmbI-K3zgc1IMDwe4iHEOO3F8vwZbVSvbgxFQ4,5267 +pybind11/include/pybind11/gil.h,sha256=hsJj6z1iXqlo5c7fPCgEvK_-eeDoKZm7PKPwPNCdVVo,7702 +pybind11/include/pybind11/gil_safe_call_once.h,sha256=KKcy9Wgc_MJY-U5WpCZeNyzW7oVmC-d6yXkgephZ7zs,3993 +pybind11/include/pybind11/iostream.h,sha256=K5rPXoCYN325r1PptcJCIhPhgtRtTJQjMr7bvUIOwxk,8862 +pybind11/include/pybind11/numpy.h,sha256=xREhfycUTCOPF8CF-UWRdoLX0B23V6YWRiBqeRRElZg,84442 +pybind11/include/pybind11/operators.h,sha256=224RoAXcv1la4NNY9rQ3aD_AeC8S9ZKx3HVK1O8B4MU,9103 +pybind11/include/pybind11/options.h,sha256=qXvmnj--9fZSp56NYefnB3W5V17ppHlY1Srgo3DNBpw,2734 +pybind11/include/pybind11/pybind11.h,sha256=hbzXHRCBIW7dwtwaKjXKPC0Nl1MGHZ5-BjGsMlE3LuU,129898 +pybind11/include/pybind11/pytypes.h,sha256=BF8x4S5fsAzWf-d9pu83UsqjwRRo0ragHPy9sDOpUvk,99894 +pybind11/include/pybind11/stl.h,sha256=aMi1OCCw2Zb-IRLSlAtQEJJHtWsRJiLT9dKDMHST1Ic,15532 +pybind11/include/pybind11/stl/filesystem.h,sha256=lcYRCwNA8Xf4e4FRbeYh36SAwQjxKgyTXXdrguR4gM4,4559 +pybind11/include/pybind11/stl_bind.h,sha256=B5t8E0A4Zdgm2sF0J8Q_UI2U5uqEBQ9TsJCelsJ4q0E,28495 +pybind11/include/pybind11/type_caster_pyobject_ptr.h,sha256=H7pKBYTvUlibiJQEcKmeAkygSQwoCkuIyukNSDmVq-U,1929 +pybind11/include/pybind11/typing.h,sha256=PIjZFNNzY_KsrkHQPlg0Vt24jlTi6kThdOldEJjchtY,7000 +pybind11/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pybind11/setup_helpers.py,sha256=AwD_CjfVzX653nW4_i0U4bkFMCG4ZILoMZixyL8CZ4o,17490 +pybind11/share/cmake/pybind11/FindPythonLibsNew.cmake,sha256=_ZVzgVp6GQSEEv-b2iuauqTgoi1k2jHiNJlpl25MN-4,12187 +pybind11/share/cmake/pybind11/pybind11Common.cmake,sha256=lvJJ518cN7SjKDgjpXw0XU0eKW358wEloIcKCyCNPB0,16164 +pybind11/share/cmake/pybind11/pybind11Config.cmake,sha256=I96KX_zIZvLHbedHknVBj2YKhMt_QjM5LhCbzVNTvD8,7959 +pybind11/share/cmake/pybind11/pybind11ConfigVersion.cmake,sha256=vDsLSBg7-Nop8Ar9wRe0xKgGUV4LRzWE4XE0kE5B6fE,1403 +pybind11/share/cmake/pybind11/pybind11GuessPythonExtSuffix.cmake,sha256=WvhK2E-vWi9ArY0WJZXEK4kEFHpDQjl-au963hqH0r0,3321 +pybind11/share/cmake/pybind11/pybind11NewTools.cmake,sha256=zGLNjL28gzi8tvwiabudLsye7id_sZI5ooYfiBBllvM,12169 +pybind11/share/cmake/pybind11/pybind11Targets.cmake,sha256=tIjPtIpfb5m9POtu484cjGgNyWc5E4bbKzESLrcOLA0,4271 +pybind11/share/cmake/pybind11/pybind11Tools.cmake,sha256=5K6EahoS7wIaQIhjrDS4p4jTpYr0b_MronXKee8zCAc,8565 +pybind11/share/pkgconfig/pybind11.pc,sha256=M17R2NbpW6o7ujxioMP5M6WgVGrmJ_1vu_-E-H_rbes,171 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/REQUESTED b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/REQUESTED new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/entry_points.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..8de5a647622cd4cbe433550be7a6b91be72e304a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/entry_points.txt @@ -0,0 +1,5 @@ +[console_scripts] +pybind11-config = pybind11.__main__:main + +[pipx.run] +pybind11 = pybind11.__main__:main diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/cpp_conduit.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/cpp_conduit.h new file mode 100644 index 0000000000000000000000000000000000000000..b66c2d39c0019dcfc8a5fd137bbf37d2515a97b2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/cpp_conduit.h @@ -0,0 +1,77 @@ +// Copyright (c) 2024 The pybind Community. + +#pragma once + +#include + +#include "common.h" +#include "internals.h" + +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +// Forward declaration needed here: Refactoring opportunity. +extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *); + +inline bool type_is_managed_by_our_internals(PyTypeObject *type_obj) { +#if defined(PYPY_VERSION) + auto &internals = get_internals(); + return bool(internals.registered_types_py.find(type_obj) + != internals.registered_types_py.end()); +#else + return bool(type_obj->tp_new == pybind11_object_new); +#endif +} + +inline bool is_instance_method_of_type(PyTypeObject *type_obj, PyObject *attr_name) { + PyObject *descr = _PyType_Lookup(type_obj, attr_name); + return bool((descr != nullptr) && PyInstanceMethod_Check(descr)); +} + +inline object try_get_cpp_conduit_method(PyObject *obj) { + if (PyType_Check(obj)) { + return object(); + } + PyTypeObject *type_obj = Py_TYPE(obj); + str attr_name("_pybind11_conduit_v1_"); + bool assumed_to_be_callable = false; + if (type_is_managed_by_our_internals(type_obj)) { + if (!is_instance_method_of_type(type_obj, attr_name.ptr())) { + return object(); + } + assumed_to_be_callable = true; + } + PyObject *method = PyObject_GetAttr(obj, attr_name.ptr()); + if (method == nullptr) { + PyErr_Clear(); + return object(); + } + if (!assumed_to_be_callable && PyCallable_Check(method) == 0) { + Py_DECREF(method); + return object(); + } + return reinterpret_steal(method); +} + +inline void *try_raw_pointer_ephemeral_from_cpp_conduit(handle src, + const std::type_info *cpp_type_info) { + object method = try_get_cpp_conduit_method(src.ptr()); + if (method) { + capsule cpp_type_info_capsule(const_cast(static_cast(cpp_type_info)), + typeid(std::type_info).name()); + object cpp_conduit = method(bytes(PYBIND11_PLATFORM_ABI_ID), + cpp_type_info_capsule, + bytes("raw_pointer_ephemeral")); + if (isinstance(cpp_conduit)) { + return reinterpret_borrow(cpp_conduit).get_pointer(); + } + } + return nullptr; +} + +#define PYBIND11_HAS_CPP_CONDUIT 1 + +PYBIND11_NAMESPACE_END(detail) +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/gil.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/gil.h new file mode 100644 index 0000000000000000000000000000000000000000..6b0edaee4e5521569fd985f273e3892fdcefe17f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/gil.h @@ -0,0 +1,219 @@ +/* + pybind11/gil.h: RAII helpers for managing the GIL + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" + +#include + +#if !defined(PYBIND11_SIMPLE_GIL_MANAGEMENT) +# include "detail/internals.h" +#endif + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +PYBIND11_NAMESPACE_BEGIN(detail) + +// forward declarations +PyThreadState *get_thread_state_unchecked(); + +PYBIND11_NAMESPACE_END(detail) + +#if !defined(PYBIND11_SIMPLE_GIL_MANAGEMENT) + +/* The functions below essentially reproduce the PyGILState_* API using a RAII + * pattern, but there are a few important differences: + * + * 1. When acquiring the GIL from an non-main thread during the finalization + * phase, the GILState API blindly terminates the calling thread, which + * is often not what is wanted. This API does not do this. + * + * 2. The gil_scoped_release function can optionally cut the relationship + * of a PyThreadState and its associated thread, which allows moving it to + * another thread (this is a fairly rare/advanced use case). + * + * 3. The reference count of an acquired thread state can be controlled. This + * can be handy to prevent cases where callbacks issued from an external + * thread would otherwise constantly construct and destroy thread state data + * structures. + * + * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an + * example which uses features 2 and 3 to migrate the Python thread of + * execution to another thread (to run the event loop on the original thread, + * in this case). + */ + +class gil_scoped_acquire { +public: + PYBIND11_NOINLINE gil_scoped_acquire() { + auto &internals = detail::get_internals(); + tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate); + + if (!tstate) { + /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if + calling from a Python thread). Since we use a different key, this ensures + we don't create a new thread state and deadlock in PyEval_AcquireThread + below. Note we don't save this state with internals.tstate, since we don't + create it we would fail to clear it (its reference count should be > 0). */ + tstate = PyGILState_GetThisThreadState(); + } + + if (!tstate) { + tstate = PyThreadState_New(internals.istate); +# if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + if (!tstate) { + pybind11_fail("scoped_acquire: could not create thread state!"); + } +# endif + tstate->gilstate_counter = 0; + PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate); + } else { + release = detail::get_thread_state_unchecked() != tstate; + } + + if (release) { + PyEval_AcquireThread(tstate); + } + + inc_ref(); + } + + gil_scoped_acquire(const gil_scoped_acquire &) = delete; + gil_scoped_acquire &operator=(const gil_scoped_acquire &) = delete; + + void inc_ref() { ++tstate->gilstate_counter; } + + PYBIND11_NOINLINE void dec_ref() { + --tstate->gilstate_counter; +# if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + if (detail::get_thread_state_unchecked() != tstate) { + pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!"); + } + if (tstate->gilstate_counter < 0) { + pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!"); + } +# endif + if (tstate->gilstate_counter == 0) { +# if defined(PYBIND11_DETAILED_ERROR_MESSAGES) + if (!release) { + pybind11_fail("scoped_acquire::dec_ref(): internal error!"); + } +# endif + PyThreadState_Clear(tstate); + if (active) { + PyThreadState_DeleteCurrent(); + } + PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate); + release = false; + } + } + + /// This method will disable the PyThreadState_DeleteCurrent call and the + /// GIL won't be acquired. This method should be used if the interpreter + /// could be shutting down when this is called, as thread deletion is not + /// allowed during shutdown. Check _Py_IsFinalizing() on Python 3.7+, and + /// protect subsequent code. + PYBIND11_NOINLINE void disarm() { active = false; } + + PYBIND11_NOINLINE ~gil_scoped_acquire() { + dec_ref(); + if (release) { + PyEval_SaveThread(); + } + } + +private: + PyThreadState *tstate = nullptr; + bool release = true; + bool active = true; +}; + +class gil_scoped_release { +public: + // PRECONDITION: The GIL must be held when this constructor is called. + explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) { + assert(PyGILState_Check()); + // `get_internals()` must be called here unconditionally in order to initialize + // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an + // initialization race could occur as multiple threads try `gil_scoped_acquire`. + auto &internals = detail::get_internals(); + // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) + tstate = PyEval_SaveThread(); + if (disassoc) { + // Python >= 3.7 can remove this, it's an int before 3.7 + // NOLINTNEXTLINE(readability-qualified-auto) + auto key = internals.tstate; + PYBIND11_TLS_DELETE_VALUE(key); + } + } + + gil_scoped_release(const gil_scoped_release &) = delete; + gil_scoped_release &operator=(const gil_scoped_release &) = delete; + + /// This method will disable the PyThreadState_DeleteCurrent call and the + /// GIL won't be acquired. This method should be used if the interpreter + /// could be shutting down when this is called, as thread deletion is not + /// allowed during shutdown. Check _Py_IsFinalizing() on Python 3.7+, and + /// protect subsequent code. + PYBIND11_NOINLINE void disarm() { active = false; } + + ~gil_scoped_release() { + if (!tstate) { + return; + } + // `PyEval_RestoreThread()` should not be called if runtime is finalizing + if (active) { + PyEval_RestoreThread(tstate); + } + if (disassoc) { + // Python >= 3.7 can remove this, it's an int before 3.7 + // NOLINTNEXTLINE(readability-qualified-auto) + auto key = detail::get_internals().tstate; + PYBIND11_TLS_REPLACE_VALUE(key, tstate); + } + } + +private: + PyThreadState *tstate; + bool disassoc; + bool active = true; +}; + +#else // PYBIND11_SIMPLE_GIL_MANAGEMENT + +class gil_scoped_acquire { + PyGILState_STATE state; + +public: + gil_scoped_acquire() : state{PyGILState_Ensure()} {} + gil_scoped_acquire(const gil_scoped_acquire &) = delete; + gil_scoped_acquire &operator=(const gil_scoped_acquire &) = delete; + ~gil_scoped_acquire() { PyGILState_Release(state); } + void disarm() {} +}; + +class gil_scoped_release { + PyThreadState *state; + +public: + // PRECONDITION: The GIL must be held when this constructor is called. + gil_scoped_release() { + assert(PyGILState_Check()); + state = PyEval_SaveThread(); + } + gil_scoped_release(const gil_scoped_release &) = delete; + gil_scoped_release &operator=(const gil_scoped_release &) = delete; + ~gil_scoped_release() { PyEval_RestoreThread(state); } + void disarm() {} +}; + +#endif // PYBIND11_SIMPLE_GIL_MANAGEMENT + +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/iostream.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/iostream.h new file mode 100644 index 0000000000000000000000000000000000000000..1878089e3171bcfbd575ff4d7f925f981489316c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/iostream.h @@ -0,0 +1,265 @@ +/* + pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python + + Copyright (c) 2017 Henry F. Schreiner + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. + + WARNING: The implementation in this file is NOT thread safe. Multiple + threads writing to a redirected ostream concurrently cause data races + and potentially buffer overflows. Therefore it is currently a requirement + that all (possibly) concurrent redirected ostream writes are protected by + a mutex. + #HelpAppreciated: Work on iostream.h thread safety. + For more background see the discussions under + https://github.com/pybind/pybind11/pull/2982 and + https://github.com/pybind/pybind11/pull/2995. +*/ + +#pragma once + +#include "pybind11.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +PYBIND11_NAMESPACE_BEGIN(detail) + +// Buffer that writes to Python instead of C++ +class pythonbuf : public std::streambuf { +private: + using traits_type = std::streambuf::traits_type; + + const size_t buf_size; + std::unique_ptr d_buffer; + object pywrite; + object pyflush; + + int overflow(int c) override { + if (!traits_type::eq_int_type(c, traits_type::eof())) { + *pptr() = traits_type::to_char_type(c); + pbump(1); + } + return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof(); + } + + // Computes how many bytes at the end of the buffer are part of an + // incomplete sequence of UTF-8 bytes. + // Precondition: pbase() < pptr() + size_t utf8_remainder() const { + const auto rbase = std::reverse_iterator(pbase()); + const auto rpptr = std::reverse_iterator(pptr()); + auto is_ascii = [](char c) { return (static_cast(c) & 0x80) == 0x00; }; + auto is_leading = [](char c) { return (static_cast(c) & 0xC0) == 0xC0; }; + auto is_leading_2b = [](char c) { return static_cast(c) <= 0xDF; }; + auto is_leading_3b = [](char c) { return static_cast(c) <= 0xEF; }; + // If the last character is ASCII, there are no incomplete code points + if (is_ascii(*rpptr)) { + return 0; + } + // Otherwise, work back from the end of the buffer and find the first + // UTF-8 leading byte + const auto rpend = rbase - rpptr >= 3 ? rpptr + 3 : rbase; + const auto leading = std::find_if(rpptr, rpend, is_leading); + if (leading == rbase) { + return 0; + } + const auto dist = static_cast(leading - rpptr); + size_t remainder = 0; + + if (dist == 0) { + remainder = 1; // 1-byte code point is impossible + } else if (dist == 1) { + remainder = is_leading_2b(*leading) ? 0 : dist + 1; + } else if (dist == 2) { + remainder = is_leading_3b(*leading) ? 0 : dist + 1; + } + // else if (dist >= 3), at least 4 bytes before encountering an UTF-8 + // leading byte, either no remainder or invalid UTF-8. + // Invalid UTF-8 will cause an exception later when converting + // to a Python string, so that's not handled here. + return remainder; + } + + // This function must be non-virtual to be called in a destructor. + int _sync() { + if (pbase() != pptr()) { // If buffer is not empty + gil_scoped_acquire tmp; + // This subtraction cannot be negative, so dropping the sign. + auto size = static_cast(pptr() - pbase()); + size_t remainder = utf8_remainder(); + + if (size > remainder) { + str line(pbase(), size - remainder); + pywrite(std::move(line)); + pyflush(); + } + + // Copy the remainder at the end of the buffer to the beginning: + if (remainder > 0) { + std::memmove(pbase(), pptr() - remainder, remainder); + } + setp(pbase(), epptr()); + pbump(static_cast(remainder)); + } + return 0; + } + + int sync() override { return _sync(); } + +public: + explicit pythonbuf(const object &pyostream, size_t buffer_size = 1024) + : buf_size(buffer_size), d_buffer(new char[buf_size]), pywrite(pyostream.attr("write")), + pyflush(pyostream.attr("flush")) { + setp(d_buffer.get(), d_buffer.get() + buf_size - 1); + } + + pythonbuf(pythonbuf &&) = default; + + /// Sync before destroy + ~pythonbuf() override { _sync(); } +}; + +PYBIND11_NAMESPACE_END(detail) + +/** \rst + This a move-only guard that redirects output. + + .. code-block:: cpp + + #include + + ... + + { + py::scoped_ostream_redirect output; + std::cout << "Hello, World!"; // Python stdout + } // <-- return std::cout to normal + + You can explicitly pass the c++ stream and the python object, + for example to guard stderr instead. + + .. code-block:: cpp + + { + py::scoped_ostream_redirect output{ + std::cerr, py::module::import("sys").attr("stderr")}; + std::cout << "Hello, World!"; + } + \endrst */ +class scoped_ostream_redirect { +protected: + std::streambuf *old; + std::ostream &costream; + detail::pythonbuf buffer; + +public: + explicit scoped_ostream_redirect(std::ostream &costream = std::cout, + const object &pyostream + = module_::import("sys").attr("stdout")) + : costream(costream), buffer(pyostream) { + old = costream.rdbuf(&buffer); + } + + ~scoped_ostream_redirect() { costream.rdbuf(old); } + + scoped_ostream_redirect(const scoped_ostream_redirect &) = delete; + scoped_ostream_redirect(scoped_ostream_redirect &&other) = default; + scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete; + scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete; +}; + +/** \rst + Like `scoped_ostream_redirect`, but redirects cerr by default. This class + is provided primary to make ``py::call_guard`` easier to make. + + .. code-block:: cpp + + m.def("noisy_func", &noisy_func, + py::call_guard()); + +\endrst */ +class scoped_estream_redirect : public scoped_ostream_redirect { +public: + explicit scoped_estream_redirect(std::ostream &costream = std::cerr, + const object &pyostream + = module_::import("sys").attr("stderr")) + : scoped_ostream_redirect(costream, pyostream) {} +}; + +PYBIND11_NAMESPACE_BEGIN(detail) + +// Class to redirect output as a context manager. C++ backend. +class OstreamRedirect { + bool do_stdout_; + bool do_stderr_; + std::unique_ptr redirect_stdout; + std::unique_ptr redirect_stderr; + +public: + explicit OstreamRedirect(bool do_stdout = true, bool do_stderr = true) + : do_stdout_(do_stdout), do_stderr_(do_stderr) {} + + void enter() { + if (do_stdout_) { + redirect_stdout.reset(new scoped_ostream_redirect()); + } + if (do_stderr_) { + redirect_stderr.reset(new scoped_estream_redirect()); + } + } + + void exit() { + redirect_stdout.reset(); + redirect_stderr.reset(); + } +}; + +PYBIND11_NAMESPACE_END(detail) + +/** \rst + This is a helper function to add a C++ redirect context manager to Python + instead of using a C++ guard. To use it, add the following to your binding code: + + .. code-block:: cpp + + #include + + ... + + py::add_ostream_redirect(m, "ostream_redirect"); + + You now have a Python context manager that redirects your output: + + .. code-block:: python + + with m.ostream_redirect(): + m.print_to_cout_function() + + This manager can optionally be told which streams to operate on: + + .. code-block:: python + + with m.ostream_redirect(stdout=true, stderr=true): + m.noisy_function_with_error_printing() + + \endrst */ +inline class_ +add_ostream_redirect(module_ m, const std::string &name = "ostream_redirect") { + return class_(std::move(m), name.c_str(), module_local()) + .def(init(), arg("stdout") = true, arg("stderr") = true) + .def("__enter__", &detail::OstreamRedirect::enter) + .def("__exit__", [](detail::OstreamRedirect &self_, const args &) { self_.exit(); }); +} + +PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_C.cpython-311-x86_64-linux-gnu.so b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_C.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..acb9f6e3e8dab7a9f988196266a61b1478c977e0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_C.cpython-311-x86_64-linux-gnu.so differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.pyi b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.pyi new file mode 100644 index 0000000000000000000000000000000000000000..8f6e61bf678ae19145799f44bb6beceac9bf24c9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.pyi @@ -0,0 +1,25648 @@ +# @generated from torch/_C/_VariableFunctions.pyi.in +# mypy: disable-error-code="type-arg" + +import builtins +from typing import ( + Any, + Callable, + ContextManager, + Iterator, + List, + Literal, + NamedTuple, + Optional, + overload, + Sequence, + Tuple, + TypeVar, + Union, +) + +import torch +from torch import contiguous_format, Generator, inf, memory_format, strided, SymInt, Tensor +from torch.types import ( + _bool, + _complex, + _device, + _dtype, + _float, + _int, + _layout, + _qscheme, + _size, + Device, + Number, +) + +from torch._prims_common import DeviceLikeType + +@overload +def __and__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __and__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __lshift__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __lshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __or__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __or__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __rshift__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __rshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __xor__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __xor__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +def _adaptive_avg_pool2d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ... +def _adaptive_avg_pool3d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ... +def _add_batch_dim(input: Tensor, batch_dim: _int, level: _int) -> Tensor: ... +@overload +def _add_relu(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def _add_relu(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def _add_relu_(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def _add_relu_(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +def _addmm_activation(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, use_gelu: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def _aminmax(input: Tensor) -> Tuple[Tensor, Tensor]: ... +@overload +def _aminmax(input: Tensor, dim: _int, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: ... +def _amp_foreach_non_finite_check_and_unscale_(self: Union[Tuple[Tensor, ...], List[Tensor]], found_inf: Tensor, inv_scale: Tensor) -> None: ... +def _amp_update_scale_(input: Tensor, growth_tracker: Tensor, found_inf: Tensor, scale_growth_factor: _float, scale_backoff_factor: _float, growth_interval: _int) -> Tensor: ... +@overload +def _assert_async(input: Tensor) -> None: + r""" + _assert_async(tensor) -> void + + Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, + this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for + CUDA tensors, we DO NOT synchronize and you may only find out the assertion + failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for + testing invariants in CUDA tensors without giving up performance. This function + is NOT intended to be used for regular error checking, as it will trash your CUDA + context if the assert fails (forcing you to restart your PyTorch process.) + + Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. + """ + ... +@overload +def _assert_async(input: Tensor, assert_msg: str) -> None: + r""" + _assert_async(tensor) -> void + + Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, + this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for + CUDA tensors, we DO NOT synchronize and you may only find out the assertion + failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for + testing invariants in CUDA tensors without giving up performance. This function + is NOT intended to be used for regular error checking, as it will trash your CUDA + context if the assert fails (forcing you to restart your PyTorch process.) + + Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. + """ + ... +def _assert_scalar(self: Union[Number, _complex], assert_msg: str) -> None: ... +def _assert_tensor_metadata(a: Tensor, size: Optional[Sequence[Union[_int, SymInt]]] = None, stride: Optional[Sequence[Union[_int, SymInt]]] = None, dtype: Optional[_dtype] = None) -> None: ... +def _batch_norm_impl_index(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, _int]: ... +def _cast_Byte(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Char(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Double(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Float(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Half(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Int(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Long(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Short(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _choose_qparams_per_tensor(input: Tensor, reduce_range: _bool = False) -> Tuple[_float, _int]: ... +def _chunk_cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int, num_chunks: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _coalesce(input: Tensor) -> Tensor: ... +def _compute_linear_combination(input: Tensor, coefficients: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _conj(input: Tensor) -> Tensor: ... +def _conj_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _conj_physical(input: Tensor) -> Tensor: ... +def _convert_indices_from_coo_to_csr(input: Tensor, size: _int, *, out_int32: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +def _convert_indices_from_csr_to_coo(crow_indices: Tensor, col_indices: Tensor, *, out_int32: _bool = False, transpose: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +def _convert_weight_to_int4pack(input: Tensor, innerKTiles: _int) -> Tensor: ... +@overload +def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: _size, groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool) -> Tensor: ... +@overload +def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool, allow_tf32: _bool) -> Tensor: ... +def _convolution_mode(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: str, dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def _copy_from(input: Tensor, dst: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _copy_from_and_resize(input: Tensor, dst: Tensor) -> Tensor: ... +def _cslt_compress(input: Tensor) -> Tensor: ... +def _cslt_sparse_mm(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False, alg_id: _int = 0) -> Tensor: ... +def _cslt_sparse_mm_search(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False) -> _int: ... +@overload +def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ... +@overload +def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ... +@overload +def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ... +def _cudnn_init_dropout_state(dropout: _float, train: _bool, dropout_seed: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _cudnn_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, weight_buf: Optional[Tensor], hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: Sequence[Union[_int, SymInt]], dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def _cudnn_rnn_flatten_weight(weight_arr: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, input_size: Union[_int, SymInt], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, bidirectional: _bool) -> Tensor: ... +def _cufft_clear_plan_cache(device_index: _int) -> None: ... +def _cufft_get_plan_cache_max_size(device_index: _int) -> _int: ... +def _cufft_get_plan_cache_size(device_index: _int) -> _int: ... +def _cufft_set_plan_cache_max_size(device_index: _int, max_size: _int) -> None: ... +def _cummax_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ... +def _cummin_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ... +def _debug_has_internal_overlap(input: Tensor) -> _int: ... +def _dim_arange(like: Tensor, dim: _int) -> Tensor: ... +def _dirichlet_grad(x: Tensor, alpha: Tensor, total: Tensor) -> Tensor: ... +def _disable_functionalization(): ... +@overload +def _efficientzerotensor(size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _efficientzerotensor(*size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def _embedding_bag_forward_only(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +@overload +def _empty_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_affine_quantized(*size: _int, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_per_channel_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_per_channel_affine_quantized(*size: _int, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _enable_functionalization(*, reapply_views: _bool = False): ... +def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: ... +def _fake_quantize_learnable_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ... +def _fake_quantize_learnable_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ... +def _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(input: Tensor, scale: Tensor, zero_point: Tensor, fake_quant_enabled: Tensor, quant_min: _int, quant_max: _int) -> torch.return_types._fake_quantize_per_tensor_affine_cachemask_tensor_qparams: ... +def _fft_c2c(input: Tensor, dim: Sequence[Union[_int, SymInt]], normalization: _int, forward: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _fft_c2r(input: Tensor, dim: _size, normalization: _int, last_dim_size: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: ... +def _fft_r2c(input: Tensor, dim: _size, normalization: _int, onesided: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _fill_mem_eff_dropout_mask_(input: Tensor, dropout_p: _float, seed: _int, offset: _int) -> Tensor: ... +def _foobar(input: Tensor, arg1: _bool = True, arg2: _bool = True, *, arg3: _bool = True) -> Tensor: ... +def _foreach_abs(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_abs(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.abs` to each Tensor of the input list. + """ + ... +def _foreach_abs_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_abs_(self: List[Tensor]) -> None + + Apply :func:`torch.abs` to each Tensor of the input list. + """ + ... +def _foreach_acos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_acos(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.acos` to each Tensor of the input list. + """ + ... +def _foreach_acos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_acos_(self: List[Tensor]) -> None + + Apply :func:`torch.acos` to each Tensor of the input list. + """ + ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ... +def _foreach_asin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_asin(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.asin` to each Tensor of the input list. + """ + ... +def _foreach_asin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_asin_(self: List[Tensor]) -> None + + Apply :func:`torch.asin` to each Tensor of the input list. + """ + ... +def _foreach_atan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_atan(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.atan` to each Tensor of the input list. + """ + ... +def _foreach_atan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_atan_(self: List[Tensor]) -> None + + Apply :func:`torch.atan` to each Tensor of the input list. + """ + ... +def _foreach_ceil(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_ceil(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.ceil` to each Tensor of the input list. + """ + ... +def _foreach_ceil_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_ceil_(self: List[Tensor]) -> None + + Apply :func:`torch.ceil` to each Tensor of the input list. + """ + ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_copy_(self: Union[Tuple[Tensor, ...], List[Tensor]], src: Union[Tuple[Tensor, ...], List[Tensor]], non_blocking: _bool = False) -> None: ... +def _foreach_cos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_cos(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.cos` to each Tensor of the input list. + """ + ... +def _foreach_cos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_cos_(self: List[Tensor]) -> None + + Apply :func:`torch.cos` to each Tensor of the input list. + """ + ... +def _foreach_cosh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_cosh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.cosh` to each Tensor of the input list. + """ + ... +def _foreach_cosh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_cosh_(self: List[Tensor]) -> None + + Apply :func:`torch.cosh` to each Tensor of the input list. + """ + ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_erf(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_erf(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.erf` to each Tensor of the input list. + """ + ... +def _foreach_erf_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_erf_(self: List[Tensor]) -> None + + Apply :func:`torch.erf` to each Tensor of the input list. + """ + ... +def _foreach_erfc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_erfc(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.erfc` to each Tensor of the input list. + """ + ... +def _foreach_erfc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_erfc_(self: List[Tensor]) -> None + + Apply :func:`torch.erfc` to each Tensor of the input list. + """ + ... +def _foreach_exp(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_exp(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.exp` to each Tensor of the input list. + """ + ... +def _foreach_exp_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_exp_(self: List[Tensor]) -> None + + Apply :func:`torch.exp` to each Tensor of the input list. + """ + ... +def _foreach_expm1(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_expm1(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.expm1` to each Tensor of the input list. + """ + ... +def _foreach_expm1_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_expm1_(self: List[Tensor]) -> None + + Apply :func:`torch.expm1` to each Tensor of the input list. + """ + ... +def _foreach_floor(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_floor(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.floor` to each Tensor of the input list. + """ + ... +def _foreach_floor_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_floor_(self: List[Tensor]) -> None + + Apply :func:`torch.floor` to each Tensor of the input list. + """ + ... +def _foreach_frac(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_frac(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.frac` to each Tensor of the input list. + """ + ... +def _foreach_frac_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_frac_(self: List[Tensor]) -> None + + Apply :func:`torch.frac` to each Tensor of the input list. + """ + ... +@overload +def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> None: ... +@overload +def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_lgamma(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_lgamma(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.lgamma` to each Tensor of the input list. + """ + ... +def _foreach_lgamma_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_lgamma_(self: List[Tensor]) -> None + + Apply :func:`torch.lgamma` to each Tensor of the input list. + """ + ... +def _foreach_log(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log` to each Tensor of the input list. + """ + ... +def _foreach_log10(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log10(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log10` to each Tensor of the input list. + """ + ... +def _foreach_log10_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log10_(self: List[Tensor]) -> None + + Apply :func:`torch.log10` to each Tensor of the input list. + """ + ... +def _foreach_log1p(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log1p(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log1p` to each Tensor of the input list. + """ + ... +def _foreach_log1p_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log1p_(self: List[Tensor]) -> None + + Apply :func:`torch.log1p` to each Tensor of the input list. + """ + ... +def _foreach_log2(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log2(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log2` to each Tensor of the input list. + """ + ... +def _foreach_log2_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log2_(self: List[Tensor]) -> None + + Apply :func:`torch.log2` to each Tensor of the input list. + """ + ... +def _foreach_log_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log_(self: List[Tensor]) -> None + + Apply :func:`torch.log` to each Tensor of the input list. + """ + ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_neg(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_neg(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.neg` to each Tensor of the input list. + """ + ... +def _foreach_neg_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_neg_(self: List[Tensor]) -> None + + Apply :func:`torch.neg` to each Tensor of the input list. + """ + ... +def _foreach_norm(self: Union[Tuple[Tensor, ...], List[Tensor]], ord: Union[Number, _complex] = 2) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Number, _complex], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> None: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_reciprocal(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_reciprocal(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.reciprocal` to each Tensor of the input list. + """ + ... +def _foreach_reciprocal_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_reciprocal_(self: List[Tensor]) -> None + + Apply :func:`torch.reciprocal` to each Tensor of the input list. + """ + ... +def _foreach_round(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_round(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.round` to each Tensor of the input list. + """ + ... +def _foreach_round_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_round_(self: List[Tensor]) -> None + + Apply :func:`torch.round` to each Tensor of the input list. + """ + ... +def _foreach_sigmoid(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sigmoid(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sigmoid` to each Tensor of the input list. + """ + ... +def _foreach_sigmoid_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sigmoid_(self: List[Tensor]) -> None + + Apply :func:`torch.sigmoid` to each Tensor of the input list. + """ + ... +def _foreach_sign(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +def _foreach_sign_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_sin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sin(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sin` to each Tensor of the input list. + """ + ... +def _foreach_sin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sin_(self: List[Tensor]) -> None + + Apply :func:`torch.sin` to each Tensor of the input list. + """ + ... +def _foreach_sinh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sinh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sinh` to each Tensor of the input list. + """ + ... +def _foreach_sinh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sinh_(self: List[Tensor]) -> None + + Apply :func:`torch.sinh` to each Tensor of the input list. + """ + ... +def _foreach_sqrt(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sqrt(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sqrt` to each Tensor of the input list. + """ + ... +def _foreach_sqrt_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sqrt_(self: List[Tensor]) -> None + + Apply :func:`torch.sqrt` to each Tensor of the input list. + """ + ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +def _foreach_tan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_tan(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.tan` to each Tensor of the input list. + """ + ... +def _foreach_tan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_tan_(self: List[Tensor]) -> None + + Apply :func:`torch.tan` to each Tensor of the input list. + """ + ... +def _foreach_tanh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_tanh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.tanh` to each Tensor of the input list. + """ + ... +def _foreach_tanh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_tanh_(self: List[Tensor]) -> None + + Apply :func:`torch.tanh` to each Tensor of the input list. + """ + ... +def _foreach_trunc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_trunc(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.trunc` to each Tensor of the input list. + """ + ... +def _foreach_trunc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_trunc_(self: List[Tensor]) -> None + + Apply :func:`torch.trunc` to each Tensor of the input list. + """ + ... +def _foreach_zero_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_zero_(self: List[Tensor]) -> None + + Apply :func:`torch.zero` to each Tensor of the input list. + """ + ... +def _from_functional_tensor(t: Tensor) -> Tensor: ... +def _functional_assert_async(input: Tensor, assert_msg: str, dep_token: Tensor) -> Tensor: ... +def _functional_assert_scalar(self: Union[Number, _complex], assert_msg: str, dep_token: Tensor) -> Tensor: ... +def _functional_sym_constrain_range(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ... +def _functional_sym_constrain_range_for_size(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ... +def _functionalize_are_all_mutations_hidden_from_autograd(t: Tensor) -> _bool: ... +def _functionalize_are_all_mutations_under_no_grad_or_inference_mode(t: Tensor) -> _bool: ... +def _functionalize_commit_update(t: Tensor) -> None: ... +def _functionalize_mark_mutation_hidden_from_autograd(t: Tensor) -> None: ... +def _functionalize_replace(self_: Tensor, other: Tensor) -> None: ... +def _functionalize_sync(t: Tensor) -> None: ... +@overload +def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +def _fused_dropout(input: Tensor, p: _float, generator: Optional[Generator] = None) -> Tuple[Tensor, Tensor]: ... +def _fused_moving_avg_obs_fq_helper(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> torch.return_types._fused_moving_avg_obs_fq_helper: ... +def _fused_sdp_choice(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> _int: ... +@overload +def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: Tensor, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: _float, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +def _fw_primal_copy(input: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _grid_sampler_2d_cpu_fallback(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def _has_compatible_shallow_copy_type(input: Tensor, from_: Tensor) -> _bool: ... +def _histogramdd_bin_edges(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tuple[Tensor, ...]: ... +def _histogramdd_from_bin_cts(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ... +def _histogramdd_from_bin_tensors(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], *, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ... +def _index_put_impl_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False, unsafe: _bool = False) -> Tensor: ... +def _indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _int_mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _is_all_true(input: Tensor) -> Tensor: ... +def _is_any_true(input: Tensor) -> Tensor: ... +def _is_functional_tensor(t: Tensor) -> _bool: ... +def _is_zerotensor(input: Tensor) -> _bool: ... +def _lazy_clone(input: Tensor) -> Tensor: ... +def _linalg_check_errors(info: Tensor, api_name: str, *, is_matrix: _bool) -> None: ... +def _linalg_det(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_det: ... +def _linalg_eigh(A: Tensor, UPLO: str = "L", compute_v: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_eigh: ... +def _linalg_slogdet(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_slogdet: ... +def _linalg_solve_ex(A: Tensor, B: Tensor, *, left: _bool = True, check_errors: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_solve_ex: ... +def _linalg_svd(A: Tensor, full_matrices: _bool = False, compute_uv: _bool = True, *, driver: Optional[str] = None, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_svd: ... +def _log_softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: ... +def _logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _lstm_mps(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def _lu_with_info(input: Tensor, pivot: _bool = True, check_errors: _bool = True) -> torch.return_types._lu_with_info: ... +def _make_dep_token(*, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _make_dual(primal: Tensor, tangent: Tensor, level: _int) -> Tensor: ... +def _make_dual_copy(primal: Tensor, tangent: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _make_per_channel_quantized_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int) -> Tensor: ... +def _make_per_tensor_quantized_tensor(input: Tensor, scale: _float, zero_point: _int) -> Tensor: ... +def _masked_scale(input: Tensor, mask: Tensor, scale: _float) -> Tensor: ... +def _masked_softmax(input: Tensor, mask: Tensor, dim: Optional[_int] = None, mask_type: Optional[_int] = None) -> Tensor: ... +def _mixed_dtypes_linear(input: Tensor, weight: Tensor, scale: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None) -> Tensor: ... +def _mkldnn_reshape(input: Tensor, shape: _size) -> Tensor: ... +def _mkldnn_transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ... +def _mkldnn_transpose_(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ... +def _mps_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def _mps_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +@overload +def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +def _native_batch_norm_legit_no_training(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, momentum: _float, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def _native_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None, need_weights: _bool = True, average_attn_weights: _bool = True, mask_type: Optional[_int] = None) -> Tuple[Tensor, Tensor]: ... +def _neg_view(input: Tensor) -> Tensor: ... +def _neg_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_from_padded(padded: Tensor, cpu_nested_shape_example: Tensor, fuse_transform_0213: _bool = False) -> Tensor: ... +def _nested_from_padded_and_nested_example(padded: Tensor, nt_example: Tensor) -> Tensor: ... +def _nested_get_jagged_dummy(any: Tensor) -> Tensor: ... +def _nested_get_lengths(input: Tensor) -> Tensor: ... +def _nested_get_offsets(input: Tensor) -> Tensor: ... +def _nested_get_ragged_idx(input: Tensor) -> _int: ... +def _nested_get_values(input: Tensor) -> Tensor: ... +def _nested_get_values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_tensor_from_mask(t: Tensor, mask: Tensor, mask_check: _bool = True) -> Tensor: ... +def _nested_tensor_from_mask_left_aligned(t: Tensor, mask: Tensor) -> _bool: ... +def _nested_tensor_from_tensor_list(list: Union[Tuple[Tensor, ...], List[Tensor]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = None) -> Tensor: ... +def _nested_tensor_softmax_with_shape(input: Tensor, query: Tensor) -> Tensor: ... +def _nested_view_from_buffer(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor) -> Tensor: ... +def _nested_view_from_buffer_copy(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_view_from_jagged(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1) -> Tensor: ... +def _nested_view_from_jagged_copy(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nnpack_available() -> _bool: ... +def _nnpack_spatial_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def _pack_padded_sequence(input: Tensor, lengths: Tensor, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def _pad_packed_sequence(data: Tensor, batch_sizes: Tensor, batch_first: _bool, padding_value: Union[Number, _complex], total_length: _int) -> Tuple[Tensor, Tensor]: ... +def _pin_memory(input: Tensor, device: Optional[Optional[DeviceLikeType]] = None) -> Tensor: ... +def _prelu_kernel(input: Tensor, weight: Tensor) -> Tensor: ... +def _print(s: str) -> None: ... +def _propagate_xla_data(input: Tensor, output: Tensor) -> None: ... +def _remove_batch_dim(input: Tensor, level: _int, batch_size: _int, out_dim: _int) -> Tensor: ... +def _reshape_alias_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: ... +def _reshape_from_tensor(input: Tensor, shape: Tensor) -> Tensor: ... +def _resize_output_(input: Tensor, size: Sequence[Union[_int, SymInt]], device: Optional[DeviceLikeType]) -> Tensor: ... +def _rowwise_prune(weight: Tensor, mask: Tensor, compressed_indices_dtype: _dtype) -> Tuple[Tensor, Tensor]: ... +def _sample_dirichlet(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +def _saturate_weight_to_fp16(weight: Tensor) -> Tensor: ... +def _scaled_dot_product_attention_math(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, dropout_mask: Optional[Tensor] = None, *, scale: Optional[_float] = None) -> Tuple[Tensor, Tensor]: ... +def _scaled_dot_product_cudnn_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_cudnn_attention: ... +def _scaled_dot_product_efficient_attention(query: Tensor, key: Tensor, value: Tensor, attn_bias: Optional[Tensor], compute_log_sumexp: _bool, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_efficient_attention: ... +def _scaled_dot_product_flash_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention: ... +def _scaled_dot_product_flash_attention_for_cpu(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, *, attn_mask: Optional[Tensor] = None, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention_for_cpu: ... +def _scaled_mm(input: Tensor, mat2: Tensor, *, bias: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, scale_a: Optional[Tensor] = None, scale_b: Optional[Tensor] = None, scale_result: Optional[Tensor] = None, use_fast_accum: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor]: ... +def _shape_as_tensor(input: Tensor) -> Tensor: ... +def _sobol_engine_draw(quasi: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int, dtype: Optional[_dtype]) -> Tuple[Tensor, Tensor]: ... +def _sobol_engine_ff_(input: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int) -> Tensor: ... +def _sobol_engine_initialize_state_(input: Tensor, dimension: _int) -> Tensor: ... +def _sobol_engine_scramble_(input: Tensor, ltm: Tensor, dimension: _int) -> Tensor: ... +def _softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, grad_input: Optional[Tensor] = None) -> Tensor: ... +def _sparse_broadcast_to(input: Tensor, size: _size) -> Tensor: ... +def _sparse_broadcast_to_copy(input: Tensor, size: _size, *, out: Optional[Tensor] = None) -> Tensor: ... +def _sparse_csr_prod(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_csr_sum(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ... +def _sparse_semi_structured_linear(input: Tensor, weight: Tensor, meta: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None, out_dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ... +def _sparse_sparse_matmul(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, *, dtype: _dtype) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, dim: Union[_int, _size]) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, dim: Union[_int, _size], *, dtype: _dtype) -> Tensor: ... +def _stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: ... +def _standard_gamma(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +def _standard_gamma_grad(input: Tensor, output: Tensor) -> Tensor: ... +def _sync(t: Tensor) -> None: ... +@overload +def _test_autograd_multiple_dispatch(input: Tensor) -> Tensor: ... +@overload +def _test_autograd_multiple_dispatch(input: Tensor, b: _bool) -> Tensor: ... +def _test_autograd_multiple_dispatch_view(input: Tensor) -> Tensor: ... +def _test_autograd_multiple_dispatch_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _test_check_tensor(input: Tensor) -> Tensor: ... +def _test_functorch_fallback(input: Tensor, other: Tensor) -> Tensor: ... +def _test_parallel_materialize(input: Tensor, num_parallel: _int, skip_first: _bool = False) -> Tensor: ... +def _test_serialization_subcmul(input: Tensor, other: Tensor, alpha: Union[Number, _complex] = 1) -> Tensor: ... +def _to_cpu(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +def _to_functional_tensor(t: Tensor) -> Tensor: ... +def _to_sparse_semi_structured(dense: Tensor) -> Tuple[Tensor, Tensor]: ... +def _transform_bias_rescale_qkv(qkv: Tensor, qkv_bias: Tensor, num_heads: _int) -> Tuple[Tensor, Tensor, Tensor]: ... +def _transformer_encoder_layer_fwd(src: Tensor, embed_dim: _int, num_heads: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, use_gelu: _bool, norm_first: _bool, eps: _float, norm_weight_1: Tensor, norm_bias_1: Tensor, norm_weight_2: Tensor, norm_bias_2: Tensor, ffn_weight_1: Tensor, ffn_bias_1: Tensor, ffn_weight_2: Tensor, ffn_bias_2: Tensor, mask: Optional[Tensor] = None, mask_type: Optional[_int] = None) -> Tensor: ... +def _trilinear(i1: Tensor, i2: Tensor, i3: Tensor, expand1: _size, expand2: _size, expand3: _size, sumdim: _size, unroll_dim: _int = 1) -> Tensor: ... +def _triton_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None) -> Tensor: ... +def _triton_scaled_dot_attention(q: Tensor, k: Tensor, v: Tensor, dropout_p: _float = 0.0) -> Tensor: ... +def _unique(input: Tensor, sorted: _bool = True, return_inverse: _bool = False) -> Tuple[Tensor, Tensor]: ... +def _unique2(input: Tensor, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ... +def _unpack_dual(dual: Tensor, level: _int) -> torch.return_types._unpack_dual: ... +def _unsafe_index(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]]) -> Tensor: ... +def _unsafe_index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +@overload +def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int) -> _bool: ... +@overload +def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int) -> _bool: ... +def _use_cudnn_rnn_flatten_weight() -> _bool: ... +def _validate_compressed_sparse_indices(is_crow: _bool, compressed_idx: Tensor, plain_idx: Tensor, cdim: _int, dim: _int, nnz: _int) -> None: ... +def _validate_sparse_bsc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_bsr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_compressed_tensor_args(compressed_indices: Tensor, plain_indices: Tensor, values: Tensor, size: _size, layout: _layout) -> None: ... +def _validate_sparse_coo_tensor_args(indices: Tensor, values: Tensor, size: _size, is_coalesced: Optional[_bool] = None) -> None: ... +def _validate_sparse_csc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_csr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _weight_int4pack_mm(input: Tensor, mat2: Tensor, qGroupSize: _int, qScaleAndZeros: Tensor) -> Tensor: ... +def _weight_int8pack_mm(input: Tensor, mat2: Tensor, scales: Tensor) -> Tensor: ... +def _weight_norm(v: Tensor, g: Tensor, dim: _int = 0) -> Tensor: ... +def _weight_norm_interface(v: Tensor, g: Tensor, dim: _int = 0) -> Tuple[Tensor, Tensor]: ... +def abs(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + abs(input, *, out=None) -> Tensor + + Computes the absolute value of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = |\text{input}_{i}| + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.abs(torch.tensor([-1, -2, 3])) + tensor([ 1, 2, 3]) + """ + ... +def abs_(input: Tensor) -> Tensor: ... +def absolute(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + absolute(input, *, out=None) -> Tensor + + Alias for :func:`torch.abs` + """ + ... +def acos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + acos(input, *, out=None) -> Tensor + + Computes the inverse cosine of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = \cos^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.3348, -0.5889, 0.2005, -0.1584]) + >>> torch.acos(a) + tensor([ 1.2294, 2.2004, 1.3690, 1.7298]) + """ + ... +def acos_(input: Tensor) -> Tensor: ... +def acosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + acosh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \cosh^{-1}(\text{input}_{i}) + + Note: + The domain of the inverse hyperbolic cosine is `[1, inf)` and values outside this range + will be mapped to ``NaN``, except for `+ INF` for which the output is mapped to `+ INF`. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4).uniform_(1, 2) + >>> a + tensor([ 1.3192, 1.9915, 1.9674, 1.7151 ]) + >>> torch.acosh(a) + tensor([ 0.7791, 1.3120, 1.2979, 1.1341 ]) + """ + ... +def acosh_(input: Tensor) -> Tensor: ... +def adaptive_avg_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tensor: ... +def adaptive_max_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ... +@overload +def add(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcdiv(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addcmul(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv_(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: ... +@overload +def addmv_(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def addmv_(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(input: Tensor, vec1: Tensor, vec2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +def adjoint(input: Tensor) -> Tensor: + r""" + adjoint(Tensor) -> Tensor + Returns a view of the tensor conjugated and with the last two dimensions transposed. + + ``x.adjoint()`` is equivalent to ``x.transpose(-2, -1).conj()`` for complex tensors and + to ``x.transpose(-2, -1)`` for real tensors. + + Example:: + >>> x = torch.arange(4, dtype=torch.float) + >>> A = torch.complex(x, x).reshape(2, 2) + >>> A + tensor([[0.+0.j, 1.+1.j], + [2.+2.j, 3.+3.j]]) + >>> A.adjoint() + tensor([[0.-0.j, 2.-2.j], + [1.-1.j, 3.-3.j]]) + >>> (A.adjoint() == A.mH).all() + tensor(True) + """ + ... +def affine_grid_generator(theta: Tensor, size: Sequence[Union[_int, SymInt]], align_corners: _bool) -> Tensor: ... +def alias_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.alias`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def all(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +def allclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> _bool: + r""" + allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool + + This function checks if :attr:`input` and :attr:`other` satisfy the condition: + + .. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert + + elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to + `numpy.allclose `_ + + Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. Default: ``False`` + + Example:: + + >>> torch.allclose(torch.tensor([10000., 1e-07]), torch.tensor([10000.1, 1e-08])) + False + >>> torch.allclose(torch.tensor([10000., 1e-08]), torch.tensor([10000.1, 1e-09])) + True + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')])) + False + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')]), equal_nan=True) + True + """ + ... +def alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def amax(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + amax(input, dim, keepdim=False, *, out=None) -> Tensor + + Returns the maximum value of each slice of the :attr:`input` tensor in the given + dimension(s) :attr:`dim`. + + .. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.8177, 1.4878, -0.2491, 0.9130], + [-0.7158, 1.1775, 2.0992, 0.4817], + [-0.0053, 0.0164, -1.3738, -0.0507], + [ 1.9700, 1.1106, -1.0318, -1.0816]]) + >>> torch.amax(a, 1) + tensor([1.4878, 2.0992, 0.0164, 1.9700]) + """ + ... +def amin(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + amin(input, dim, keepdim=False, *, out=None) -> Tensor + + Returns the minimum value of each slice of the :attr:`input` tensor in the given + dimension(s) :attr:`dim`. + + .. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.6451, -0.4866, 0.2987, -1.3312], + [-0.5744, 1.2980, 1.8397, -0.2713], + [ 0.9128, 0.9214, -1.7268, -0.2995], + [ 0.9023, 0.4853, 0.9075, -1.6165]]) + >>> torch.amin(a, 1) + tensor([-1.3312, -0.5744, -1.7268, -1.6165]) + """ + ... +def aminmax(input: Tensor, *, dim: Optional[_int] = None, keepdim: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.aminmax: + r""" + aminmax(input, *, dim=None, keepdim=False, out=None) -> (Tensor min, Tensor max) + + Computes the minimum and maximum values of the :attr:`input` tensor. + + Args: + input (Tensor): + The input tensor + + Keyword Args: + dim (Optional[int]): + The dimension along which to compute the values. If `None`, + computes the values over the entire :attr:`input` tensor. + Default is `None`. + keepdim (bool): + If `True`, the reduced dimensions will be kept in the output + tensor as dimensions with size 1 for broadcasting, otherwise + they will be removed, as if calling (:func:`torch.squeeze`). + Default is `False`. + out (Optional[Tuple[Tensor, Tensor]]): + Optional tensors on which to write the result. Must have the same + shape and dtype as the expected output. + Default is `None`. + + Returns: + A named tuple `(min, max)` containing the minimum and maximum values. + + Raises: + RuntimeError + If any of the dimensions to compute the values over has size 0. + + .. note:: + NaN values are propagated to the output if at least one value is NaN. + + .. seealso:: + :func:`torch.amin` computes just the minimum value + :func:`torch.amax` computes just the maximum value + + Example:: + + >>> torch.aminmax(torch.tensor([1, -3, 5])) + torch.return_types.aminmax( + min=tensor(-3), + max=tensor(5)) + + >>> # aminmax propagates NaNs + >>> torch.aminmax(torch.tensor([1, -3, 5, torch.nan])) + torch.return_types.aminmax( + min=tensor(nan), + max=tensor(nan)) + + >>> t = torch.arange(10).view(2, 5) + >>> t + tensor([[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]]) + >>> t.aminmax(dim=0, keepdim=True) + torch.return_types.aminmax( + min=tensor([[0, 1, 2, 3, 4]]), + max=tensor([[5, 6, 7, 8, 9]])) + """ + ... +def angle(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + angle(input, *, out=None) -> Tensor + + Computes the element-wise angle (in radians) of the given :attr:`input` tensor. + + .. math:: + \text{out}_{i} = angle(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + .. note:: Starting in PyTorch 1.8, angle returns pi for negative real numbers, + zero for non-negative real numbers, and propagates NaNs. Previously + the function would return zero for all real numbers and not propagate + floating-point NaNs. + + Example:: + + >>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159 + tensor([ 135., 135, -45]) + """ + ... +@overload +def any(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def arange(start: Number, end: Number, step: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Number, end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(end: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Union[Number, _complex], end: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Union[Number, _complex], end: Union[Number, _complex], step: Union[Number, _complex] = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +def arccos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arccos(input, *, out=None) -> Tensor + + Alias for :func:`torch.acos`. + """ + ... +def arccos_(input: Tensor) -> Tensor: ... +def arccosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arccosh(input, *, out=None) -> Tensor + + Alias for :func:`torch.acosh`. + """ + ... +def arccosh_(input: Tensor) -> Tensor: ... +def arcsin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arcsin(input, *, out=None) -> Tensor + + Alias for :func:`torch.asin`. + """ + ... +def arcsin_(input: Tensor) -> Tensor: ... +def arcsinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arcsinh(input, *, out=None) -> Tensor + + Alias for :func:`torch.asinh`. + """ + ... +def arcsinh_(input: Tensor) -> Tensor: ... +def arctan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctan(input, *, out=None) -> Tensor + + Alias for :func:`torch.atan`. + """ + ... +def arctan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctan2(input, other, *, out=None) -> Tensor + Alias for :func:`torch.atan2`. + """ + ... +def arctan_(input: Tensor) -> Tensor: ... +def arctanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctanh(input, *, out=None) -> Tensor + + Alias for :func:`torch.atanh`. + """ + ... +def arctanh_(input: Tensor) -> Tensor: ... +def argmax(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + argmax(input) -> LongTensor + + Returns the indices of the maximum value of all elements in the :attr:`input` tensor. + + This is the second value returned by :meth:`torch.max`. See its + documentation for the exact semantics of this method. + + .. note:: If there are multiple maximal values then the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a) + tensor(0) + + .. function:: argmax(input, dim, keepdim=False) -> LongTensor + :noindex: + + Returns the indices of the maximum values of a tensor across a dimension. + + This is the second value returned by :meth:`torch.max`. See its + documentation for the exact semantics of this method. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. If ``None``, the argmax of the flattened input is returned. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a, dim=1) + tensor([ 0, 2, 0, 1]) + """ + ... +def argmin(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + argmin(input, dim=None, keepdim=False) -> LongTensor + + Returns the indices of the minimum value(s) of the flattened tensor or along a dimension + + This is the second value returned by :meth:`torch.min`. See its + documentation for the exact semantics of this method. + + .. note:: If there are multiple minimal values then the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. If ``None``, the argmin of the flattened input is returned. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], + [ 1.0100, -1.1975, -0.0102, -0.4732], + [-0.9240, 0.1207, -0.7506, -1.0213], + [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) + >>> torch.argmin(a, dim=1) + tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) + """ + ... +@overload +def argsort(input: Tensor, *, stable: _bool, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +@overload +def argsort(input: Tensor, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +@overload +def argsort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +def argwhere(input: Tensor) -> Tensor: + r""" + argwhere(input) -> Tensor + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + .. note:: + This function is similar to NumPy's `argwhere`. + + When :attr:`input` is on CUDA, this function causes host-device synchronization. + + Args: + {input} + + Example:: + + >>> t = torch.tensor([1, 0, 1]) + >>> torch.argwhere(t) + tensor([[0], + [2]]) + >>> t = torch.tensor([[1, 0, 1], [0, 1, 1]]) + >>> torch.argwhere(t) + tensor([[0, 0], + [0, 2], + [1, 1], + [1, 2]]) + """ + ... +def as_strided(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided(input, size, stride, storage_offset=None) -> Tensor + + Create a view of an existing `torch.Tensor` :attr:`input` with specified + :attr:`size`, :attr:`stride` and :attr:`storage_offset`. + + .. warning:: + Prefer using other view functions, like :meth:`torch.Tensor.expand`, + to setting a view's strides manually with `as_strided`, as this + function's behavior depends on the implementation of a tensor's storage. + The constructed view of the storage must only refer to elements within + the storage or a runtime error will be thrown, and if the view is + "overlapped" (with multiple indices referring to the same element in + memory) its behavior is undefined. + + Args: + input (Tensor): the input tensor. + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor. + If ``None``, the storage_offset of the output tensor will match the input tensor. + + Example:: + + >>> x = torch.randn(3, 3) + >>> x + tensor([[ 0.9039, 0.6291, 1.0795], + [ 0.1586, 2.1939, -0.4900], + [-0.1909, -0.7503, 1.9355]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2)) + >>> t + tensor([[0.9039, 1.0795], + [0.6291, 0.1586]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2), 1) + tensor([[0.6291, 0.1586], + [1.0795, 2.1939]]) + """ + ... +def as_strided_(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: ... +def as_strided_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.as_strided`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def as_strided_scatter(input: Tensor, src: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` along + the elements corresponding to the result of calling + input.as_strided(size, stride, storage_offset). + + This function returns a tensor with fresh storage; it does not + return a view. + + Args: + input (Tensor): the input tensor. + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + `torch.as_strided(input, size, stride, storage_offset)` + + Example:: + + >>> a = torch.arange(4).reshape(2, 2) + 1 + >>> a + tensor([[1, 2], + [3, 4]]) + >>> b = torch.zeros(3, 3) + >>> b + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> torch.as_strided_scatter(b, a, (2, 2), (1, 2)) + tensor([[1., 3., 2.], + [4., 0., 0.], + [0., 0., 0.]]) + """ + ... +def as_tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None) -> Tensor: + r""" + as_tensor(data, dtype=None, device=None) -> Tensor + + Converts :attr:`data` into a tensor, sharing data and preserving autograd + history if possible. + + If :attr:`data` is already a tensor with the requested dtype and device + then :attr:`data` itself is returned, but if :attr:`data` is a + tensor with a different dtype or device then it's copied as if using + `data.to(dtype=dtype, device=device)`. + + If :attr:`data` is a NumPy array (an ndarray) with the same dtype and device then a + tensor is constructed using :func:`torch.from_numpy`. + + .. seealso:: + + :func:`torch.tensor` never shares its data and creates a new "leaf tensor" (see :doc:`/notes/autograd`). + + + Args: + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + + + Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a, device=torch.device('cuda')) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([1, 2, 3]) + """ + ... +def asarray(obj: Any, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, copy: Optional[_bool] = None, requires_grad: _bool = False) -> Tensor: + r""" + asarray(obj, *, dtype=None, device=None, copy=None, requires_grad=False) -> Tensor + + Converts :attr:`obj` to a tensor. + + :attr:`obj` can be one of: + + 1. a tensor + 2. a NumPy array or a NumPy scalar + 3. a DLPack capsule + 4. an object that implements Python's buffer protocol + 5. a scalar + 6. a sequence of scalars + + When :attr:`obj` is a tensor, NumPy array, or DLPack capsule the returned tensor will, + by default, not require a gradient, have the same datatype as :attr:`obj`, be on the + same device, and share memory with it. These properties can be controlled with the + :attr:`dtype`, :attr:`device`, :attr:`copy`, and :attr:`requires_grad` keyword arguments. + If the returned tensor is of a different datatype, on a different device, or a copy is + requested then it will not share its memory with :attr:`obj`. If :attr:`requires_grad` + is ``True`` then the returned tensor will require a gradient, and if :attr:`obj` is + also a tensor with an autograd history then the returned tensor will have the same history. + + When :attr:`obj` is not a tensor, NumPy array, or DLPack capsule but implements Python's + buffer protocol then the buffer is interpreted as an array of bytes grouped according to + the size of the datatype passed to the :attr:`dtype` keyword argument. (If no datatype is + passed then the default floating point datatype is used, instead.) The returned tensor + will have the specified datatype (or default floating point datatype if none is specified) + and, by default, be on the CPU device and share memory with the buffer. + + When :attr:`obj` is a NumPy scalar, the returned tensor will be a 0-dimensional tensor on + the CPU and that doesn't share its memory (i.e. ``copy=True``). By default datatype will + be the PyTorch datatype corresponding to the NumPy's scalar's datatype. + + When :attr:`obj` is none of the above but a scalar, or a sequence of scalars then the + returned tensor will, by default, infer its datatype from the scalar values, be on the + current default device, and not share its memory. + + .. seealso:: + + :func:`torch.tensor` creates a tensor that always copies the data from the input object. + :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays. + :func:`torch.frombuffer` creates a tensor that always shares memory from objects that + implement the buffer protocol. + :func:`torch.from_dlpack` creates a tensor that always shares memory from + DLPack capsules. + + Args: + obj (object): a tensor, NumPy array, DLPack Capsule, object that implements Python's + buffer protocol, scalar, or sequence of scalars. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the datatype of the returned tensor. + Default: ``None``, which causes the datatype of the returned tensor to be + inferred from :attr:`obj`. + copy (bool, optional): controls whether the returned tensor shares memory with :attr:`obj`. + Default: ``None``, which causes the returned tensor to share memory with :attr:`obj` + whenever possible. If ``True`` then the returned tensor does not share its memory. + If ``False`` then the returned tensor shares its memory with :attr:`obj` and an + error is thrown if it cannot. + device (:class:`torch.device`, optional): the device of the returned tensor. + Default: ``None``, which causes the device of :attr:`obj` to be used. Or, if + :attr:`obj` is a Python sequence, the current default device will be used. + requires_grad (bool, optional): whether the returned tensor requires grad. + Default: ``False``, which causes the returned tensor not to require a gradient. + If ``True``, then the returned tensor will require a gradient, and if :attr:`obj` + is also a tensor with an autograd history then the returned tensor will have + the same history. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> # Shares memory with tensor 'a' + >>> b = torch.asarray(a) + >>> a.data_ptr() == b.data_ptr() + True + >>> # Forces memory copy + >>> c = torch.asarray(a, copy=True) + >>> a.data_ptr() == c.data_ptr() + False + + >>> a = torch.tensor([1., 2., 3.], requires_grad=True) + >>> b = a + 2 + >>> b + tensor([3., 4., 5.], grad_fn=) + >>> # Shares memory with tensor 'b', with no grad + >>> c = torch.asarray(b) + >>> c + tensor([3., 4., 5.]) + >>> # Shares memory with tensor 'b', retaining autograd history + >>> d = torch.asarray(b, requires_grad=True) + >>> d + tensor([3., 4., 5.], grad_fn=) + + >>> array = numpy.array([1, 2, 3]) + >>> # Shares memory with array 'array' + >>> t1 = torch.asarray(array) + >>> array.__array_interface__['data'][0] == t1.data_ptr() + True + >>> # Copies memory due to dtype mismatch + >>> t2 = torch.asarray(array, dtype=torch.float32) + >>> array.__array_interface__['data'][0] == t2.data_ptr() + False + + >>> scalar = numpy.float64(0.5) + >>> torch.asarray(scalar) + tensor(0.5000, dtype=torch.float64) + """ + ... +def asin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + asin(input, *, out=None) -> Tensor + + Returns a new tensor with the arcsine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sin^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5962, 1.4985, -0.4396, 1.4525]) + >>> torch.asin(a) + tensor([-0.6387, nan, -0.4552, nan]) + """ + ... +def asin_(input: Tensor) -> Tensor: ... +def asinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + asinh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sinh^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1606, -1.4267, -1.0899, -1.0250 ]) + >>> torch.asinh(a) + tensor([ 0.1599, -1.1534, -0.9435, -0.8990 ]) + """ + ... +def asinh_(input: Tensor) -> Tensor: ... +def atan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atan(input, *, out=None) -> Tensor + + Returns a new tensor with the arctangent of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \tan^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.2341, 0.2539, -0.6256, -0.6448]) + >>> torch.atan(a) + tensor([ 0.2299, 0.2487, -0.5591, -0.5727]) + """ + ... +def atan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atan2(input, other, *, out=None) -> Tensor + + Element-wise arctangent of :math:`\text{input}_{i} / \text{other}_{i}` + with consideration of the quadrant. Returns a new tensor with the signed angles + in radians between vector :math:`(\text{other}_{i}, \text{input}_{i})` + and vector :math:`(1, 0)`. (Note that :math:`\text{other}_{i}`, the second + parameter, is the x-coordinate, while :math:`\text{input}_{i}`, the first + parameter, is the y-coordinate.) + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.9041, 0.0196, -0.3108, -2.4423]) + >>> torch.atan2(a, torch.randn(4)) + tensor([ 0.9833, 0.0811, -1.9743, -1.4151]) + """ + ... +def atan_(input: Tensor) -> Tensor: ... +def atanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atanh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`. + + Note: + The domain of the inverse hyperbolic tangent is `(-1, 1)` and values outside this range + will be mapped to ``NaN``, except for the values `1` and `-1` for which the output is + mapped to `+/-INF` respectively. + + .. math:: + \text{out}_{i} = \tanh^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4).uniform_(-1, 1) + >>> a + tensor([ -0.9385, 0.2968, -0.8591, -0.1871 ]) + >>> torch.atanh(a) + tensor([ -1.7253, 0.3060, -1.2899, -0.1893 ]) + """ + ... +def atanh_(input: Tensor) -> Tensor: ... +def avg_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, ceil_mode: _bool = False, count_include_pad: _bool = True) -> Tensor: ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def bartlett_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Bartlett window function. + + .. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.bartlett_window(L, periodic=True)`` equal to + ``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def bartlett_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Bartlett window function. + + .. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.bartlett_window(L, periodic=True)`` equal to + ``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ... +def batch_norm_backward_elemt(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], sum_dy: Tensor, sum_dy_xmu: Tensor, count: Tensor) -> Tensor: ... +def batch_norm_backward_reduce(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], input_g: _bool, weight_g: _bool, bias_g: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def batch_norm_elemt(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, invstd: Tensor, eps: _float, *, out: Optional[Tensor] = None) -> Tensor: ... +def batch_norm_gather_stats(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, count: _int) -> Tuple[Tensor, Tensor]: ... +def batch_norm_gather_stats_with_counts(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, counts: Tensor) -> Tuple[Tensor, Tensor]: ... +def batch_norm_stats(input: Tensor, eps: _float) -> Tuple[Tensor, Tensor]: ... +def batch_norm_update_stats(input: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float) -> Tuple[Tensor, Tensor]: ... +@overload +def bernoulli(input: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + bernoulli(input, *, generator=None, out=None) -> Tensor + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. + + The :attr:`input` tensor should be a tensor containing probabilities + to be used for drawing the binary random number. + Hence, all values in :attr:`input` have to be in the range: + :math:`0 \leq \text{input}_i \leq 1`. + + The :math:`\text{i}^{th}` element of the output tensor will draw a + value :math:`1` according to the :math:`\text{i}^{th}` probability value given + in :attr:`input`. + + .. math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) + + The returned :attr:`out` tensor only has values 0 or 1 and is of the same + shape as :attr:`input`. + + :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating + point ``dtype``. + + Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... +@overload +def bernoulli(input: Tensor, p: _float, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli(input, *, generator=None, out=None) -> Tensor + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. + + The :attr:`input` tensor should be a tensor containing probabilities + to be used for drawing the binary random number. + Hence, all values in :attr:`input` have to be in the range: + :math:`0 \leq \text{input}_i \leq 1`. + + The :math:`\text{i}^{th}` element of the output tensor will draw a + value :math:`1` according to the :math:`\text{i}^{th}` probability value given + in :attr:`input`. + + .. math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) + + The returned :attr:`out` tensor only has values 0 or 1 and is of the same + shape as :attr:`input`. + + :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating + point ``dtype``. + + Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... +def bilinear(input1: Tensor, input2: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ... +def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, weight: Optional[Tensor] = None, pos_weight: Optional[Tensor] = None, reduction: _int = 1) -> Tensor: ... +def bincount(input: Tensor, weights: Optional[Tensor] = None, minlength: _int = 0) -> Tensor: + r""" + bincount(input, weights=None, minlength=0) -> Tensor + + Count the frequency of each value in an array of non-negative ints. + + The number of bins (size 1) is one larger than the largest value in + :attr:`input` unless :attr:`input` is empty, in which case the result is a + tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least + :attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size + :attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``, + ``out[n] += weights[i]`` if :attr:`weights` is specified else + ``out[n] += 1``. + + Note: + This operation may produce nondeterministic gradients when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + Arguments: + input (Tensor): 1-d int tensor + weights (Tensor): optional, weight for each value in the input tensor. + Should be of same size as input tensor. + minlength (int): optional, minimum number of bins. Should be non-negative. + + Returns: + output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if + :attr:`input` is non-empty, else ``Size(0)`` + + Example:: + + >>> input = torch.randint(0, 8, (5,), dtype=torch.int64) + >>> weights = torch.linspace(0, 1, steps=5) + >>> input, weights + (tensor([4, 3, 6, 3, 4]), + tensor([ 0.0000, 0.2500, 0.5000, 0.7500, 1.0000]) + + >>> torch.bincount(input) + tensor([0, 0, 0, 2, 2, 0, 1]) + + >>> input.bincount(weights) + tensor([0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.5000]) + """ + ... +def binomial(count: Tensor, prob: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +@overload +def bitwise_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_and(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_and(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_left_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +@overload +def bitwise_left_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +@overload +def bitwise_left_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +def bitwise_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_not(input, *, out=None) -> Tensor + + Computes the bitwise NOT of the given input tensor. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical NOT. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_not(torch.tensor([-1, -2, 3], dtype=torch.int8)) + tensor([ 0, 1, -4], dtype=torch.int8) + """ + ... +@overload +def bitwise_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_or(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_or(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_right_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_right_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_right_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def bitwise_xor(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def bitwise_xor(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def blackman_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Blackman window function. + + .. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.blackman_window(L, periodic=True)`` equal to + ``torch.blackman_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def blackman_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Blackman window function. + + .. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.blackman_window(L, periodic=True)`` equal to + ``torch.blackman_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def bmm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bmm(input, mat2, *, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored in :attr:`input` + and :attr:`mat2`. + + :attr:`input` and :attr:`mat2` must be 3-D tensors each containing + the same number of matrices. + + If :attr:`input` is a :math:`(b \times n \times m)` tensor, :attr:`mat2` is a + :math:`(b \times m \times p)` tensor, :attr:`out` will be a + :math:`(b \times n \times p)` tensor. + + .. math:: + \text{out}_i = \text{input}_i \mathbin{@} \text{mat2}_i + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + .. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + + Args: + input (Tensor): the first batch of matrices to be multiplied + mat2 (Tensor): the second batch of matrices to be multiplied + + Keyword Args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> input = torch.randn(10, 3, 4) + >>> mat2 = torch.randn(10, 4, 5) + >>> res = torch.bmm(input, mat2) + >>> res.size() + torch.Size([10, 3, 5]) + """ + ... +def broadcast_to(input: Tensor, size: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + broadcast_to(input, shape) -> Tensor + + Broadcasts :attr:`input` to the shape :attr:`\shape`. + Equivalent to calling ``input.expand(shape)``. See :meth:`~Tensor.expand` for details. + + Args: + input (Tensor): the input tensor. + shape (list, tuple, or :class:`torch.Size`): the new shape. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> torch.broadcast_to(x, (3, 3)) + tensor([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + """ + ... +@overload +def bucketize(input: Tensor, boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + + Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the + boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size + as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that + this behavior is opposite the behavior of + `numpy.digitize `_. + More formally, the returned index satisfies the following rules: + + .. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + + Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + + Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) + """ + ... +@overload +def bucketize(self: Union[Number, _complex], boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False) -> Tensor: + r""" + bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + + Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the + boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size + as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that + this behavior is opposite the behavior of + `numpy.digitize `_. + More formally, the returned index satisfies the following rules: + + .. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + + Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + + Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) + """ + ... +def can_cast(from_: _dtype, to: _dtype) -> _bool: + r""" + can_cast(from, to) -> bool + + Determines if a type conversion is allowed under PyTorch casting rules + described in the type promotion :ref:`documentation `. + + Args: + from (dtype): The original :class:`torch.dtype`. + to (dtype): The target :class:`torch.dtype`. + + Example:: + + >>> torch.can_cast(torch.double, torch.float) + True + >>> torch.can_cast(torch.float, torch.int) + False + """ + ... +@overload +def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cat(tensors, dim=0, *, out=None) -> Tensor + + Concatenates the given sequence of :attr:`seq` tensors in the given dimension. + All tensors must either have the same shape (except in the concatenating + dimension) or be a 1-D empty tensor with size ``(0,)``. + + :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` + and :func:`torch.chunk`. + + :func:`torch.cat` can be best understood via examples. + + .. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + + Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. + dim (int, optional): the dimension over which the tensors are concatenated + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) + """ + ... +@overload +def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + cat(tensors, dim=0, *, out=None) -> Tensor + + Concatenates the given sequence of :attr:`seq` tensors in the given dimension. + All tensors must either have the same shape (except in the concatenating + dimension) or be a 1-D empty tensor with size ``(0,)``. + + :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` + and :func:`torch.chunk`. + + :func:`torch.cat` can be best understood via examples. + + .. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + + Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. + dim (int, optional): the dimension over which the tensors are concatenated + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) + """ + ... +def ccol_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def ceil(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ceil(input, *, out=None) -> Tensor + + Returns a new tensor with the ceil of the elements of :attr:`input`, + the smallest integer greater than or equal to each element. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + .. math:: + \text{out}_{i} = \left\lceil \text{input}_{i} \right\rceil + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.6341, -1.4208, -1.0900, 0.5826]) + >>> torch.ceil(a) + tensor([-0., -1., -1., 1.]) + """ + ... +def ceil_(input: Tensor) -> Tensor: ... +def celu(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ... +def celu_(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ... +def channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ... +def cholesky(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky(input, upper=False, *, out=None) -> Tensor + + Computes the Cholesky decomposition of a symmetric positive-definite + matrix :math:`A` or for batches of symmetric positive-definite matrices. + + If :attr:`upper` is ``True``, the returned matrix ``U`` is upper-triangular, and + the decomposition has the form: + + .. math:: + + A = U^TU + + If :attr:`upper` is ``False``, the returned matrix ``L`` is lower-triangular, and + the decomposition has the form: + + .. math:: + + A = LL^T + + If :attr:`upper` is ``True``, and :math:`A` is a batch of symmetric positive-definite + matrices, then the returned tensor will be composed of upper-triangular Cholesky factors + of each of the individual matrices. Similarly, when :attr:`upper` is ``False``, the returned + tensor will be composed of lower-triangular Cholesky factors of each of the individual + matrices. + + .. warning:: + + :func:`torch.cholesky` is deprecated in favor of :func:`torch.linalg.cholesky` + and will be removed in a future PyTorch release. + + ``L = torch.cholesky(A)`` should be replaced with + + .. code:: python + + L = torch.linalg.cholesky(A) + + ``U = torch.cholesky(A, upper=True)`` should be replaced with + + .. code:: python + + U = torch.linalg.cholesky(A).mH + + This transform will produce equivalent results for all valid (symmetric positive definite) inputs. + + Args: + input (Tensor): the input tensor :math:`A` of size :math:`(*, n, n)` where `*` is zero or more + batch dimensions consisting of symmetric positive-definite matrices. + upper (bool, optional): flag that indicates whether to return a + upper or lower triangular matrix. Default: ``False`` + + Keyword args: + out (Tensor, optional): the output matrix + + Example:: + + >>> a = torch.randn(3, 3) + >>> a = a @ a.mT + 1e-3 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> a + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> l + tensor([[ 1.5528, 0.0000, 0.0000], + [-0.4821, 1.0592, 0.0000], + [ 0.9371, 0.5487, 0.7023]]) + >>> l @ l.mT + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> a = torch.randn(3, 2, 2) # Example for batched input + >>> a = a @ a.mT + 1e-03 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> z = l @ l.mT + >>> torch.dist(z, a) + tensor(2.3842e-07) + """ + ... +def cholesky_inverse(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky_inverse(L, upper=False, *, out=None) -> Tensor + + Computes the inverse of a complex Hermitian or real symmetric + positive-definite matrix given its Cholesky decomposition. + + Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, + and :math:`L` its Cholesky decomposition such that: + + .. math:: + + A = LL^{\text{H}} + + where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, + and the transpose when :math:`L` is real-valued. + + Computes the inverse matrix :math:`A^{-1}`. + + Supports input of float, double, cfloat and cdouble dtypes. + Also supports batches of matrices, and if :math:`A` is a batch of matrices + then the output has the same batch dimensions. + + Args: + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False`` + + Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. + + Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> torch.cholesky_inverse(L) + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + >>> A.inverse() + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> torch.dist(torch.inverse(A), torch.cholesky_inverse(L)) + tensor(5.6358e-7) + """ + ... +def cholesky_solve(input: Tensor, input2: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky_solve(B, L, upper=False, *, out=None) -> Tensor + + Computes the solution of a system of linear equations with complex Hermitian + or real symmetric positive-definite lhs given its Cholesky decomposition. + + Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, + and :math:`L` its Cholesky decomposition such that: + + .. math:: + + A = LL^{\text{H}} + + where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, + and the transpose when :math:`L` is real-valued. + + Returns the solution :math:`X` of the following linear system: + + .. math:: + + AX = B + + Supports inputs of float, double, cfloat and cdouble dtypes. + Also supports batches of matrices, and if :math:`A` or :math:`B` is a batch of matrices + then the output has the same batch dimensions. + + Args: + B (Tensor): right-hand side tensor of shape `(*, n, k)` + where :math:`*` is zero or more batch dimensions + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False``. + + Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. + + Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> B = torch.randn(3, 2) + >>> torch.cholesky_solve(B, L) + tensor([[ -8.1625, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + >>> A.inverse() @ B + tensor([[ -8.1626, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> B = torch.randn(2, 1, dtype=torch.complex64) + >>> X = torch.cholesky_solve(B, L) + >>> torch.dist(X, A.inverse() @ B) + tensor(1.6881e-5) + """ + ... +def choose_qparams_optimized(input: Tensor, numel: _int, n_bins: _int, ratio: _float, bit_width: _int) -> Tuple[Tensor, Tensor]: ... +def chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + chunk(input, chunks, dim=0) -> List of Tensors + + Attempts to split a tensor into the specified number of chunks. Each chunk is a view of + the input tensor. + + + .. note:: + + This function may return fewer than the specified number of chunks! + + .. seealso:: + + :func:`torch.tensor_split` a function that always returns exactly the specified number of chunks + + If the tensor size along the given dimension :attr:`dim` is divisible by :attr:`chunks`, + all returned chunks will be the same size. + If the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`chunks`, + all returned chunks will be the same size, except the last one. + If such division is not possible, this function may return fewer + than the specified number of chunks. + + Arguments: + input (Tensor): the tensor to split + chunks (int): number of chunks to return + dim (int): dimension along which to split the tensor + + Example: + >>> torch.arange(11).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10])) + >>> torch.arange(12).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10, 11])) + >>> torch.arange(13).chunk(6) + (tensor([0, 1, 2]), + tensor([3, 4, 5]), + tensor([6, 7, 8]), + tensor([ 9, 10, 11]), + tensor([12])) + """ + ... +@overload +def clamp(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clamp(input, min=None, max=None, *, out=None) -> Tensor + + Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. + Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + + .. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + + If :attr:`min` is ``None``, there is no lower bound. + Or, if :attr:`max` is ``None`` there is no upper bound. + + + .. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + + Args: + input (Tensor): the input tensor. + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + """ + ... +@overload +def clamp(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clamp(input, min=None, max=None, *, out=None) -> Tensor + + Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. + Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + + .. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + + If :attr:`min` is ``None``, there is no lower bound. + Or, if :attr:`max` is ``None`` there is no upper bound. + + + .. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + + Args: + input (Tensor): the input tensor. + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + """ + ... +@overload +def clamp_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ... +@overload +def clamp_max(input: Tensor, max: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_max(input: Tensor, max: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_max_(input: Tensor, max: Tensor) -> Tensor: ... +@overload +def clamp_max_(input: Tensor, max: Union[Number, _complex]) -> Tensor: ... +@overload +def clamp_min(input: Tensor, min: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_min(input: Tensor, min: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_min_(input: Tensor, min: Tensor) -> Tensor: ... +@overload +def clamp_min_(input: Tensor, min: Union[Number, _complex]) -> Tensor: ... +@overload +def clip(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clip(input, min=None, max=None, *, out=None) -> Tensor + + Alias for :func:`torch.clamp`. + """ + ... +@overload +def clip(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clip(input, min=None, max=None, *, out=None) -> Tensor + + Alias for :func:`torch.clamp`. + """ + ... +@overload +def clip_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ... +@overload +def clip_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ... +def clone(input: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + clone(input, *, memory_format=torch.preserve_format) -> Tensor + + Returns a copy of :attr:`input`. + + .. note:: + + This function is differentiable, so gradients will flow back from the + result of this operation to :attr:`input`. To create a tensor without an + autograd relationship to :attr:`input` see :meth:`~Tensor.detach`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned tensor. Default: ``torch.preserve_format``. + """ + ... +def col_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.col_indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def column_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + column_stack(tensors, *, out=None) -> Tensor + + Creates a new tensor by horizontally stacking the tensors in :attr:`tensors`. + + Equivalent to ``torch.hstack(tensors)``, except each zero or one dimensional tensor ``t`` + in :attr:`tensors` is first reshaped into a ``(t.numel(), 1)`` column before being stacked horizontally. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.column_stack((a, b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + >>> a = torch.arange(5) + >>> b = torch.arange(10).reshape(5, 2) + >>> torch.column_stack((a, b, b)) + tensor([[0, 0, 1, 0, 1], + [1, 2, 3, 2, 3], + [2, 4, 5, 4, 5], + [3, 6, 7, 6, 7], + [4, 8, 9, 8, 9]]) + """ + ... +def combinations(input: Tensor, r: _int = 2, with_replacement: _bool = False) -> Tensor: + r""" + combinations(input, r=2, with_replacement=False) -> seq + + Compute combinations of length :math:`r` of the given tensor. The behavior is similar to + python's `itertools.combinations` when `with_replacement` is set to `False`, and + `itertools.combinations_with_replacement` when `with_replacement` is set to `True`. + + Arguments: + input (Tensor): 1D vector. + r (int, optional): number of elements to combine + with_replacement (bool, optional): whether to allow duplication in combination + + Returns: + Tensor: A tensor equivalent to converting all the input tensors into lists, do + `itertools.combinations` or `itertools.combinations_with_replacement` on these + lists, and finally convert the resulting list into tensor. + + Example:: + + >>> a = [1, 2, 3] + >>> list(itertools.combinations(a, r=2)) + [(1, 2), (1, 3), (2, 3)] + >>> list(itertools.combinations(a, r=3)) + [(1, 2, 3)] + >>> list(itertools.combinations_with_replacement(a, r=2)) + [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)] + >>> tensor_a = torch.tensor(a) + >>> torch.combinations(tensor_a) + tensor([[1, 2], + [1, 3], + [2, 3]]) + >>> torch.combinations(tensor_a, r=3) + tensor([[1, 2, 3]]) + >>> torch.combinations(tensor_a, with_replacement=True) + tensor([[1, 1], + [1, 2], + [1, 3], + [2, 2], + [2, 3], + [3, 3]]) + """ + ... +def complex(real: Tensor, imag: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + complex(real, imag, *, out=None) -> Tensor + + Constructs a complex tensor with its real part equal to :attr:`real` and its + imaginary part equal to :attr:`imag`. + + Args: + real (Tensor): The real part of the complex tensor. Must be half, float or double. + imag (Tensor): The imaginary part of the complex tensor. Must be same dtype + as :attr:`real`. + + Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + + Example:: + + >>> real = torch.tensor([1, 2], dtype=torch.float32) + >>> imag = torch.tensor([3, 4], dtype=torch.float32) + >>> z = torch.complex(real, imag) + >>> z + tensor([(1.+3.j), (2.+4.j)]) + >>> z.dtype + torch.complex64 + """ + ... +@overload +def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + concat(tensors, dim=0, *, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + concat(tensors, dim=0, *, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + concatenate(tensors, axis=0, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + concatenate(tensors, axis=0, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +def conj(input: Tensor) -> Tensor: + r""" + conj(input) -> Tensor + + Returns a view of :attr:`input` with a flipped conjugate bit. If :attr:`input` has a non-complex dtype, + this function just returns :attr:`input`. + + .. note:: + :func:`torch.conj` performs a lazy conjugation, but the actual conjugated tensor can be materialized + at any time using :func:`torch.resolve_conj`. + + .. warning:: In the future, :func:`torch.conj` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> x.is_conj() + False + >>> y = torch.conj(x) + >>> y.is_conj() + True + """ + ... +def conj_physical(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + conj_physical(input, *, out=None) -> Tensor + + Computes the element-wise conjugate of the given :attr:`input` tensor. + If :attr:`input` has a non-complex dtype, this function just returns :attr:`input`. + + .. note:: + This performs the conjugate operation regardless of the fact conjugate bit is set or not. + + .. warning:: In the future, :func:`torch.conj_physical` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + + .. math:: + \text{out}_{i} = conj(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.conj_physical(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])) + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + """ + ... +def conj_physical_(input: Tensor) -> Tensor: ... +def constant_pad_nd(input: Tensor, pad: Sequence[Union[_int, SymInt]], value: Union[Number, _complex] = 0) -> Tensor: ... +@overload +def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +def conv_tbc(input: Tensor, weight: Tensor, bias: Tensor, pad: _int = 0) -> Tensor: ... +def conv_transpose1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def conv_transpose2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def conv_transpose3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +@overload +def copysign(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + copysign(input, other, *, out=None) -> Tensor + + Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + + .. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} + + + Supports :ref:`broadcasting to a common shape `, + and integer and float inputs. + + Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + + .. note:: + copysign handles signed zeros. If the other argument has a negative zero (-0), + the corresponding output value will be negative. + """ + ... +@overload +def copysign(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + copysign(input, other, *, out=None) -> Tensor + + Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + + .. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} + + + Supports :ref:`broadcasting to a common shape `, + and integer and float inputs. + + Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + + .. note:: + copysign handles signed zeros. If the other argument has a negative zero (-0), + the corresponding output value will be negative. + """ + ... +def corrcoef(input: Tensor) -> Tensor: + r""" + corrcoef(input) -> Tensor + + Estimates the Pearson product-moment correlation coefficient matrix of the variables given by the :attr:`input` matrix, + where rows are the variables and columns are the observations. + + .. note:: + + The correlation coefficient matrix R is computed using the covariance matrix C as given by + :math:`R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }` + + .. note:: + + Due to floating point rounding, the resulting array may not be Hermitian and its diagonal elements may not be 1. + The real and imaginary values are clipped to the interval [-1, 1] in an attempt to improve this situation. + + Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + + Returns: + (Tensor) The correlation coefficient matrix of the variables. + + .. seealso:: + + :func:`torch.cov` covariance matrix. + + Example:: + + >>> x = torch.tensor([[0, 1, 2], [2, 1, 0]]) + >>> torch.corrcoef(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> x = torch.randn(2, 4) + >>> x + tensor([[-0.2678, -0.0908, -0.3766, 0.2780], + [-0.5812, 0.1535, 0.2387, 0.2350]]) + >>> torch.corrcoef(x) + tensor([[1.0000, 0.3582], + [0.3582, 1.0000]]) + >>> torch.corrcoef(x[0]) + tensor(1.) + """ + ... +def cos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cos(input, *, out=None) -> Tensor + + Returns a new tensor with the cosine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \cos(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 1.4309, 1.2706, -0.8562, 0.9796]) + >>> torch.cos(a) + tensor([ 0.1395, 0.2957, 0.6553, 0.5574]) + """ + ... +def cos_(input: Tensor) -> Tensor: ... +def cosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cosh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic cosine of the elements of + :attr:`input`. + + .. math:: + \text{out}_{i} = \cosh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1632, 1.1835, -0.6979, -0.7325]) + >>> torch.cosh(a) + tensor([ 1.0133, 1.7860, 1.2536, 1.2805]) + + .. note:: + When :attr:`input` is on the CPU, the implementation of torch.cosh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. + """ + ... +def cosh_(input: Tensor) -> Tensor: ... +def cosine_embedding_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ... +def cosine_similarity(x1: Tensor, x2: Tensor, dim: _int = 1, eps: _float = 1e-08) -> Tensor: ... +@overload +def count_nonzero(input: Tensor, dim: Optional[_int] = None) -> Tensor: + r""" + count_nonzero(input, dim=None) -> Tensor + + Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. + If no dim is specified then all non-zeros in the tensor are counted. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. + + Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) + """ + ... +@overload +def count_nonzero(input: Tensor, dim: _size) -> Tensor: + r""" + count_nonzero(input, dim=None) -> Tensor + + Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. + If no dim is specified then all non-zeros in the tensor are counted. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. + + Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) + """ + ... +def cov(input: Tensor, *, correction: _int = 1, fweights: Optional[Tensor] = None, aweights: Optional[Tensor] = None) -> Tensor: + r""" + cov(input, *, correction=1, fweights=None, aweights=None) -> Tensor + + Estimates the covariance matrix of the variables given by the :attr:`input` matrix, where rows are + the variables and columns are the observations. + + A covariance matrix is a square matrix giving the covariance of each pair of variables. The diagonal contains + the variance of each variable (covariance of a variable with itself). By definition, if :attr:`input` represents + a single variable (Scalar or 1D) then its variance is returned. + + The sample covariance of the variables :math:`x` and :math:`y` is given by: + + .. math:: + \text{cov}(x,y) = \frac{\sum^{N}_{i = 1}(x_{i} - \bar{x})(y_{i} - \bar{y})}{\max(0,~N~-~\delta N)} + + where :math:`\bar{x}` and :math:`\bar{y}` are the simple means of the :math:`x` and :math:`y` respectively, and + :math:`\delta N` is the :attr:`correction`. + + If :attr:`fweights` and/or :attr:`aweights` are provided, the weighted covariance + is calculated, which is given by: + + .. math:: + \text{cov}_w(x,y) = \frac{\sum^{N}_{i = 1}w_i(x_{i} - \mu_x^*)(y_{i} - \mu_y^*)} + {\max(0,~\sum^{N}_{i = 1}w_i~-~\frac{\sum^{N}_{i = 1}w_ia_i}{\sum^{N}_{i = 1}w_i}~\delta N)} + + where :math:`w` denotes :attr:`fweights` or :attr:`aweights` (``f`` and ``a`` for brevity) based on whichever is + provided, or :math:`w = f \times a` if both are provided, and + :math:`\mu_x^* = \frac{\sum^{N}_{i = 1}w_ix_{i} }{\sum^{N}_{i = 1}w_i}` is the weighted mean of the variable. If not + provided, ``f`` and/or ``a`` can be seen as a :math:`\mathbb{1}` vector of appropriate size. + + Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + + Keyword Args: + correction (int, optional): difference between the sample size and sample degrees of freedom. + Defaults to Bessel's correction, ``correction = 1`` which returns the unbiased estimate, + even if both :attr:`fweights` and :attr:`aweights` are specified. ``correction = 0`` + will return the simple average. Defaults to ``1``. + fweights (tensor, optional): A Scalar or 1D tensor of observation vector frequencies representing the number of + times each observation should be repeated. Its numel must equal the number of columns of :attr:`input`. + Must have integral dtype. Ignored if ``None``. Defaults to ``None``. + aweights (tensor, optional): A Scalar or 1D array of observation vector weights. + These relative weights are typically large for observations considered “important” and smaller for + observations considered less “important”. Its numel must equal the number of columns of :attr:`input`. + Must have floating point dtype. Ignored if ``None``. Defaults to ``None``. + + Returns: + (Tensor) The covariance matrix of the variables. + + .. seealso:: + + :func:`torch.corrcoef` normalized covariance matrix. + + Example:: + >>> x = torch.tensor([[0, 2], [1, 1], [2, 0]]).T + >>> x + tensor([[0, 1, 2], + [2, 1, 0]]) + >>> torch.cov(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> torch.cov(x, correction=0) + tensor([[ 0.6667, -0.6667], + [-0.6667, 0.6667]]) + >>> fw = torch.randint(1, 10, (3,)) + >>> fw + tensor([1, 6, 9]) + >>> aw = torch.rand(3) + >>> aw + tensor([0.4282, 0.0255, 0.4144]) + >>> torch.cov(x, fweights=fw, aweights=aw) + tensor([[ 0.4169, -0.4169], + [-0.4169, 0.4169]]) + """ + ... +def cross(input: Tensor, other: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cross(input, other, dim=None, *, out=None) -> Tensor + + + Returns the cross product of vectors in dimension :attr:`dim` of :attr:`input` + and :attr:`other`. + + Supports input of float, double, cfloat and cdouble dtypes. Also supports batches + of vectors, for which it computes the product along the dimension :attr:`dim`. + In this case, the output has the same batch dimensions as the inputs. + + .. warning:: + If :attr:`dim` is not given, it defaults to the first dimension found + with the size 3. Note that this might be unexpected. + + This behavior is deprecated and will be changed to match that of :func:`torch.linalg.cross` + in a future release. + + .. seealso:: + :func:`torch.linalg.cross` which has dim=-1 as default. + + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + dim (int, optional): the dimension to take the cross-product in. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 3) + >>> a + tensor([[-0.3956, 1.1455, 1.6895], + [-0.5849, 1.3672, 0.3599], + [-1.1626, 0.7180, -0.0521], + [-0.1339, 0.9902, -2.0225]]) + >>> b = torch.randn(4, 3) + >>> b + tensor([[-0.0257, -1.4725, -1.2251], + [-1.1479, -0.7005, -1.9757], + [-1.3904, 0.3726, -1.1836], + [-0.9688, -0.7153, 0.2159]]) + >>> torch.cross(a, b, dim=1) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + >>> torch.cross(a, b) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + """ + ... +def crow_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.crow_indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ... +@overload +def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ... +def cudnn_affine_grid_generator(theta: Tensor, N: _int, C: _int, H: _int, W: _int) -> Tensor: ... +def cudnn_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def cudnn_convolution(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def cudnn_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def cudnn_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def cudnn_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool) -> Tensor: ... +def cudnn_grid_sampler(input: Tensor, grid: Tensor) -> Tensor: ... +def cudnn_is_acceptable(input: Tensor) -> _bool: ... +@overload +def cummax(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: + r""" + cummax(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) + """ + ... +@overload +def cummax(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: + r""" + cummax(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) + """ + ... +@overload +def cummin(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: + r""" + cummin(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) + """ + ... +@overload +def cummin(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: + r""" + cummin(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) + """ + ... +@overload +def cumprod(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumprod(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative product of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) + """ + ... +@overload +def cumprod(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumprod(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative product of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) + """ + ... +@overload +def cumsum(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumsum(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative sum of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) + """ + ... +@overload +def cumsum(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumsum(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative sum of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) + """ + ... +@overload +def cumulative_trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Cumulatively computes the `trapezoidal rule `_ + along :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` + and this function is that, :func:`torch.trapezoid` returns a value for each integration, + where as this function returns a cumulative value for every spacing within the integration. This + is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. + >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) + """ + ... +@overload +def cumulative_trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: + r""" + cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Cumulatively computes the `trapezoidal rule `_ + along :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` + and this function is that, :func:`torch.trapezoid` returns a value for each integration, + where as this function returns a cumulative value for every spacing within the integration. This + is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. + >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) + """ + ... +def deg2rad(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + deg2rad(input, *, out=None) -> Tensor + + Returns a new tensor with each of the elements of :attr:`input` + converted from angles in degrees to radians. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]]) + >>> torch.deg2rad(a) + tensor([[ 3.1416, -3.1416], + [ 6.2832, -6.2832], + [ 1.5708, -1.5708]]) + """ + ... +def deg2rad_(input: Tensor) -> Tensor: ... +@overload +def dequantize(input: Tensor) -> Tensor: + r""" + dequantize(tensor) -> Tensor + + Returns an fp32 Tensor by dequantizing a quantized Tensor + + Args: + tensor (Tensor): A quantized Tensor + + .. function:: dequantize(tensors) -> sequence of Tensors + :noindex: + + Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + + Args: + tensors (sequence of Tensors): A list of quantized Tensors + """ + ... +@overload +def dequantize(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + dequantize(tensor) -> Tensor + + Returns an fp32 Tensor by dequantizing a quantized Tensor + + Args: + tensor (Tensor): A quantized Tensor + + .. function:: dequantize(tensors) -> sequence of Tensors + :noindex: + + Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + + Args: + tensors (sequence of Tensors): A list of quantized Tensors + """ + ... +def det(input: Tensor) -> Tensor: + r""" + det(input) -> Tensor + + Alias for :func:`torch.linalg.det` + """ + ... +def detach(input: Tensor) -> Tensor: ... +def detach_(input: Tensor) -> Tensor: ... +def detach_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.detach`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def diag(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + diag(input, diagonal=0, *, out=None) -> Tensor + + - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. + - If :attr:`input` is a matrix (2-D tensor), then returns a 1-D tensor with + the diagonal elements of :attr:`input`. + + The argument :attr:`diagonal` controls which diagonal to consider: + + - If :attr:`diagonal` = 0, it is the main diagonal. + - If :attr:`diagonal` > 0, it is above the main diagonal. + - If :attr:`diagonal` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.diagonal` always returns the diagonal of its input. + + :func:`torch.diagflat` always constructs a tensor with diagonal elements + specified by the input. + + Examples: + + Get the square matrix where the input vector is the diagonal:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.5950,-0.0872, 2.3298]) + >>> torch.diag(a) + tensor([[ 0.5950, 0.0000, 0.0000], + [ 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 2.3298]]) + >>> torch.diag(a, 1) + tensor([[ 0.0000, 0.5950, 0.0000, 0.0000], + [ 0.0000, 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 0.0000, 2.3298], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + Get the k-th diagonal of a given matrix:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-0.4264, 0.0255,-0.1064], + [ 0.8795,-0.2429, 0.1374], + [ 0.1029,-0.6482,-1.6300]]) + >>> torch.diag(a, 0) + tensor([-0.4264,-0.2429,-1.6300]) + >>> torch.diag(a, 1) + tensor([ 0.0255, 0.1374]) + """ + ... +def diag_embed(input: Tensor, offset: _int = 0, dim1: _int = -2, dim2: _int = -1) -> Tensor: + r""" + diag_embed(input, offset=0, dim1=-2, dim2=-1) -> Tensor + + Creates a tensor whose diagonals of certain 2D planes (specified by + :attr:`dim1` and :attr:`dim2`) are filled by :attr:`input`. + To facilitate creating batched diagonal matrices, the 2D planes formed by + the last two dimensions of the returned tensor are chosen by default. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + The size of the new matrix will be calculated to make the specified diagonal + of the size of the last input dimension. + Note that for :attr:`offset` other than :math:`0`, the order of :attr:`dim1` + and :attr:`dim2` matters. Exchanging them is equivalent to changing the + sign of :attr:`offset`. + + Applying :meth:`torch.diagonal` to the output of this function with + the same arguments yields a matrix identical to input. However, + :meth:`torch.diagonal` has different default dimensions, so those + need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 1-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: -2. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: -1. + + Example:: + + >>> a = torch.randn(2, 3) + >>> torch.diag_embed(a) + tensor([[[ 1.5410, 0.0000, 0.0000], + [ 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -2.1788]], + + [[ 0.5684, 0.0000, 0.0000], + [ 0.0000, -1.0845, 0.0000], + [ 0.0000, 0.0000, -1.3986]]]) + + >>> torch.diag_embed(a, offset=1, dim1=0, dim2=2) + tensor([[[ 0.0000, 1.5410, 0.0000, 0.0000], + [ 0.0000, 0.5684, 0.0000, 0.0000]], + + [[ 0.0000, 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -1.0845, 0.0000]], + + [[ 0.0000, 0.0000, 0.0000, -2.1788], + [ 0.0000, 0.0000, 0.0000, -1.3986]], + + [[ 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000]]]) + """ + ... +def diagflat(input: Tensor, offset: _int = 0) -> Tensor: + r""" + diagflat(input, offset=0) -> Tensor + + - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. + - If :attr:`input` is a tensor with more than one dimension, then returns a + 2-D tensor with diagonal elements equal to a flattened :attr:`input`. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. + offset (int, optional): the diagonal to consider. Default: 0 (main + diagonal). + + Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([-0.2956, -0.9068, 0.1695]) + >>> torch.diagflat(a) + tensor([[-0.2956, 0.0000, 0.0000], + [ 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.1695]]) + >>> torch.diagflat(a, 1) + tensor([[ 0.0000, -0.2956, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.1695], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + >>> a = torch.randn(2, 2) + >>> a + tensor([[ 0.2094, -0.3018], + [-0.1516, 1.9342]]) + >>> torch.diagflat(a) + tensor([[ 0.2094, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.3018, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1516, 0.0000], + [ 0.0000, 0.0000, 0.0000, 1.9342]]) + """ + ... +@overload +def diagonal(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + + Returns a partial view of :attr:`input` with the its diagonal elements + with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension + at the end of the shape. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Applying :meth:`torch.diag_embed` to the output of this function with + the same arguments yields a diagonal matrix with the diagonal entries + of the input. However, :meth:`torch.diag_embed` has different default + dimensions, so those need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. + + Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) + """ + ... +@overload +def diagonal(input: Tensor, *, outdim: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None], dim2: Union[str, ellipsis, None], offset: _int = 0) -> Tensor: + r""" + diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + + Returns a partial view of :attr:`input` with the its diagonal elements + with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension + at the end of the shape. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Applying :meth:`torch.diag_embed` to the output of this function with + the same arguments yields a diagonal matrix with the diagonal entries + of the input. However, :meth:`torch.diag_embed` has different default + dimensions, so those need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. + + Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) + """ + ... +def diagonal_copy(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.diagonal`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def diagonal_scatter(input: Tensor, src: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal_scatter(input, src, offset=0, dim1=0, dim2=1) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` along + the diagonal elements of :attr:`input`, with respect to :attr:`dim1` + and :attr:`dim2`. + + This function returns a tensor with fresh storage; it does not + return a view. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + src (Tensor): the tensor to embed into :attr:`input`. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.diagonal(input, offset, dim1, dim2)`` + + Examples:: + + >>> a = torch.zeros(3, 3) + >>> a + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + + >>> torch.diagonal_scatter(a, torch.ones(3), 0) + tensor([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + + >>> torch.diagonal_scatter(a, torch.ones(2), 1) + tensor([[0., 1., 0.], + [0., 0., 1.], + [0., 0., 0.]]) + """ + ... +def diff(input: Tensor, n: _int = 1, dim: _int = -1, prepend: Optional[Tensor] = None, append: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + diff(input, n=1, dim=-1, prepend=None, append=None) -> Tensor + + Computes the n-th forward difference along the given dimension. + + The first-order differences are given by `out[i] = input[i + 1] - input[i]`. Higher-order + differences are calculated by using :func:`torch.diff` recursively. + + Args: + input (Tensor): the tensor to compute the differences on + n (int, optional): the number of times to recursively compute the difference + dim (int, optional): the dimension to compute the difference along. + Default is the last dimension. + prepend, append (Tensor, optional): values to prepend or append to + :attr:`input` along :attr:`dim` before computing the difference. + Their dimensions must be equivalent to that of input, and their shapes + must match input's shape except on :attr:`dim`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 3, 2]) + >>> torch.diff(a) + tensor([ 2, -1]) + >>> b = torch.tensor([4, 5]) + >>> torch.diff(a, append=b) + tensor([ 2, -1, 2, 1]) + >>> c = torch.tensor([[1, 2, 3], [3, 4, 5]]) + >>> torch.diff(c, dim=0) + tensor([[2, 2, 2]]) + >>> torch.diff(c, dim=1) + tensor([[1, 1], + [1, 1]]) + """ + ... +def digamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + digamma(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.digamma`. + """ + ... +def dist(input: Tensor, other: Tensor, p: Union[Number, _complex] = 2) -> Tensor: + r""" + dist(input, other, p=2) -> Tensor + + Returns the p-norm of (:attr:`input` - :attr:`other`) + + The shapes of :attr:`input` and :attr:`other` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + other (Tensor): the Right-hand-side input tensor + p (float, optional): the norm to be computed + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([-1.5393, -0.8675, 0.5916, 1.6321]) + >>> y = torch.randn(4) + >>> y + tensor([ 0.0967, -1.0511, 0.6295, 0.8360]) + >>> torch.dist(x, y, 3.5) + tensor(1.6727) + >>> torch.dist(x, y, 3) + tensor(1.6973) + >>> torch.dist(x, y, 0) + tensor(4.) + >>> torch.dist(x, y, 1) + tensor(2.6537) + """ + ... +def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + div(input, other, *, rounding_mode=None, out=None) -> Tensor + + Divides each element of the input ``input`` by the corresponding element of + :attr:`other`. + + .. math:: + \text{out}_i = \frac{\text{input}_i}{\text{other}_i} + + .. note:: + By default, this performs a "true" division like Python 3. + See the :attr:`rounding_mode` argument for floor division. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + Always promotes integer types to the default scalar type. + + Args: + input (Tensor): the dividend + other (Tensor or Number): the divisor + + Keyword args: + rounding_mode (str, optional): Type of rounding applied to the result: + + * None - default behavior. Performs no rounding and, if both :attr:`input` and + :attr:`other` are integer types, promotes the inputs to the default scalar type. + Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``. + * ``"trunc"`` - rounds the results of the division towards zero. + Equivalent to C-style integer division. + * ``"floor"`` - rounds the results of the division down. + Equivalent to floor division in Python (the ``//`` operator) and NumPy's ``np.floor_divide``. + + out (Tensor, optional): the output tensor. + + Examples:: + + >>> x = torch.tensor([ 0.3810, 1.2774, -0.2972, -0.3719, 0.4637]) + >>> torch.div(x, 0.5) + tensor([ 0.7620, 2.5548, -0.5944, -0.7438, 0.9274]) + + >>> a = torch.tensor([[-0.3711, -1.9353, -0.4605, -0.2917], + ... [ 0.1815, -1.0111, 0.9805, -1.5923], + ... [ 0.1062, 1.4581, 0.7759, -1.2344], + ... [-0.1830, -0.0313, 1.1908, -1.4757]]) + >>> b = torch.tensor([ 0.8032, 0.2930, -0.8113, -0.2308]) + >>> torch.div(a, b) + tensor([[-0.4620, -6.6051, 0.5676, 1.2639], + [ 0.2260, -3.4509, -1.2086, 6.8990], + [ 0.1322, 4.9764, -0.9564, 5.3484], + [-0.2278, -0.1068, -1.4678, 6.3938]]) + + >>> torch.div(a, b, rounding_mode='trunc') + tensor([[-0., -6., 0., 1.], + [ 0., -3., -1., 6.], + [ 0., 4., -0., 5.], + [-0., -0., -1., 6.]]) + + >>> torch.div(a, b, rounding_mode='floor') + tensor([[-1., -7., 0., 1.], + [ 0., -4., -2., 6.], + [ 0., 4., -1., 5.], + [-1., -1., -2., 6.]]) + """ + ... +@overload +def divide(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Tensor, *, rounding_mode: Optional[str], out: Optional[Tensor] = None) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Union[Number, _complex], *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +def dot(input: Tensor, tensor: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + dot(input, other, *, out=None) -> Tensor + + Computes the dot product of two 1D tensors. + + .. note:: + + Unlike NumPy's dot, torch.dot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + + Args: + input (Tensor): first tensor in the dot product, must be 1D. + other (Tensor): second tensor in the dot product, must be 1D. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.dot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + """ + ... +def dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ... +@overload +def dsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + dsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors + depthwise according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) + (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.dsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + """ + ... +@overload +def dsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + dsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors + depthwise according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) + (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.dsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + """ + ... +def dstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + dstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence depthwise (along third axis). + + This is equivalent to concatenation along the third axis after 1-D and 2-D tensors have been reshaped by :func:`torch.atleast_3d`. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.dstack((a,b)) + tensor([[[1, 4], + [2, 5], + [3, 6]]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.dstack((a,b)) + tensor([[[1, 4]], + [[2, 5]], + [[3, 6]]]) + """ + ... +def embedding(weight: Tensor, indices: Tensor, padding_idx: Union[_int, SymInt] = -1, scale_grad_by_freq: _bool = False, sparse: _bool = False) -> Tensor: ... +@overload +def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool, mode: _int, sparse: _bool, per_sample_weights: Optional[Tensor], include_last_offset: _bool, padding_idx: Optional[_int]) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +@overload +def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def embedding_renorm_(input: Tensor, indices: Tensor, max_norm: _float, norm_type: _float) -> Tensor: ... +@overload +def empty(size: Sequence[Union[_int, SymInt]], *, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(*size: _int, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +def empty_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns an uninitialized tensor with the same size as :attr:`input`. + ``torch.empty_like(input)`` is equivalent to + ``torch.empty(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> a=torch.empty((2,3), dtype=torch.int32, device = 'cuda') + >>> torch.empty_like(a) + tensor([[0, 0, 0], + [0, 0, 0]], device='cuda:0', dtype=torch.int32) + """ + ... +def empty_permuted(size: Sequence[Union[_int, SymInt]], physical_layout: _size, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_permuted(size, physical_layout, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Creates an uninitialized, non-overlapping and dense tensor with the + specified :attr:`size`, with :attr:`physical_layout` specifying how the + dimensions are physically laid out in memory (each logical dimension is listed + from outermost to innermost). :attr:`physical_layout` is a generalization + of NCHW/NHWC notation: if each dimension is assigned a number according to + what order they occur in size (N=0, C=1, H=2, W=3), then NCHW is ``(0, 1, 2, 3)`` + while NHWC is ``(0, 2, 3, 1)``. Equivalently, the strides of the output + tensor ``t`` are such that ``t.stride(physical_layout[i]) == contiguous_strides[i]`` + (notably, this function is *not* equivalent to ``torch.empty(size).permute(physical_layout)``). + + Unlike :func:`torch.empty_strided`, this is guaranteed to produce a dense + tensor with no overlaps. If possible, prefer using this function over + :func:`torch.empty_strided` or manual use of :func:`torch.as_strided`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (tuple of int): the shape of the output tensor + physical_layout (tuple of int): the ordering of dimensions physically in memory + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Examples: + + >>> torch.empty((2, 3, 5, 7)).stride() + (105, 35, 7, 1) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 1, 2, 3)).stride() + (105, 35, 7, 1) + >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).dim_order() + (0, 2, 3, 1) + """ + ... +def empty_quantized(size: _size, qtensor: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def empty_strided(size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Creates a tensor with the specified :attr:`size` and :attr:`stride` and filled with undefined data. + + .. warning:: + If the constructed tensor is "overlapped" (with multiple indices referring to the same element + in memory) its behavior is undefined. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (tuple of int): the shape of the output tensor + stride (tuple of int): the strides of the output tensor + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> a = torch.empty_strided((2, 3), (1, 2)) + >>> a + tensor([[8.9683e-44, 4.4842e-44, 5.1239e+07], + [0.0000e+00, 0.0000e+00, 3.0705e-41]]) + >>> a.stride() + (1, 2) + >>> a.size() + torch.Size([2, 3]) + """ + ... +@overload +def eq(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + eq(input, other, *, out=None) -> Tensor + + Computes element-wise equality + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) + """ + ... +@overload +def eq(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + eq(input, other, *, out=None) -> Tensor + + Computes element-wise equality + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) + """ + ... +def equal(input: Tensor, other: Tensor) -> _bool: + r""" + equal(input, other) -> bool + + ``True`` if two tensors have the same size and elements, ``False`` otherwise. + + Example:: + + >>> torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2])) + True + """ + ... +def erf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erf(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erf`. + """ + ... +def erf_(input: Tensor) -> Tensor: ... +def erfc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erfc(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erfc`. + """ + ... +def erfc_(input: Tensor) -> Tensor: ... +def erfinv(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erfinv(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erfinv`. + """ + ... +def exp(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + exp(input, *, out=None) -> Tensor + + Returns a new tensor with the exponential of the elements + of the input tensor :attr:`input`. + + .. math:: + y_{i} = e^{x_{i}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.exp(torch.tensor([0, math.log(2.)])) + tensor([ 1., 2.]) + """ + ... +def exp2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + exp2(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.exp2`. + """ + ... +def exp2_(input: Tensor) -> Tensor: ... +def exp_(input: Tensor) -> Tensor: ... +def expand_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, implicit: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.expand`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def expm1(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + expm1(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.expm1`. + """ + ... +def expm1_(input: Tensor) -> Tensor: ... +@overload +def eye(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + + Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + """ + ... +@overload +def eye(n: Union[_int, SymInt], m: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + + Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + """ + ... +def fake_quantize_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_channel_affine(input, scale, zero_point, axis, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized per channel using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`, across the channel specified by :attr:`axis`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), in ``torch.float32`` + scale (Tensor): quantization scale, per channel in ``torch.float32`` + zero_point (Tensor): quantization zero_point, per channel in ``torch.int32`` or ``torch.half`` or ``torch.float32`` + axis (int32): channel axis + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized per channel ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(2, 2, 2) + >>> x + tensor([[[-0.2525, -0.0466], + [ 0.3491, -0.2168]], + + [[-0.5906, 1.6258], + [ 0.6444, -0.0542]]]) + >>> scales = (torch.randn(2) + 1) * 0.05 + >>> scales + tensor([0.0475, 0.0486]) + >>> zero_points = torch.zeros(2).to(torch.int32) + >>> zero_points + tensor([0, 0]) + >>> torch.fake_quantize_per_channel_affine(x, scales, zero_points, 1, 0, 255) + tensor([[[0.0000, 0.0000], + [0.3405, 0.0000]], + + [[0.0000, 1.6134], + [0.6323, 0.0000]]]) + """ + ... +@overload +def fake_quantize_per_tensor_affine(input: Tensor, scale: _float, zero_point: _int, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + """ + ... +@overload +def fake_quantize_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + """ + ... +def fbgemm_linear_fp16_weight(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ... +def fbgemm_linear_fp16_weight_fp32_activation(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ... +def fbgemm_linear_int8_weight(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ... +def fbgemm_linear_int8_weight_fp32_activation(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ... +def fbgemm_linear_quantize_weight(input: Tensor) -> Tuple[Tensor, Tensor, _float, _int]: ... +def fbgemm_pack_gemm_matrix_fp16(input: Tensor) -> Tensor: ... +@overload +def fbgemm_pack_quantized_matrix(input: Tensor) -> Tensor: ... +@overload +def fbgemm_pack_quantized_matrix(input: Tensor, K: _int, N: _int) -> Tensor: ... +def feature_alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +@overload +def fill(input: Tensor, value: Tensor) -> Tensor: ... +@overload +def fill(input: Tensor, value: Union[Number, _complex]) -> Tensor: ... +@overload +def fill_(input: Tensor, value: Tensor) -> Tensor: ... +@overload +def fill_(input: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def fix(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fix(input, *, out=None) -> Tensor + + Alias for :func:`torch.trunc` + """ + ... +def fix_(input: Tensor) -> Tensor: ... +@overload +def flatten(input: Tensor, start_dim: _int = 0, end_dim: _int = -1) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, start_dim: _int, end_dim: _int, out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, start_dim: Union[str, ellipsis, None], end_dim: Union[str, ellipsis, None], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, dims: Sequence[Union[str, ellipsis, None]], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +def flip(input: Tensor, dims: _size) -> Tensor: + r""" + flip(input, dims) -> Tensor + + Reverse the order of an n-D tensor along given axis in dims. + + .. note:: + `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flip` is expected to be slower than `np.flip`. + + Args: + input (Tensor): the input tensor. + dims (a list or tuple): axis to flip on + + Example:: + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[ 0, 1], + [ 2, 3]], + + [[ 4, 5], + [ 6, 7]]]) + >>> torch.flip(x, [0, 1]) + tensor([[[ 6, 7], + [ 4, 5]], + + [[ 2, 3], + [ 0, 1]]]) + """ + ... +def fliplr(input: Tensor) -> Tensor: + r""" + fliplr(input) -> Tensor + + Flip tensor in the left/right direction, returning a new tensor. + + Flip the entries in each row in the left/right direction. + Columns are preserved, but appear in a different order than before. + + Note: + Requires the tensor to be at least 2-D. + + .. note:: + `torch.fliplr` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.fliplr`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.fliplr` is expected to be slower than `np.fliplr`. + + Args: + input (Tensor): Must be at least 2-dimensional. + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.fliplr(x) + tensor([[1, 0], + [3, 2]]) + """ + ... +def flipud(input: Tensor) -> Tensor: + r""" + flipud(input) -> Tensor + + Flip tensor in the up/down direction, returning a new tensor. + + Flip the entries in each column in the up/down direction. + Rows are preserved, but appear in a different order than before. + + Note: + Requires the tensor to be at least 1-D. + + .. note:: + `torch.flipud` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flipud`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flipud` is expected to be slower than `np.flipud`. + + Args: + input (Tensor): Must be at least 1-dimensional. + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.flipud(x) + tensor([[2, 3], + [0, 1]]) + """ + ... +@overload +def float_power(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +@overload +def float_power(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +@overload +def float_power(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +def floor(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor(input, *, out=None) -> Tensor + + Returns a new tensor with the floor of the elements of :attr:`input`, + the largest integer less than or equal to each element. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + .. math:: + \text{out}_{i} = \left\lfloor \text{input}_{i} \right\rfloor + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.8166, 1.5308, -0.2530, -0.2091]) + >>> torch.floor(a) + tensor([-1., 1., -1., -1.]) + """ + ... +def floor_(input: Tensor) -> Tensor: ... +def floor_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor_divide(input, other, *, out=None) -> Tensor + + .. note:: + + Before PyTorch 1.13 :func:`torch.floor_divide` incorrectly performed + truncation division. To restore the previous behavior use + :func:`torch.div` with ``rounding_mode='trunc'``. + + Computes :attr:`input` divided by :attr:`other`, elementwise, and floors + the result. + + .. math:: + \text{{out}}_i = \text{floor} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right) + + + + Supports broadcasting to a common shape, type promotion, and integer and float inputs. + + Args: + input (Tensor or Number): the dividend + other (Tensor or Number): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([4.0, 3.0]) + >>> b = torch.tensor([2.0, 2.0]) + >>> torch.floor_divide(a, b) + tensor([2.0, 1.0]) + >>> torch.floor_divide(a, 1.4) + tensor([2.0, 2.0]) + """ + ... +def fmax(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmax(input, other, *, out=None) -> Tensor + + Computes the element-wise maximum of :attr:`input` and :attr:`other`. + + This is like :func:`torch.maximum` except it handles NaNs differently: + if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the maximum. + Only if both elements are NaN is NaN propagated. + + This function is a wrapper around C++'s ``std::fmax`` and is similar to NumPy's ``fmax`` function. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and floating-point inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([9.7, float('nan'), 3.1, float('nan')]) + >>> b = torch.tensor([-2.2, 0.5, float('nan'), float('nan')]) + >>> torch.fmax(a, b) + tensor([9.7000, 0.5000, 3.1000, nan]) + """ + ... +def fmin(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmin(input, other, *, out=None) -> Tensor + + Computes the element-wise minimum of :attr:`input` and :attr:`other`. + + This is like :func:`torch.minimum` except it handles NaNs differently: + if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the minimum. + Only if both elements are NaN is NaN propagated. + + This function is a wrapper around C++'s ``std::fmin`` and is similar to NumPy's ``fmin`` function. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and floating-point inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([2.2, float('nan'), 2.1, float('nan')]) + >>> b = torch.tensor([-9.3, 0.1, float('nan'), float('nan')]) + >>> torch.fmin(a, b) + tensor([-9.3000, 0.1000, 2.1000, nan]) + """ + ... +@overload +def fmod(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmod(input, other, *, out=None) -> Tensor + + Applies C++'s `std::fmod `_ entrywise. + The result has the same sign as the dividend :attr:`input` and its absolute value + is less than that of :attr:`other`. + + This function may be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + + .. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + + .. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + + Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + """ + ... +@overload +def fmod(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmod(input, other, *, out=None) -> Tensor + + Applies C++'s `std::fmod `_ entrywise. + The result has the same sign as the dividend :attr:`input` and its absolute value + is less than that of :attr:`other`. + + This function may be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + + .. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + + .. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + + Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + """ + ... +def frac(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + frac(input, *, out=None) -> Tensor + + Computes the fractional portion of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = \text{input}_{i} - \left\lfloor |\text{input}_{i}| \right\rfloor * \operatorname{sgn}(\text{input}_{i}) + + Example:: + + >>> torch.frac(torch.tensor([1, 2.5, -3.2])) + tensor([ 0.0000, 0.5000, -0.2000]) + """ + ... +def frac_(input: Tensor) -> Tensor: ... +def frexp(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.frexp: + r""" + frexp(input, *, out=None) -> (Tensor mantissa, Tensor exponent) + + Decomposes :attr:`input` into mantissa and exponent tensors + such that :math:`\text{input} = \text{mantissa} \times 2^{\text{exponent}}`. + + The range of mantissa is the open interval (-1, 1). + + Supports float inputs. + + Args: + input (Tensor): the input tensor + + + Keyword args: + out (tuple, optional): the output tensors + + Example:: + + >>> x = torch.arange(9.) + >>> mantissa, exponent = torch.frexp(x) + >>> mantissa + tensor([0.0000, 0.5000, 0.5000, 0.7500, 0.5000, 0.6250, 0.7500, 0.8750, 0.5000]) + >>> exponent + tensor([0, 1, 2, 2, 3, 3, 3, 3, 4], dtype=torch.int32) + >>> torch.ldexp(mantissa, exponent) + tensor([0., 1., 2., 3., 4., 5., 6., 7., 8.]) + """ + ... +def frobenius_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +def from_file(filename: str, shared: Optional[_bool] = None, size: Optional[_int] = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + from_file(filename, shared=None, size=0, *, dtype=None, layout=None, device=None, pin_memory=False) + + Creates a CPU tensor with a storage backed by a memory-mapped file. + + If ``shared`` is True, then memory is shared between processes. All changes are written to the file. + If ``shared`` is False, then changes to the tensor do not affect the file. + + ``size`` is the number of elements in the Tensor. If ``shared`` is ``False``, then the file must contain + at least ``size * sizeof(dtype)`` bytes. If ``shared`` is ``True`` the file will be created if needed. + + .. note:: + Only CPU tensors can be mapped to files. + + .. note:: + For now, tensors with storages backed by a memory-mapped file cannot be created in pinned memory. + + + Args: + filename (str): file name to map + shared (bool): whether to share memory (whether ``MAP_SHARED`` or ``MAP_PRIVATE`` is passed to the + underlying `mmap(2) call `_) + size (int): number of elements in the tensor + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + >>> t = torch.randn(2, 5, dtype=torch.float64) + >>> t.numpy().tofile('storage.pt') + >>> t_mapped = torch.from_file('storage.pt', shared=False, size=10, dtype=torch.float64) + """ + ... +def from_numpy(ndarray) -> Tensor: + r""" + from_numpy(ndarray) -> Tensor + + Creates a :class:`Tensor` from a :class:`numpy.ndarray`. + + The returned tensor and :attr:`ndarray` share the same memory. Modifications to + the tensor will be reflected in the :attr:`ndarray` and vice versa. The returned + tensor is not resizable. + + It currently accepts :attr:`ndarray` with dtypes of ``numpy.float64``, + ``numpy.float32``, ``numpy.float16``, ``numpy.complex64``, ``numpy.complex128``, + ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``, + and ``bool``. + + .. warning:: + Writing to a tensor created from a read-only NumPy array is not supported and will result in undefined behavior. + + Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.from_numpy(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + """ + ... +def frombuffer(buffer: Any, *, dtype: _dtype, count: int = -1, offset: int = 0, requires_grad: _bool = False) -> Tensor: + r""" + frombuffer(buffer, *, dtype, count=-1, offset=0, requires_grad=False) -> Tensor + + Creates a 1-dimensional :class:`Tensor` from an object that implements + the Python buffer protocol. + + Skips the first :attr:`offset` bytes in the buffer, and interprets the rest of + the raw bytes as a 1-dimensional tensor of type :attr:`dtype` with :attr:`count` + elements. + + Note that either of the following must be true: + + 1. :attr:`count` is a positive non-zero number, and the total number of bytes + in the buffer is more than :attr:`offset` plus :attr:`count` times the size + (in bytes) of :attr:`dtype`. + + 2. :attr:`count` is negative, and the length (number of bytes) of the buffer + subtracted by the :attr:`offset` is a multiple of the size (in bytes) of + :attr:`dtype`. + + The returned tensor and buffer share the same memory. Modifications to + the tensor will be reflected in the buffer and vice versa. The returned + tensor is not resizable. + + .. note:: + This function increments the reference count for the object that + owns the shared memory. Therefore, such memory will not be deallocated + before the returned tensor goes out of scope. + + .. warning:: + This function's behavior is undefined when passed an object implementing + the buffer protocol whose data is not on the CPU. Doing so is likely to + cause a segmentation fault. + + .. warning:: + This function does not try to infer the :attr:`dtype` (hence, it is not + optional). Passing a different :attr:`dtype` than its source may result + in unexpected behavior. + + Args: + buffer (object): a Python object that exposes the buffer interface. + + Keyword args: + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + count (int, optional): the number of desired elements to be read. + If negative, all the elements (until the end of the buffer) will be + read. Default: -1. + offset (int, optional): the number of bytes to skip at the start of + the buffer. Default: 0. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> import array + >>> a = array.array('i', [1, 2, 3]) + >>> t = torch.frombuffer(a, dtype=torch.int32) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> # Interprets the signed char bytes as 32-bit integers. + >>> # Each 4 signed char elements will be interpreted as + >>> # 1 signed 32-bit integer. + >>> import array + >>> a = array.array('b', [-1, 0, 0, 0]) + >>> torch.frombuffer(a, dtype=torch.int32) + tensor([255], dtype=torch.int32) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, names: List[Union[str, None]], layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: Sequence[Union[_int, SymInt]], fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +def full_like(input: Tensor, fill_value: Union[Number, _complex], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full_like(input, fill_value, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. + ``torch.full_like(input, fill_value)`` is equivalent to + ``torch.full(input.size(), fill_value, dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + fill_value: the number to fill the output tensor with. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +def fused_moving_avg_obs_fake_quant(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> Tensor: ... +@overload +def gather(input: Tensor, dim: _int, index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + + Gathers values along an axis specified by `dim`. + + For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + :attr:`input` and :attr:`index` must have the same number of dimensions. + It is also required that ``index.size(d) <= input.size(d)`` for all + dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. + Note that ``input`` and ``index`` do not broadcast against each other. + + Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + + Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + + Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) + """ + ... +@overload +def gather(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + + Gathers values along an axis specified by `dim`. + + For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + :attr:`input` and :attr:`index` must have the same number of dimensions. + It is also required that ``index.size(d) <= input.size(d)`` for all + dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. + Note that ``input`` and ``index`` do not broadcast against each other. + + Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + + Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + + Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) + """ + ... +def gcd(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + gcd(input, other, *, out=None) -> Tensor + + Computes the element-wise greatest common divisor (GCD) of :attr:`input` and :attr:`other`. + + Both :attr:`input` and :attr:`other` must have integer types. + + .. note:: + This defines :math:`gcd(0, 0) = 0`. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.gcd(a, b) + tensor([1, 2, 5]) + >>> c = torch.tensor([3]) + >>> torch.gcd(a, c) + tensor([1, 1, 3]) + """ + ... +def gcd_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def ge(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ge(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \geq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) + """ + ... +@overload +def ge(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + ge(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \geq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) + """ + ... +def geqrf(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.geqrf: + r""" + geqrf(input, *, out=None) -> (Tensor, Tensor) + + This is a low-level function for calling LAPACK's geqrf directly. This function + returns a namedtuple (a, tau) as defined in `LAPACK documentation for geqrf`_ . + + Computes a QR decomposition of :attr:`input`. + Both `Q` and `R` matrices are stored in the same output tensor `a`. + The elements of `R` are stored on and above the diagonal. + Elementary reflectors (or Householder vectors) implicitly defining matrix `Q` + are stored below the diagonal. + The results of this function can be used together with :func:`torch.linalg.householder_product` + to obtain the `Q` matrix or + with :func:`torch.ormqr`, which uses an implicit representation of the `Q` matrix, + for an efficient matrix-matrix multiplication. + + See `LAPACK documentation for geqrf`_ for further details. + + .. note:: + See also :func:`torch.linalg.qr`, which computes Q and R matrices, and :func:`torch.linalg.lstsq` + with the ``driver="gels"`` option for a function that can solve matrix equations using a QR decomposition. + + Args: + input (Tensor): the input matrix + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, Tensor). Ignored if `None`. Default: `None`. + + .. _LAPACK documentation for geqrf: + http://www.netlib.org/lapack/explore-html/df/dc5/group__variants_g_ecomputational_ga3766ea903391b5cf9008132f7440ec7b.html + """ + ... +def ger(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ger(input, vec2, *, out=None) -> Tensor + + Alias of :func:`torch.outer`. + + .. warning:: + This function is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.outer` instead. + """ + ... +def get_default_dtype() -> _dtype: + r""" + get_default_dtype() -> torch.dtype + + Get the current default floating point :class:`torch.dtype`. + + Example:: + + >>> torch.get_default_dtype() # initial default for floating point is torch.float32 + torch.float32 + >>> torch.set_default_dtype(torch.float64) + >>> torch.get_default_dtype() # default is now changed to torch.float64 + torch.float64 + """ + ... +def get_num_interop_threads() -> _int: + r""" + get_num_interop_threads() -> int + + Returns the number of threads used for inter-op parallelism on CPU + (e.g. in JIT interpreter) + """ + ... +def get_num_threads() -> _int: + r""" + get_num_threads() -> int + + Returns the number of threads used for parallelizing CPU operations + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Optional[Union[Number, _complex]] = None, dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Number, _complex], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor’s theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def greater(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.gt`. + """ + ... +@overload +def greater(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.gt`. + """ + ... +@overload +def greater_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ge`. + """ + ... +@overload +def greater_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ge`. + """ + ... +def grid_sampler(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def grid_sampler_2d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def group_norm(input: Tensor, num_groups: _int, weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enabled: _bool = True) -> Tensor: ... +@overload +def gru(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def gru(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +@overload +def gt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + gt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} > \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + + Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) + """ + ... +@overload +def gt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + gt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} > \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + + Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) + """ + ... +@overload +def hamming_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, alpha: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, alpha: _float, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hann_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hann window function. + + .. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hann_window(L, periodic=True)`` equal to + ``torch.hann_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def hann_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hann window function. + + .. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hann_window(L, periodic=True)`` equal to + ``torch.hann_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def hardshrink(input: Tensor, lambd: Union[Number, _complex] = 0.5, *, out: Optional[Tensor] = None) -> Tensor: ... +def heaviside(input: Tensor, values: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + heaviside(input, values, *, out=None) -> Tensor + + Computes the Heaviside step function for each element in :attr:`input`. + The Heaviside step function is defined as: + + .. math:: + \text{{heaviside}}(input, values) = \begin{cases} + 0, & \text{if input < 0}\\ + values, & \text{if input == 0}\\ + 1, & \text{if input > 0} + \end{cases} + + + Args: + input (Tensor): the input tensor. + values (Tensor): The values to use where :attr:`input` is zero. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> input = torch.tensor([-1.5, 0, 2.0]) + >>> values = torch.tensor([0.5]) + >>> torch.heaviside(input, values) + tensor([0.0000, 0.5000, 1.0000]) + >>> values = torch.tensor([1.2, -2.0, 3.5]) + >>> torch.heaviside(input, values) + tensor([0., -2., 1.]) + """ + ... +def hinge_embedding_loss(input: Tensor, target: Tensor, margin: _float = 1.0, reduction: _int = 1) -> Tensor: ... +def histc(input: Tensor, bins: _int = 100, min: Union[Number, _complex] = 0, max: Union[Number, _complex] = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + histc(input, bins=100, min=0, max=0, *, out=None) -> Tensor + + Computes the histogram of a tensor. + + The elements are sorted into equal width bins between :attr:`min` and + :attr:`max`. If :attr:`min` and :attr:`max` are both zero, the minimum and + maximum values of the data are used. + + Elements lower than min and higher than max and ``NaN`` elements are ignored. + + Args: + input (Tensor): the input tensor. + bins (int): number of histogram bins + min (Scalar): lower end of the range (inclusive) + max (Scalar): upper end of the range (inclusive) + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: Histogram represented as a tensor + + Example:: + + >>> torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3) + tensor([ 0., 2., 1., 0.]) + """ + ... +@overload +def histogram(input: Tensor, bins: Tensor, *, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + + Computes a histogram of the values in a tensor. + + :attr:`bins` can be an integer or a 1D tensor. + + If :attr:`bins` is an int, it specifies the number of equal-width bins. + By default, the lower and upper range of the bins is determined by the + minimum and maximum elements of the input tensor. The :attr:`range` + argument can be provided to specify a range for the bins. + + If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges + including the rightmost edge. It should contain at least 2 elements + and its elements should be increasing. + + Args: + input (Tensor): the input tensor. + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + + Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + out (Tensor, optional): the output tensor. (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + + Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. + + Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + """ + ... +@overload +def histogram(input: Tensor, bins: _int = 100, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + + Computes a histogram of the values in a tensor. + + :attr:`bins` can be an integer or a 1D tensor. + + If :attr:`bins` is an int, it specifies the number of equal-width bins. + By default, the lower and upper range of the bins is determined by the + minimum and maximum elements of the input tensor. The :attr:`range` + argument can be provided to specify a range for the bins. + + If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges + including the rightmost edge. It should contain at least 2 elements + and its elements should be increasing. + + Args: + input (Tensor): the input tensor. + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + + Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + out (Tensor, optional): the output tensor. (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + + Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. + + Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + """ + ... +@overload +def histogramdd(input: Tensor, bins: _int, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +@overload +def histogramdd(input: Tensor, bins: _size, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +@overload +def histogramdd(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ... +@overload +def hsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + hsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors + horizontally according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + If :attr:`input` is one dimensional this is equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is + zero), and if :attr:`input` has two or more dimensions it's equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), + except that if :attr:`indices_or_sections` is an integer it must evenly divide + the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.hsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + """ + ... +@overload +def hsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + hsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors + horizontally according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + If :attr:`input` is one dimensional this is equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is + zero), and if :attr:`input` has two or more dimensions it's equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), + except that if :attr:`indices_or_sections` is an integer it must evenly divide + the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.hsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + """ + ... +def hspmm(mat1: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + hspmm(mat1, mat2, *, out=None) -> Tensor + + Performs a matrix multiplication of a :ref:`sparse COO matrix + ` :attr:`mat1` and a strided matrix :attr:`mat2`. The + result is a (1 + 1)-dimensional :ref:`hybrid COO matrix + `. + + Args: + mat1 (Tensor): the first sparse matrix to be matrix multiplied + mat2 (Tensor): the second strided matrix to be matrix multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + """ + ... +def hstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + hstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence horizontally (column wise). + + This is equivalent to concatenation along the first axis for 1-D tensors, and along the second axis for all other tensors. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.hstack((a,b)) + tensor([1, 2, 3, 4, 5, 6]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.hstack((a,b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + """ + ... +def hypot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + hypot(input, other, *, out=None) -> Tensor + + Given the legs of a right triangle, return its hypotenuse. + + .. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}^{2} + \text{other}_{i}^{2}} + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])) + tensor([5.0000, 5.6569, 6.4031]) + """ + ... +def i0(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + i0(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.i0`. + """ + ... +def i0_(input: Tensor) -> Tensor: ... +def igamma(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + igamma(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.gammainc`. + """ + ... +def igammac(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + igammac(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.gammaincc`. + """ + ... +def imag(input: Tensor) -> Tensor: + r""" + imag(input) -> Tensor + + Returns a new tensor containing imaginary values of the :attr:`self` tensor. + The returned tensor and :attr:`self` share the same underlying storage. + + .. warning:: + :func:`imag` is only supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.imag + tensor([ 0.3553, -0.7896, -0.0633, -0.8119]) + """ + ... +@overload +def index_add(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_copy(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_copy(input, dim, index, source, *, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_copy(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy(input, dim, index, source, *, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_fill(input: Tensor, dim: _int, index: Tensor, value: Tensor) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Tensor) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +def index_put_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +def index_reduce(input: Tensor, dim: _int, index: Tensor, source: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: + r""" + index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor + + See :meth:`~Tensor.index_reduce_` for function description. + """ + ... +@overload +def index_select(input: Tensor, dim: _int, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_select(input, dim, index, *, out=None) -> Tensor + + Returns a new tensor which indexes the :attr:`input` tensor along dimension + :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + + The returned tensor has the same number of dimensions as the original tensor + (:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length + of :attr:`index`; other dimensions have the same size as in the original tensor. + + .. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) + """ + ... +@overload +def index_select(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_select(input, dim, index, *, out=None) -> Tensor + + Returns a new tensor which indexes the :attr:`input` tensor along dimension + :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + + The returned tensor has the same number of dimensions as the original tensor + (:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length + of :attr:`index`; other dimensions have the same size as in the original tensor. + + .. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) + """ + ... +def indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def init_num_threads() -> None: ... +def inner(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + inner(input, other, *, out=None) -> Tensor + + Computes the dot product for 1D tensors. For higher dimensions, sums the product + of elements from :attr:`input` and :attr:`other` along their last dimension. + + .. note:: + + If either :attr:`input` or :attr:`other` is a scalar, the result is equivalent + to `torch.mul(input, other)`. + + If both :attr:`input` and :attr:`other` are non-scalars, the size of their last + dimension must match and the result is equivalent to `torch.tensordot(input, + other, dims=([-1], [-1]))` + + Args: + input (Tensor): First input tensor + other (Tensor): Second input tensor + + Keyword args: + out (Tensor, optional): Optional output tensor to write result into. The output + shape is `input.shape[:-1] + other.shape[:-1]`. + + Example:: + + # Dot product + >>> torch.inner(torch.tensor([1, 2, 3]), torch.tensor([0, 2, 1])) + tensor(7) + + # Multidimensional input tensors + >>> a = torch.randn(2, 3) + >>> a + tensor([[0.8173, 1.0874, 1.1784], + [0.3279, 0.1234, 2.7894]]) + >>> b = torch.randn(2, 4, 3) + >>> b + tensor([[[-0.4682, -0.7159, 0.1506], + [ 0.4034, -0.3657, 1.0387], + [ 0.9892, -0.6684, 0.1774], + [ 0.9482, 1.3261, 0.3917]], + + [[ 0.4537, 0.7493, 1.1724], + [ 0.2291, 0.5749, -0.2267], + [-0.7920, 0.3607, -0.3701], + [ 1.3666, -0.5850, -1.7242]]]) + >>> torch.inner(a, b) + tensor([[[-0.9837, 1.1560, 0.2907, 2.6785], + [ 2.5671, 0.5452, -0.6912, -1.5509]], + + [[ 0.1782, 2.9843, 0.7366, 1.5672], + [ 3.5115, -0.4864, -1.2476, -4.4337]]]) + + # Scalar input + >>> torch.inner(a, torch.tensor(2)) + tensor([[1.6347, 2.1748, 2.3567], + [0.6558, 0.2469, 5.5787]]) + """ + ... +def instance_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], use_input_stats: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ... +def int_repr(input: Tensor) -> Tensor: ... +def inverse(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + inverse(input, *, out=None) -> Tensor + + Alias for :func:`torch.linalg.inv` + """ + ... +def is_complex(input: Tensor) -> _bool: + r""" + is_complex(input) -> (bool) + + Returns True if the data type of :attr:`input` is a complex data type i.e., + one of ``torch.complex64``, and ``torch.complex128``. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_conj(input: Tensor) -> _bool: + r""" + is_conj(input) -> (bool) + + Returns True if the :attr:`input` is a conjugated tensor, i.e. its conjugate bit is set to `True`. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_distributed(input: Tensor) -> _bool: ... +def is_floating_point(input: Tensor) -> _bool: + r""" + is_floating_point(input) -> (bool) + + Returns True if the data type of :attr:`input` is a floating point data type i.e., + one of ``torch.float64``, ``torch.float32``, ``torch.float16``, and ``torch.bfloat16``. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_grad_enabled() -> _bool: + r""" + is_grad_enabled() -> (bool) + + Returns True if grad mode is currently enabled. + """ + ... +def is_inference(input: Tensor) -> _bool: + r""" + is_inference(input) -> (bool) + + Returns True if :attr:`input` is an inference tensor. + + A non-view tensor is an inference tensor if and only if it was + allocated during inference mode. A view tensor is an inference + tensor if and only if the tensor it is a view of is an inference tensor. + + For details on inference mode please see + `Inference Mode `_. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_inference_mode_enabled() -> _bool: + r""" + is_inference_mode_enabled() -> (bool) + + Returns True if inference mode is currently enabled. + """ + ... +def is_neg(input: Tensor) -> _bool: ... +def is_nonzero(input: Tensor) -> _bool: + r""" + is_nonzero(input) -> (bool) + + Returns True if the :attr:`input` is a single element tensor which is not equal to zero + after type conversions. + i.e. not equal to ``torch.tensor([0.])`` or ``torch.tensor([0])`` or + ``torch.tensor([False])``. + Throws a ``RuntimeError`` if ``torch.numel() != 1`` (even in case + of sparse tensors). + + Args: + input (Tensor): the input tensor. + + Examples:: + + >>> torch.is_nonzero(torch.tensor([0.])) + False + >>> torch.is_nonzero(torch.tensor([1.5])) + True + >>> torch.is_nonzero(torch.tensor([False])) + False + >>> torch.is_nonzero(torch.tensor([3])) + True + >>> torch.is_nonzero(torch.tensor([1, 3, 5])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with more than one value is ambiguous + >>> torch.is_nonzero(torch.tensor([])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with no values is ambiguous + """ + ... +def is_same_size(input: Tensor, other: Tensor) -> _bool: ... +def is_signed(input: Tensor) -> _bool: ... +def is_vulkan_available() -> _bool: ... +def isclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> Tensor: + r""" + isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + + Returns a new tensor with boolean elements representing if each element of + :attr:`input` is "close" to the corresponding element of :attr:`other`. + Closeness is defined as: + + .. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert + + + where :attr:`input` and :attr:`other` are finite. Where :attr:`input` + and/or :attr:`other` are nonfinite they are close if and only if + they are equal, with NaNs being considered equal to each other when + :attr:`equal_nan` is True. + + Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. Default: ``False`` + + Examples:: + + >>> torch.isclose(torch.tensor((1., 2, 3)), torch.tensor((1 + 1e-10, 3, 4))) + tensor([ True, False, False]) + >>> torch.isclose(torch.tensor((float('inf'), 4)), torch.tensor((float('inf'), 6)), rtol=.5) + tensor([True, True]) + """ + ... +def isfinite(input: Tensor) -> Tensor: + r""" + isfinite(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element is `finite` or not. + + Real values are finite when they are not NaN, negative infinity, or infinity. + Complex values are finite when both their real and imaginary parts are finite. + + Args: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is finite and False elsewhere + + Example:: + + >>> torch.isfinite(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([True, False, True, False, False]) + """ + ... +@overload +def isin(elements: Tensor, test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +@overload +def isin(element: Union[Number, _complex], test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +@overload +def isin(elements: Tensor, test_element: Union[Number, _complex], *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +def isinf(input: Tensor) -> Tensor: + r""" + isinf(input) -> Tensor + + Tests if each element of :attr:`input` is infinite + (positive or negative infinity) or not. + + .. note:: + Complex values are infinite when their real or imaginary part is + infinite. + + Args: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is infinite and False elsewhere + + Example:: + + >>> torch.isinf(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([False, True, False, True, False]) + """ + ... +def isnan(input: Tensor) -> Tensor: + r""" + isnan(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element of :attr:`input` + is NaN or not. Complex values are considered NaN when either their real + and/or imaginary part is NaN. + + Arguments: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is NaN and False elsewhere + + Example:: + + >>> torch.isnan(torch.tensor([1, float('nan'), 2])) + tensor([False, True, False]) + """ + ... +def isneginf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + isneginf(input, *, out=None) -> Tensor + Tests if each element of :attr:`input` is negative infinity or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isneginf(a) + tensor([ True, False, False]) + """ + ... +def isposinf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + isposinf(input, *, out=None) -> Tensor + Tests if each element of :attr:`input` is positive infinity or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isposinf(a) + tensor([False, True, False]) + """ + ... +def isreal(input: Tensor) -> Tensor: + r""" + isreal(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element of :attr:`input` is real-valued or not. + All real-valued types are considered real. Complex values are considered real when their imaginary part is 0. + + Arguments: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is real and False elsewhere + + Example:: + + >>> torch.isreal(torch.tensor([1, 1+1j, 2+0j])) + tensor([True, False, True]) + """ + ... +def istft(input: Tensor, n_fft: _int, hop_length: Optional[_int] = None, win_length: Optional[_int] = None, window: Optional[Tensor] = None, center: _bool = True, normalized: _bool = False, onesided: Optional[_bool] = None, length: Optional[_int] = None, return_complex: _bool = False) -> Tensor: ... +@overload +def kaiser_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +@overload +def kaiser_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +@overload +def kaiser_window(window_length: _int, periodic: _bool, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +def kl_div(input: Tensor, target: Tensor, reduction: _int = 1, *, log_target: _bool = False) -> Tensor: ... +def kron(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + kron(input, other, *, out=None) -> Tensor + + Computes the Kronecker product, denoted by :math:`\otimes`, of :attr:`input` and :attr:`other`. + + If :attr:`input` is a :math:`(a_0 \times a_1 \times \dots \times a_n)` tensor and :attr:`other` is a + :math:`(b_0 \times b_1 \times \dots \times b_n)` tensor, the result will be a + :math:`(a_0*b_0 \times a_1*b_1 \times \dots \times a_n*b_n)` tensor with the following entries: + + .. math:: + (\text{input} \otimes \text{other})_{k_0, k_1, \dots, k_n} = + \text{input}_{i_0, i_1, \dots, i_n} * \text{other}_{j_0, j_1, \dots, j_n}, + + where :math:`k_t = i_t * b_t + j_t` for :math:`0 \leq t \leq n`. + If one tensor has fewer dimensions than the other it is unsqueezed until it has the same number of dimensions. + + Supports real-valued and complex-valued inputs. + + .. note:: + This function generalizes the typical definition of the Kronecker product for two matrices to two tensors, + as described above. When :attr:`input` is a :math:`(m \times n)` matrix and :attr:`other` is a + :math:`(p \times q)` matrix, the result will be a :math:`(p*m \times q*n)` block matrix: + + .. math:: + \mathbf{A} \otimes \mathbf{B}=\begin{bmatrix} + a_{11} \mathbf{B} & \cdots & a_{1 n} \mathbf{B} \\ + \vdots & \ddots & \vdots \\ + a_{m 1} \mathbf{B} & \cdots & a_{m n} \mathbf{B} \end{bmatrix} + + where :attr:`input` is :math:`\mathbf{A}` and :attr:`other` is :math:`\mathbf{B}`. + + Arguments: + input (Tensor) + other (Tensor) + + Keyword args: + out (Tensor, optional): The output tensor. Ignored if ``None``. Default: ``None`` + + Examples:: + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.ones(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 1., 0., 0.], + [1., 1., 0., 0.], + [0., 0., 1., 1.], + [0., 0., 1., 1.]]) + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.arange(1, 5).reshape(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 2., 0., 0.], + [3., 4., 0., 0.], + [0., 0., 1., 2.], + [0., 0., 3., 4.]]) + """ + ... +@overload +def kthvalue(input: Tensor, k: _int, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: + r""" + kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th + smallest element of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each element found. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors + are the same size as :attr:`input`, except in the dimension :attr:`dim` where + they are of size 1. Otherwise, :attr:`dim` is squeezed + (see :func:`torch.squeeze`), resulting in both the :attr:`values` and + :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + + .. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + + Args: + input (Tensor): the input tensor. + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) + """ + ... +@overload +def kthvalue(input: Tensor, k: _int, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: + r""" + kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th + smallest element of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each element found. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors + are the same size as :attr:`input`, except in the dimension :attr:`dim` where + they are of size 1. Otherwise, :attr:`dim` is squeezed + (see :func:`torch.squeeze`), resulting in both the :attr:`values` and + :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + + .. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + + Args: + input (Tensor): the input tensor. + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) + """ + ... +def layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enable: _bool = True) -> Tensor: ... +def lcm(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lcm(input, other, *, out=None) -> Tensor + + Computes the element-wise least common multiple (LCM) of :attr:`input` and :attr:`other`. + + Both :attr:`input` and :attr:`other` must have integer types. + + .. note:: + This defines :math:`lcm(0, 0) = 0` and :math:`lcm(0, a) = 0`. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.lcm(a, b) + tensor([15, 20, 15]) + >>> c = torch.tensor([3]) + >>> torch.lcm(a, c) + tensor([15, 30, 15]) + """ + ... +def lcm_(input: Tensor, other: Tensor) -> Tensor: ... +def ldexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ldexp(input, other, *, out=None) -> Tensor + + Multiplies :attr:`input` by 2 ** :attr:`other`. + + .. math:: + \text{{out}}_i = \text{{input}}_i * 2^\text{{other}}_i + + + Typically this function is used to construct floating point numbers by multiplying + mantissas in :attr:`input` with integral powers of two created from the exponents + in :attr:`other`. + + Args: + input (Tensor): the input tensor. + other (Tensor): a tensor of exponents, typically integers. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.ldexp(torch.tensor([1.]), torch.tensor([1])) + tensor([2.]) + >>> torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4])) + tensor([ 2., 4., 8., 16.]) + """ + ... +def ldexp_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def le(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + le(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \leq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + + Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) + """ + ... +@overload +def le(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + le(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \leq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + + Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) + """ + ... +@overload +def lerp(input: Tensor, end: Tensor, weight: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lerp(input, end, weight, *, out=None) + + Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based + on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + + .. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) + + The shapes of :attr:`start` and :attr:`end` must be + :ref:`broadcastable `. If :attr:`weight` is a tensor, then + the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + + Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + """ + ... +@overload +def lerp(input: Tensor, end: Tensor, weight: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + lerp(input, end, weight, *, out=None) + + Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based + on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + + .. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) + + The shapes of :attr:`start` and :attr:`end` must be + :ref:`broadcastable `. If :attr:`weight` is a tensor, then + the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + + Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + """ + ... +@overload +def less(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + less(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.lt`. + """ + ... +@overload +def less(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + less(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.lt`. + """ + ... +@overload +def less_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + less_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.le`. + """ + ... +@overload +def less_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + less_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.le`. + """ + ... +def lgamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lgamma(input, *, out=None) -> Tensor + + Computes the natural logarithm of the absolute value of the gamma function on :attr:`input`. + + .. math:: + \text{out}_{i} = \ln |\Gamma(\text{input}_{i})| + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.arange(0.5, 2, 0.5) + >>> torch.lgamma(a) + tensor([ 0.5724, 0.0000, -0.1208]) + """ + ... +@overload +def linspace(start: Number, end: Number, steps: Optional[_int] = None, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Tensor, end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Union[Number, _complex], end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Tensor, end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +def log(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log(input, *, out=None) -> Tensor + + Returns a new tensor with the natural logarithm of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{e} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) * 5 + >>> a + tensor([4.7767, 4.3234, 1.2156, 0.2411, 4.5739]) + >>> torch.log(a) + tensor([ 1.5637, 1.4640, 0.1952, -1.4226, 1.5204]) + """ + ... +def log10(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log10(input, *, out=None) -> Tensor + + Returns a new tensor with the logarithm to the base 10 of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{10} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.5224, 0.9354, 0.7257, 0.1301, 0.2251]) + + + >>> torch.log10(a) + tensor([-0.2820, -0.0290, -0.1392, -0.8857, -0.6476]) + """ + ... +def log10_(input: Tensor) -> Tensor: ... +def log1p(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log1p(input, *, out=None) -> Tensor + + Returns a new tensor with the natural logarithm of (1 + :attr:`input`). + + .. math:: + y_i = \log_{e} (x_i + 1) + + .. note:: This function is more accurate than :func:`torch.log` for small + values of :attr:`input` + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.0090, -0.9923, 1.0249, -0.5372, 0.2492]) + >>> torch.log1p(a) + tensor([ nan, -4.8653, 0.7055, -0.7705, 0.2225]) + """ + ... +def log1p_(input: Tensor) -> Tensor: ... +def log2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log2(input, *, out=None) -> Tensor + + Returns a new tensor with the logarithm to the base 2 of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{2} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.8419, 0.8003, 0.9971, 0.5287, 0.0490]) + + + >>> torch.log2(a) + tensor([-0.2483, -0.3213, -0.0042, -0.9196, -4.3504]) + """ + ... +def log2_(input: Tensor) -> Tensor: ... +def log_(input: Tensor) -> Tensor: ... +@overload +def log_softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def log_softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: ... +def logaddexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logaddexp(input, other, *, out=None) -> Tensor + + Logarithm of the sum of exponentiations of the inputs. + + Calculates pointwise :math:`\log\left(e^x + e^y\right)`. This function is useful + in statistics where the calculated probabilities of events may be so small as to + exceed the range of normal floating point numbers. In such cases the logarithm + of the calculated probability is stored. This function allows adding + probabilities stored in such a fashion. + + This op should be disambiguated with :func:`torch.logsumexp` which performs a + reduction on a single tensor. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1.0, -2, -3])) + tensor([-0.3069, -0.6867, -0.8731]) + >>> torch.logaddexp(torch.tensor([-100.0, -200, -300]), torch.tensor([-1.0, -2, -3])) + tensor([-1., -2., -3.]) + >>> torch.logaddexp(torch.tensor([1.0, 2000, 30000]), torch.tensor([-1.0, -2, -3])) + tensor([1.1269e+00, 2.0000e+03, 3.0000e+04]) + """ + ... +def logaddexp2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logaddexp2(input, other, *, out=None) -> Tensor + + Logarithm of the sum of exponentiations of the inputs in base-2. + + Calculates pointwise :math:`\log_2\left(2^x + 2^y\right)`. See + :func:`torch.logaddexp` for more details. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + """ + ... +@overload +def logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logcumsumexp(input, dim, *, out=None) -> Tensor + Returns the logarithm of the cumulative summation of the exponentiation of + elements of :attr:`input` in the dimension :attr:`dim`. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{j=0}^{i} \exp(x_{ij}) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) + """ + ... +@overload +def logcumsumexp(input: Tensor, dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + logcumsumexp(input, dim, *, out=None) -> Tensor + Returns the logarithm of the cumulative summation of the exponentiation of + elements of :attr:`input` in the dimension :attr:`dim`. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{j=0}^{i} \exp(x_{ij}) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) + """ + ... +def logdet(input: Tensor) -> Tensor: + r""" + logdet(input) -> Tensor + + Calculates log determinant of a square matrix or batches of square matrices. + + It returns ``-inf`` if the input has a determinant of zero, and ``NaN`` if it has + a negative determinant. + + .. note:: + Backward through :meth:`logdet` internally uses SVD results when :attr:`input` + is not invertible. In this case, double backward through :meth:`logdet` will + be unstable in when :attr:`input` doesn't have distinct singular values. See + :func:`torch.linalg.svd` for details. + + .. seealso:: + + :func:`torch.linalg.slogdet` computes the sign (resp. angle) and natural logarithm of the + absolute value of the determinant of real-valued (resp. complex) square matrices. + + Arguments: + input (Tensor): the input tensor of size ``(*, n, n)`` where ``*`` is zero or more + batch dimensions. + + Example:: + + >>> A = torch.randn(3, 3) + >>> torch.det(A) + tensor(0.2611) + >>> torch.logdet(A) + tensor(-1.3430) + >>> A + tensor([[[ 0.9254, -0.6213], + [-0.5787, 1.6843]], + + [[ 0.3242, -0.9665], + [ 0.4539, -0.0887]], + + [[ 1.1336, -0.4025], + [-0.7089, 0.9032]]]) + >>> A.det() + tensor([1.1990, 0.4099, 0.7386]) + >>> A.det().log() + tensor([ 0.1815, -0.8917, -0.3031]) + """ + ... +def logical_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_and(input, other, *, out=None) -> Tensor + + Computes the element-wise logical AND of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute AND with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_and(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, False]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_and(a, b) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b.double()) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b) + tensor([False, False, True, False]) + >>> torch.logical_and(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([False, False, True, False]) + """ + ... +def logical_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_not(input, *, out=None) -> Tensor + + Computes the element-wise logical NOT of the given input tensor. If not specified, the output tensor will have the bool + dtype. If the input tensor is not a bool tensor, zeros are treated as ``False`` and non-zeros are treated as ``True``. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_not(torch.tensor([True, False])) + tensor([False, True]) + >>> torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1.5, -10.], dtype=torch.double)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1., -10.], dtype=torch.double), out=torch.empty(3, dtype=torch.int16)) + tensor([1, 0, 0], dtype=torch.int16) + """ + ... +def logical_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_or(input, other, *, out=None) -> Tensor + + Computes the element-wise logical OR of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute OR with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_or(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_or(a, b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b.double()) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, True, False]) + """ + ... +def logical_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_xor(input, other, *, out=None) -> Tensor + + Computes the element-wise logical XOR of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute XOR with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_xor(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([False, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_xor(a, b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b.double()) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, False, False]) + """ + ... +def logit(input: Tensor, eps: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logit(input, eps=None, *, out=None) -> Tensor + + Alias for :func:`torch.special.logit`. + """ + ... +def logit_(input: Tensor, eps: Optional[_float] = None) -> Tensor: ... +@overload +def logspace(start: Number, end: Number, steps: Optional[_int] = None, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Tensor, end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Union[Number, _complex], end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Tensor, end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logsumexp(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logsumexp(input, dim, keepdim=False, *, out=None) + + Returns the log of summed exponentials of each row of the :attr:`input` + tensor in the given dimension :attr:`dim`. The computation is numerically + stabilized. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij}) + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) + """ + ... +@overload +def logsumexp(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logsumexp(input, dim, keepdim=False, *, out=None) + + Returns the log of summed exponentials of each row of the :attr:`input` + tensor in the given dimension :attr:`dim`. The computation is numerically + stabilized. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij}) + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) + """ + ... +@overload +def lstm(data: Tensor, batch_sizes: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def lstm(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor]: ... +def lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: ... +@overload +def lt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} < \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + + Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) + """ + ... +@overload +def lt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + lt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} < \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + + Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) + """ + ... +def lu_solve(input: Tensor, LU_data: Tensor, LU_pivots: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lu_solve(b, LU_data, LU_pivots, *, out=None) -> Tensor + + Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted + LU factorization of A from :func:`~linalg.lu_factor`. + + This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + + .. warning:: + + :func:`torch.lu_solve` is deprecated in favor of :func:`torch.linalg.lu_solve`. + :func:`torch.lu_solve` will be removed in a future PyTorch release. + ``X = torch.lu_solve(B, LU, pivots)`` should be replaced with + + .. code:: python + + X = linalg.lu_solve(LU, pivots, B) + + Arguments: + b (Tensor): the RHS tensor of size :math:`(*, m, k)`, where :math:`*` + is zero or more batch dimensions. + LU_data (Tensor): the pivoted LU factorization of A from :meth:`~linalg.lu_factor` of size :math:`(*, m, m)`, + where :math:`*` is zero or more batch dimensions. + LU_pivots (IntTensor): the pivots of the LU factorization from :meth:`~linalg.lu_factor` of size :math:`(*, m)`, + where :math:`*` is zero or more batch dimensions. + The batch dimensions of :attr:`LU_pivots` must be equal to the batch dimensions of + :attr:`LU_data`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> A = torch.randn(2, 3, 3) + >>> b = torch.randn(2, 3, 1) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> x = torch.lu_solve(b, LU, pivots) + >>> torch.dist(A @ x, b) + tensor(1.00000e-07 * + 2.8312) + """ + ... +def lu_unpack(LU_data: Tensor, LU_pivots: Tensor, unpack_data: _bool = True, unpack_pivots: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.lu_unpack: + r""" + lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True, *, out=None) -> (Tensor, Tensor, Tensor) + + Unpacks the LU decomposition returned by :func:`~linalg.lu_factor` into the `P, L, U` matrices. + + .. seealso:: + + :func:`~linalg.lu` returns the matrices from the LU decomposition. Its gradient formula is more efficient + than that of doing :func:`~linalg.lu_factor` followed by :func:`~linalg.lu_unpack`. + + Args: + LU_data (Tensor): the packed LU factorization data + LU_pivots (Tensor): the packed LU factorization pivots + unpack_data (bool): flag indicating if the data should be unpacked. + If ``False``, then the returned ``L`` and ``U`` are empty tensors. + Default: ``True`` + unpack_pivots (bool): flag indicating if the pivots should be unpacked into a permutation matrix ``P``. + If ``False``, then the returned ``P`` is an empty tensor. + Default: ``True`` + + Keyword args: + out (tuple, optional): output tuple of three tensors. Ignored if `None`. + + Returns: + A namedtuple ``(P, L, U)`` + + Examples:: + + >>> A = torch.randn(2, 3, 3) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # We can recover A from the factorization + >>> A_ = P @ L @ U + >>> torch.allclose(A, A_) + True + + >>> # LU factorization of a rectangular matrix: + >>> A = torch.randn(2, 3, 2) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # P, L, U are the same as returned by linalg.lu + >>> P_, L_, U_ = torch.linalg.lu(A) + >>> torch.allclose(P, P_) and torch.allclose(L, L_) and torch.allclose(U, U_) + True + """ + ... +def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ... +@overload +def masked_fill(input: Tensor, mask: Tensor, value: Tensor) -> Tensor: ... +@overload +def masked_fill(input: Tensor, mask: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def masked_scatter(input: Tensor, mask: Tensor, source: Tensor) -> Tensor: ... +def masked_select(input: Tensor, mask: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + masked_select(input, mask, *, out=None) -> Tensor + + Returns a new 1-D tensor which indexes the :attr:`input` tensor according to + the boolean mask :attr:`mask` which is a `BoolTensor`. + + The shapes of the :attr:`mask` tensor and the :attr:`input` tensor don't need + to match, but they must be :ref:`broadcastable `. + + .. note:: The returned tensor does **not** use the same storage + as the original tensor + + Args: + input (Tensor): the input tensor. + mask (BoolTensor): the tensor containing the binary mask to index with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.3552, -2.3825, -0.8297, 0.3477], + [-1.2035, 1.2252, 0.5002, 0.6248], + [ 0.1307, -2.0608, 0.1244, 2.0139]]) + >>> mask = x.ge(0.5) + >>> mask + tensor([[False, False, False, False], + [False, True, True, True], + [False, False, False, True]]) + >>> torch.masked_select(x, mask) + tensor([ 1.2252, 0.5002, 0.6248, 2.0139]) + """ + ... +def matmul(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + matmul(input, other, *, out=None) -> Tensor + + Matrix product of two tensors. + + The behavior depends on the dimensionality of the tensors as follows: + + - If both tensors are 1-dimensional, the dot product (scalar) is returned. + - If both arguments are 2-dimensional, the matrix-matrix product is returned. + - If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed. + - If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. + - If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (i.e. batch) dimensions are :ref:`broadcasted ` (and thus + must be broadcastable). For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)` + tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor. + + Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs + are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)` + tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the + matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor. + + This operation has support for arguments with :ref:`sparse layouts`. In particular the + matrix-matrix (both arguments 2-dimensional) supports sparse arguments with the same restrictions + as :func:`torch.mm` + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + .. note:: + + The 1-dimensional dot product version of this function does not support an :attr:`out` parameter. + + Arguments: + input (Tensor): the first tensor to be multiplied + other (Tensor): the second tensor to be multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> # vector x vector + >>> tensor1 = torch.randn(3) + >>> tensor2 = torch.randn(3) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([]) + >>> # matrix x vector + >>> tensor1 = torch.randn(3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([3]) + >>> # batched matrix x broadcasted vector + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3]) + >>> # batched matrix x batched matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(10, 4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + >>> # batched matrix x broadcasted matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + """ + ... +def matrix_exp(input: Tensor) -> Tensor: + r""" + matrix_exp(A) -> Tensor + + Alias for :func:`torch.linalg.matrix_exp`. + """ + ... +def matrix_power(input: Tensor, n: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + matrix_power(input, n, *, out=None) -> Tensor + + Alias for :func:`torch.linalg.matrix_power` + """ + ... +@overload +def max(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +def max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def max_pool1d_with_indices(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tuple[Tensor, Tensor]: ... +def max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def maximum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + maximum(input, other, *, out=None) -> Tensor + + Computes the element-wise maximum of :attr:`input` and :attr:`other`. + + .. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`maximum` is not supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.maximum(a, b) + tensor([3, 2, 4]) + """ + ... +@overload +def mean(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def mean(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def median(input: Tensor) -> Tensor: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def median(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def median(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def min(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +def minimum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + minimum(input, other, *, out=None) -> Tensor + + Computes the element-wise minimum of :attr:`input` and :attr:`other`. + + .. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`minimum` is not supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.minimum(a, b) + tensor([1, 0, -1]) + """ + ... +def miopen_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def miopen_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def miopen_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def miopen_convolution_transpose(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_depthwise_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: _int, num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: _size, dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def mkldnn_adaptive_avg_pool2d(input: Tensor, output_size: Union[_int, _size], *, out: Optional[Tensor] = None) -> Tensor: ... +def mkldnn_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def mkldnn_linear_backward_weights(grad_output: Tensor, input: Tensor, weight: Tensor, bias_defined: _bool) -> Tuple[Tensor, Tensor]: ... +def mkldnn_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def mkldnn_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def mkldnn_rnn_layer(input: Tensor, weight0: Tensor, weight1: Tensor, weight2: Tensor, weight3: Tensor, hx_: Tensor, cx_: Tensor, reverse: _bool, batch_sizes: _size, mode: _int, hidden_size: _int, num_layers: _int, has_biases: _bool, bidirectional: _bool, batch_first: _bool, train: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mm(input, mat2, *, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`input` and :attr:`mat2`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, :attr:`out` will be a :math:`(n \times p)` tensor. + + .. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + + Supports strided and sparse 2-D tensors as inputs, autograd with + respect to strided inputs. + + This operation has support for arguments with :ref:`sparse layouts`. + If :attr:`out` is provided it's layout will be used. Otherwise, the result + layout will be deduced from that of :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.mm(mat1, mat2) + tensor([[ 0.4851, 0.5037, -0.3633], + [-0.0760, -3.6705, 2.4784]]) + """ + ... +@overload +def mode(input: Tensor, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: + r""" + mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the mode + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`, i.e. a value which appears most often + in that row, and ``indices`` is the index location of each mode value found. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) + """ + ... +@overload +def mode(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: + r""" + mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the mode + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`, i.e. a value which appears most often + in that row, and ``indices`` is the index location of each mode value found. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) + """ + ... +@overload +def moveaxis(input: Tensor, source: _int, destination: _int) -> Tensor: + r""" + moveaxis(input, source, destination) -> Tensor + + Alias for :func:`torch.movedim`. + + This function is equivalent to NumPy's moveaxis function. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def moveaxis(input: Tensor, source: _size, destination: _size) -> Tensor: + r""" + moveaxis(input, source, destination) -> Tensor + + Alias for :func:`torch.movedim`. + + This function is equivalent to NumPy's moveaxis function. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def movedim(input: Tensor, source: _int, destination: _int) -> Tensor: + r""" + movedim(input, source, destination) -> Tensor + + Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` + to the position(s) in :attr:`destination`. + + Other dimensions of :attr:`input` that are not explicitly moved remain in + their original order and appear at the positions not specified in :attr:`destination`. + + Args: + input (Tensor): the input tensor. + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def movedim(input: Tensor, source: _size, destination: _size) -> Tensor: + r""" + movedim(input, source, destination) -> Tensor + + Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` + to the position(s) in :attr:`destination`. + + Other dimensions of :attr:`input` that are not explicitly moved remain in + their original order and appear at the positions not specified in :attr:`destination`. + + Args: + input (Tensor): the input tensor. + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +def msort(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + msort(input, *, out=None) -> Tensor + + Sorts the elements of the :attr:`input` tensor along its first dimension + in ascending order by value. + + .. note:: `torch.msort(t)` is equivalent to `torch.sort(t, dim=0)[0]`. + See also :func:`torch.sort`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(3, 4) + >>> t + tensor([[-0.1321, 0.4370, -1.2631, -1.1289], + [-2.0527, -1.1250, 0.2275, 0.3077], + [-0.0881, -0.1259, -0.5495, 1.0284]]) + >>> torch.msort(t) + tensor([[-2.0527, -1.1250, -1.2631, -1.1289], + [-0.1321, -0.1259, -0.5495, 0.3077], + [-0.0881, 0.4370, 0.2275, 1.0284]]) + """ + ... +def mul(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + mul(input, other, *, out=None) -> Tensor + + Multiplies :attr:`input` by :attr:`other`. + + + .. math:: + \text{out}_i = \text{input}_i \times \text{other}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number) - the tensor or number to multiply input by. + + Keyword args: + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.2015, -0.4255, 2.6087]) + >>> torch.mul(a, 100) + tensor([ 20.1494, -42.5491, 260.8663]) + + >>> b = torch.randn(4, 1) + >>> b + tensor([[ 1.1207], + [-0.3137], + [ 0.0700], + [ 0.8378]]) + >>> c = torch.randn(1, 4) + >>> c + tensor([[ 0.5146, 0.1216, -0.5244, 2.2382]]) + >>> torch.mul(b, c) + tensor([[ 0.5767, 0.1363, -0.5877, 2.5083], + [-0.1614, -0.0382, 0.1645, -0.7021], + [ 0.0360, 0.0085, -0.0367, 0.1567], + [ 0.4312, 0.1019, -0.4394, 1.8753]]) + """ + ... +def multinomial(input: Tensor, num_samples: _int, replacement: _bool = False, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + multinomial(input, num_samples, replacement=False, *, generator=None, out=None) -> LongTensor + + Returns a tensor where each row contains :attr:`num_samples` indices sampled + from the multinomial (a stricter definition would be multivariate, + refer to torch.distributions.multinomial.Multinomial for more details) + probability distribution located in the corresponding row + of tensor :attr:`input`. + + .. note:: + The rows of :attr:`input` do not need to sum to one (in which case we use + the values as weights), but must be non-negative, finite and have + a non-zero sum. + + Indices are ordered from left to right according to when each was sampled + (first samples are placed in first column). + + If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`. + + If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape + :math:`(m \times \text{num\_samples})`. + + If replacement is ``True``, samples are drawn with replacement. + + If not, they are drawn without replacement, which means that when a + sample index is drawn for a row, it cannot be drawn again for that row. + + .. note:: + When drawn without replacement, :attr:`num_samples` must be lower than + number of non-zero elements in :attr:`input` (or the min number of non-zero + elements in each row of :attr:`input` if it is a matrix). + + Args: + input (Tensor): the input tensor containing probabilities + num_samples (int): number of samples to draw + replacement (bool, optional): whether to draw with replacement or not + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) # create a tensor of weights + >>> torch.multinomial(weights, 2) + tensor([1, 2]) + >>> torch.multinomial(weights, 4) # ERROR! + RuntimeError: invalid argument 2: invalid multinomial distribution (with replacement=False, + not enough non-negative category to sample) at ../aten/src/TH/generic/THTensorRandom.cpp:320 + >>> torch.multinomial(weights, 4, replacement=True) + tensor([ 2, 1, 1, 1]) + """ + ... +@overload +def multiply(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + multiply(input, other, *, out=None) + + Alias for :func:`torch.mul`. + """ + ... +@overload +def multiply(input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + multiply(input, other, *, out=None) + + Alias for :func:`torch.mul`. + """ + ... +def mv(input: Tensor, vec: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mv(input, vec, *, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`input` and the vector + :attr:`vec`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size :math:`m`, :attr:`out` will be 1-D of size :math:`n`. + + .. note:: This function does not :ref:`broadcast `. + + Args: + input (Tensor): matrix to be multiplied + vec (Tensor): vector to be multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.mv(mat, vec) + tensor([ 1.0404, -0.6361]) + """ + ... +def mvlgamma(input: Tensor, p: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mvlgamma(input, p, *, out=None) -> Tensor + + Alias for :func:`torch.special.multigammaln`. + """ + ... +def nan_to_num(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor + + Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input` + with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively. + By default, :literal:`NaN`\ s are replaced with zero, positive infinity is replaced with the + greatest finite value representable by :attr:`input`'s dtype, and negative infinity + is replaced with the least finite value representable by :attr:`input`'s dtype. + + Args: + input (Tensor): the input tensor. + nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero. + posinf (Number, optional): if a Number, the value to replace positive infinity values with. + If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype. + Default is None. + neginf (Number, optional): if a Number, the value to replace negative infinity values with. + If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype. + Default is None. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14]) + >>> torch.nan_to_num(x) + tensor([ 0.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0) + tensor([ 2.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0, posinf=1.0) + tensor([ 2.0000e+00, 1.0000e+00, -3.4028e+38, 3.1400e+00]) + """ + ... +def nan_to_num_(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None) -> Tensor: ... +def nanmean(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + nanmean(input, dim=None, keepdim=False, *, dtype=None, out=None) -> Tensor + + Computes the mean of all `non-NaN` elements along the specified dimensions. + + This function is identical to :func:`torch.mean` when there are no `NaN` values + in the :attr:`input` tensor. In the presence of `NaN`, :func:`torch.mean` will + propagate the `NaN` to the output whereas :func:`torch.nanmean` will ignore the + `NaN` values (`torch.nanmean(a)` is equivalent to `torch.mean(a[~a.isnan()])`). + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.mean` computes the mean value, propagating `NaN`. + + Example:: + + >>> x = torch.tensor([[torch.nan, 1, 2], [1, 2, 3]]) + >>> x.mean() + tensor(nan) + >>> x.nanmean() + tensor(1.8000) + >>> x.mean(dim=0) + tensor([ nan, 1.5000, 2.5000]) + >>> x.nanmean(dim=0) + tensor([1.0000, 1.5000, 2.5000]) + + # If all elements in the reduced dimensions are NaN then the result is NaN + >>> torch.tensor([torch.nan]).nanmean() + tensor(nan) + """ + ... +@overload +def nanmedian(input: Tensor) -> Tensor: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanmedian(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanmedian(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanquantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, + computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did + not exist. If all values in a reduced row are ``NaN`` then the quantiles for + that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) + """ + ... +@overload +def nanquantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, + computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did + not exist. If all values in a reduced row are ``NaN`` then the quantiles for + that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) + """ + ... +def nansum(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + nansum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements, treating Not a Numbers (NaNs) as zero. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.tensor([1., 2., float('nan'), 4.]) + >>> torch.nansum(a) + tensor(7.) + + .. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero. + If :attr:`dim` is a list of dimensions, reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> torch.nansum(torch.tensor([1., float("nan")])) + 1.0 + >>> a = torch.tensor([[1, 2], [3., float("nan")]]) + >>> torch.nansum(a) + tensor(6.) + >>> torch.nansum(a, dim=0) + tensor([4., 2.]) + >>> torch.nansum(a, dim=1) + tensor([3., 3.]) + """ + ... +@overload +def narrow(input: Tensor, dim: _int, start: Tensor, length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(input, dim, start, length) -> Tensor + + Returns a new tensor that is a narrowed version of :attr:`input` tensor. The + dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The + returned tensor and :attr:`input` tensor share the same underlying storage. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) + """ + ... +@overload +def narrow(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(input, dim, start, length) -> Tensor + + Returns a new tensor that is a narrowed version of :attr:`input` tensor. The + dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The + returned tensor and :attr:`input` tensor share the same underlying storage. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) + """ + ... +def narrow_copy(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: + r""" + narrow_copy(input, dim, start, length, *, out=None) -> Tensor + + Same as :meth:`Tensor.narrow` except this returns a copy rather + than shared storage. This is primarily for sparse tensors, which + do not have a shared-storage narrow method. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int): index of the element to start the narrowed dimension from. Can + be negative, which means indexing from the end of `dim` + length (int): length of the narrowed dimension, must be weakly positive + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow_copy(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow_copy(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2) + >>> torch.narrow_copy(s, 0, 0, 1) + tensor(indices=tensor([[0, 0], + [0, 1]]), + values=tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]), + size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo) + + .. seealso:: + + :func:`torch.narrow` for a non copy variant + """ + ... +def native_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +def native_channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ... +def native_dropout(input: Tensor, p: _float, train: Optional[_bool]) -> Tuple[Tensor, Tensor]: ... +def native_group_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], N: Union[_int, SymInt], C: Union[_int, SymInt], HxW: Union[_int, SymInt], group: _int, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def native_layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor], bias: Optional[Tensor], eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def native_norm(input: Tensor, p: Optional[Union[Number, _complex]], dim: Union[_int, _size], keepdim: _bool, dtype: Optional[_dtype]) -> Tensor: ... +@overload +def native_norm(input: Tensor, p: Union[Number, _complex] = 2) -> Tensor: ... +@overload +def ne(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ne(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \neq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) + """ + ... +@overload +def ne(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + ne(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \neq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) + """ + ... +def neg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + neg(input, *, out=None) -> Tensor + + Returns a new tensor with the negative of the elements of :attr:`input`. + + .. math:: + \text{out} = -1 \times \text{input} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.neg(a) + tensor([-0.0090, 0.2262, 0.0682, 0.2866, -0.3940]) + """ + ... +def neg_(input: Tensor) -> Tensor: ... +def negative(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + negative(input, *, out=None) -> Tensor + + Alias for :func:`torch.neg` + """ + ... +def negative_(input: Tensor) -> Tensor: ... +def nextafter(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + nextafter(input, other, *, out=None) -> Tensor + + Return the next floating-point value after :attr:`input` towards :attr:`other`, elementwise. + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> eps = torch.finfo(torch.float32).eps + >>> torch.nextafter(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 1.0])) == torch.tensor([eps + 1, 2 - eps]) + tensor([True, True]) + """ + ... +@overload +def nonzero(input: Tensor, *, as_tuple: Literal[False] = False, out: Optional[Tensor] = None) -> Tensor: + r""" + nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + + .. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + + **When** :attr:`as_tuple` **is** ``False`` **(default)**: + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + **When** :attr:`as_tuple` **is** ``True``: + + Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, + each containing the indices (in that dimension) of all non-zero elements of + :attr:`input` . + + If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` + tensors of size :math:`z`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + As a special case, when :attr:`input` has zero dimensions and a nonzero scalar + value, it is treated as a one-dimensional tensor with one element. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (LongTensor, optional): the output tensor containing indices + + Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. + + Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) + """ + ... +@overload +def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: + r""" + nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + + .. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + + **When** :attr:`as_tuple` **is** ``False`` **(default)**: + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + **When** :attr:`as_tuple` **is** ``True``: + + Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, + each containing the indices (in that dimension) of all non-zero elements of + :attr:`input` . + + If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` + tensors of size :math:`z`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + As a special case, when :attr:`input` has zero dimensions and a nonzero scalar + value, it is treated as a one-dimensional tensor with one element. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (LongTensor, optional): the output tensor containing indices + + Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. + + Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) + """ + ... +def nonzero_static(input: Tensor, *, size: _int, fill_value: _int = -1, out: Optional[Tensor] = None) -> Tensor: ... +def norm_except_dim(v: Tensor, pow: _int = 2, dim: _int = 0) -> Tensor: ... +@overload +def normal(mean: Tensor, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: Tensor, std: _float = 1, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: _float, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: _float, std: _float, size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def not_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + not_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ne`. + """ + ... +@overload +def not_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + not_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ne`. + """ + ... +@overload +def nuclear_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def nuclear_norm(input: Tensor, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +def numel(self: Tensor) -> _int: + r""" + numel(input) -> int + + Returns the total number of elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 2, 3, 4, 5) + >>> torch.numel(a) + 120 + >>> a = torch.zeros(4,4) + >>> torch.numel(a) + 16 + """ + ... +@overload +def ones(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +def ones_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the same size as + :attr:`input`. ``torch.ones_like(input)`` is equivalent to + ``torch.ones(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.ones_like(input, out=output)`` is equivalent to + ``torch.ones(input.size(), out=output)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword arguments: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> input = torch.empty(2, 3) + >>> torch.ones_like(input) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + """ + ... +def orgqr(input: Tensor, input2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + orgqr(input, tau) -> Tensor + + Alias for :func:`torch.linalg.householder_product`. + """ + ... +def ormqr(input: Tensor, input2: Tensor, input3: Tensor, left: _bool = True, transpose: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ormqr(input, tau, other, left=True, transpose=False, *, out=None) -> Tensor + + Computes the matrix-matrix multiplication of a product of Householder matrices with a general matrix. + + Multiplies a :math:`m \times n` matrix `C` (given by :attr:`other`) with a matrix `Q`, + where `Q` is represented using Householder reflectors `(input, tau)`. + See `Representation of Orthogonal or Unitary Matrices`_ for further details. + + If :attr:`left` is `True` then `op(Q)` times `C` is computed, otherwise the result is `C` times `op(Q)`. + When :attr:`left` is `True`, the implicit matrix `Q` has size :math:`m \times m`. + It has size :math:`n \times n` otherwise. + If :attr:`transpose` is `True` then `op` is the conjugate transpose operation, otherwise it's a no-op. + + Supports inputs of float, double, cfloat and cdouble dtypes. + Also supports batched inputs, and, if the input is batched, the output is batched with the same dimensions. + + .. seealso:: + :func:`torch.geqrf` can be used to form the Householder representation `(input, tau)` of matrix `Q` + from the QR decomposition. + + .. note:: + This function supports backward but it is only fast when ``(input, tau)`` do not require gradients + and/or ``tau.size(-1)`` is very small. + `` + + Args: + input (Tensor): tensor of shape `(*, mn, k)` where `*` is zero or more batch dimensions + and `mn` equals to `m` or `n` depending on the :attr:`left`. + tau (Tensor): tensor of shape `(*, min(mn, k))` where `*` is zero or more batch dimensions. + other (Tensor): tensor of shape `(*, m, n)` where `*` is zero or more batch dimensions. + left (bool): controls the order of multiplication. + transpose (bool): controls whether the matrix `Q` is conjugate transposed or not. + + Keyword args: + out (Tensor, optional): the output Tensor. Ignored if `None`. Default: `None`. + + .. _Representation of Orthogonal or Unitary Matrices: + https://www.netlib.org/lapack/lug/node128.html + """ + ... +def outer(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + outer(input, vec2, *, out=None) -> Tensor + + Outer product of :attr:`input` and :attr:`vec2`. + If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of + size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`. + + .. note:: This function does not :ref:`broadcast `. + + Args: + input (Tensor): 1-D input vector + vec2 (Tensor): 1-D input vector + + Keyword args: + out (Tensor, optional): optional output matrix + + Example:: + + >>> v1 = torch.arange(1., 5.) + >>> v2 = torch.arange(1., 4.) + >>> torch.outer(v1, v2) + tensor([[ 1., 2., 3.], + [ 2., 4., 6.], + [ 3., 6., 9.], + [ 4., 8., 12.]]) + """ + ... +def pairwise_distance(x1: Tensor, x2: Tensor, p: _float = 2, eps: _float = 1e-06, keepdim: _bool = False) -> Tensor: ... +def pdist(input: Tensor, p: _float = 2) -> Tensor: ... +def permute(input: Tensor, dims: _size) -> Tensor: + r""" + permute(input, dims) -> Tensor + + Returns a view of the original tensor :attr:`input` with its dimensions permuted. + + Args: + input (Tensor): the input tensor. + dims (tuple of int): The desired ordering of dimensions + + Example: + >>> x = torch.randn(2, 3, 5) + >>> x.size() + torch.Size([2, 3, 5]) + >>> torch.permute(x, (2, 0, 1)).size() + torch.Size([5, 2, 3]) + """ + ... +def permute_copy(input: Tensor, dims: _size, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.permute`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def pinverse(input: Tensor, rcond: _float = 1e-15) -> Tensor: + r""" + pinverse(input, rcond=1e-15) -> Tensor + + Alias for :func:`torch.linalg.pinv` + """ + ... +def pixel_shuffle(input: Tensor, upscale_factor: _int) -> Tensor: ... +def pixel_unshuffle(input: Tensor, downscale_factor: _int) -> Tensor: ... +def poisson(input: Tensor, generator: Optional[Generator] = None) -> Tensor: + r""" + poisson(input, generator=None) -> Tensor + + Returns a tensor of the same size as :attr:`input` with each element + sampled from a Poisson distribution with rate parameter given by the corresponding + element in :attr:`input` i.e., + + .. math:: + \text{out}_i \sim \text{Poisson}(\text{input}_i) + + :attr:`input` must be non-negative. + + Args: + input (Tensor): the input tensor containing the rates of the Poisson distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + + Example:: + + >>> rates = torch.rand(4, 4) * 5 # rate parameter between 0 and 5 + >>> torch.poisson(rates) + tensor([[9., 1., 3., 5.], + [8., 6., 6., 0.], + [0., 4., 5., 3.], + [2., 1., 4., 2.]]) + """ + ... +def poisson_nll_loss(input: Tensor, target: Tensor, log_input: _bool, full: _bool, eps: _float, reduction: _int) -> Tensor: ... +def polar(abs: Tensor, angle: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + polar(abs, angle, *, out=None) -> Tensor + + Constructs a complex tensor whose elements are Cartesian coordinates + corresponding to the polar coordinates with absolute value :attr:`abs` and angle + :attr:`angle`. + + .. math:: + \text{out} = \text{abs} \cdot \cos(\text{angle}) + \text{abs} \cdot \sin(\text{angle}) \cdot j + + .. note:: + `torch.polar` is similar to + `std::polar `_ + and does not compute the polar decomposition + of a complex tensor like Python's `cmath.polar` and SciPy's `linalg.polar` do. + The behavior of this function is undefined if `abs` is negative or NaN, or if `angle` is + infinite. + + + Args: + abs (Tensor): The absolute value the complex tensor. Must be float or double. + angle (Tensor): The angle of the complex tensor. Must be same dtype as + :attr:`abs`. + + Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + + Example:: + + >>> import numpy as np + >>> abs = torch.tensor([1, 2], dtype=torch.float64) + >>> angle = torch.tensor([np.pi / 2, 5 * np.pi / 4], dtype=torch.float64) + >>> z = torch.polar(abs, angle) + >>> z + tensor([(0.0000+1.0000j), (-1.4142-1.4142j)], dtype=torch.complex128) + """ + ... +def polygamma(n: _int, input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + polygamma(n, input, *, out=None) -> Tensor + + Alias for :func:`torch.special.polygamma`. + """ + ... +def positive(input: Tensor) -> Tensor: + r""" + positive(input) -> Tensor + + Returns :attr:`input`. + Throws a runtime error if :attr:`input` is a bool tensor. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> t = torch.randn(5) + >>> t + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.positive(t) + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + """ + ... +@overload +def pow(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +@overload +def pow(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +@overload +def pow(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +def prelu(input: Tensor, weight: Tensor) -> Tensor: ... +@overload +def prod(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +@overload +def prod(input: Tensor, dim: _int, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +@overload +def prod(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +def promote_types(type1: _dtype, type2: _dtype) -> _dtype: + r""" + promote_types(type1, type2) -> dtype + + Returns the :class:`torch.dtype` with the smallest size and scalar kind that is + not smaller nor of lower kind than either `type1` or `type2`. See type promotion + :ref:`documentation ` for more information on the type + promotion logic. + + Args: + type1 (:class:`torch.dtype`) + type2 (:class:`torch.dtype`) + + Example:: + + >>> torch.promote_types(torch.int32, torch.float32) + torch.float32 + >>> torch.promote_types(torch.uint8, torch.long) + torch.long + """ + ... +def put(input: Tensor, index: Tensor, source: Tensor, accumulate: _bool = False) -> Tensor: ... +def q_per_channel_axis(input: Tensor) -> _int: ... +def q_per_channel_scales(input: Tensor) -> Tensor: ... +def q_per_channel_zero_points(input: Tensor) -> Tensor: ... +def q_scale(input: Tensor) -> _float: ... +def q_zero_point(input: Tensor) -> _int: ... +def qr(input: Tensor, some: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.qr: + r""" + qr(input, some=True, *, out=None) -> (Tensor, Tensor) + + Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, + and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` + with :math:`Q` being an orthogonal matrix or batch of orthogonal matrices and + :math:`R` being an upper triangular matrix or batch of upper triangular matrices. + + If :attr:`some` is ``True``, then this function returns the thin (reduced) QR factorization. + Otherwise, if :attr:`some` is ``False``, this function returns the complete QR factorization. + + .. warning:: + + :func:`torch.qr` is deprecated in favor of :func:`torch.linalg.qr` + and will be removed in a future PyTorch release. The boolean parameter :attr:`some` has been + replaced with a string parameter :attr:`mode`. + + ``Q, R = torch.qr(A)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A) + + ``Q, R = torch.qr(A, some=False)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A, mode="complete") + + .. warning:: + If you plan to backpropagate through QR, note that the current backward implementation + is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))` + columns of :attr:`input` are linearly independent. + This behavior will probably change once QR supports pivoting. + + .. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs, + and may produce different (valid) decompositions on different device types + or different platforms. + + Args: + input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more + batch dimensions consisting of matrices of dimension :math:`m \times n`. + some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for + complete QR decomposition. If `k = min(m, n)` then: + + * ``some=True`` : returns `(Q, R)` with dimensions (m, k), (k, n) (default) + + * ``'some=False'``: returns `(Q, R)` with dimensions (m, m), (m, n) + + Keyword args: + out (tuple, optional): tuple of `Q` and `R` tensors. + The dimensions of `Q` and `R` are detailed in the description of :attr:`some` above. + + Example:: + + >>> a = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) + >>> q, r = torch.qr(a) + >>> q + tensor([[-0.8571, 0.3943, 0.3314], + [-0.4286, -0.9029, -0.0343], + [ 0.2857, -0.1714, 0.9429]]) + >>> r + tensor([[ -14.0000, -21.0000, 14.0000], + [ 0.0000, -175.0000, 70.0000], + [ 0.0000, 0.0000, -35.0000]]) + >>> torch.mm(q, r).round() + tensor([[ 12., -51., 4.], + [ 6., 167., -68.], + [ -4., 24., -41.]]) + >>> torch.mm(q.t(), q).round() + tensor([[ 1., 0., 0.], + [ 0., 1., -0.], + [ 0., -0., 1.]]) + >>> a = torch.randn(3, 4, 5) + >>> q, r = torch.qr(a, some=False) + >>> torch.allclose(torch.matmul(q, r), a) + True + >>> torch.allclose(torch.matmul(q.mT, q), torch.eye(5)) + True + """ + ... +@overload +def quantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + + To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location + of the quantile in the sorted input. If the quantile lies between two data points ``a < b`` with + indices ``i`` and ``j`` in the sorted order, result is computed according to the given + :attr:`interpolation` method as follows: + + - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. + - ``lower``: ``a``. + - ``higher``: ``b``. + - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). + - ``midpoint``: ``(a + b) / 2``. + + If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size + equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + + .. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) + """ + ... +@overload +def quantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + + To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location + of the quantile in the sorted input. If the quantile lies between two data points ``a < b`` with + indices ``i`` and ``j`` in the sorted order, result is computed according to the given + :attr:`interpolation` method as follows: + + - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. + - ``lower``: ``a``. + - ``higher``: ``b``. + - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). + - ``midpoint``: ``(a + b) / 2``. + + If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size + equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + + .. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) + """ + ... +def quantize_per_channel(input: Tensor, scales: Tensor, zero_points: Tensor, axis: _int, dtype: _dtype) -> Tensor: + r""" + quantize_per_channel(input, scales, zero_points, axis, dtype) -> Tensor + + Converts a float tensor to a per-channel quantized tensor with given scales and zero points. + + Arguments: + input (Tensor): float tensor to quantize + scales (Tensor): float 1D tensor of scales to use, size should match ``input.size(axis)`` + zero_points (int): integer 1D tensor of offset to use, size should match ``input.size(axis)`` + axis (int): dimension on which apply per-channel quantization + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor + + Example:: + + >>> x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]]) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8) + tensor([[-1., 0.], + [ 1., 2.]], size=(2, 2), dtype=torch.quint8, + quantization_scheme=torch.per_channel_affine, + scale=tensor([0.1000, 0.0100], dtype=torch.float64), + zero_point=tensor([10, 0]), axis=0) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8).int_repr() + tensor([[ 0, 10], + [100, 200]], dtype=torch.uint8) + """ + ... +@overload +def quantize_per_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, dtype: _dtype) -> Tensor: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +@overload +def quantize_per_tensor(input: Tensor, scale: _float, zero_point: _int, dtype: _dtype) -> Tensor: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +@overload +def quantize_per_tensor(tensors: Union[Tuple[Tensor, ...], List[Tensor]], scales: Tensor, zero_points: Tensor, dtype: _dtype) -> Tuple[Tensor, ...]: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +def quantize_per_tensor_dynamic(input: Tensor, dtype: _dtype, reduce_range: _bool) -> Tensor: + r""" + quantize_per_tensor_dynamic(input, dtype, reduce_range) -> Tensor + + Converts a float tensor to a quantized tensor with scale and zero_point calculated + dynamically based on the input. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8`` + reduce_range (bool): a flag to indicate whether to reduce the range of quantized + data by 1 bit, it's required to avoid instruction overflow for some hardwares + + Returns: + Tensor: A newly (dynamically) quantized tensor + + Example:: + + >>> t = torch.quantize_per_tensor_dynamic(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.quint8, False) + >>> print(t) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.011764705882352941, + zero_point=85) + >>> t.int_repr() + tensor([ 0, 85, 170, 255], dtype=torch.uint8) + """ + ... +def quantized_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, var: Tensor, eps: _float, output_scale: _float, output_zero_point: _int) -> Tensor: + r""" + quantized_batch_norm(input, weight=None, bias=None, mean, var, eps, output_scale, output_zero_point) -> Tensor + + Applies batch normalization on a 4D (NCHW) quantized tensor. + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + Arguments: + input (Tensor): quantized tensor + weight (Tensor): float tensor that corresponds to the gamma, size C + bias (Tensor): float tensor that corresponds to the beta, size C + mean (Tensor): float mean value in batch normalization, size C + var (Tensor): float tensor for variance, size C + eps (float): a value added to the denominator for numerical stability. + output_scale (float): output quantized tensor scale + output_zero_point (int): output quantized tensor zero_point + + Returns: + Tensor: A quantized tensor with batch normalization applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_batch_norm(qx, torch.ones(2), torch.zeros(2), torch.rand(2), torch.rand(2), 0.00001, 0.2, 2) + tensor([[[[-0.2000, -0.2000], + [ 1.6000, -0.2000]], + + [[-0.4000, -0.4000], + [-0.4000, 0.6000]]], + + + [[[-0.2000, -0.2000], + [-0.2000, -0.2000]], + + [[ 0.6000, -0.4000], + [ 0.6000, -0.4000]]]], size=(2, 2, 2, 2), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=2) + """ + ... +def quantized_gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def quantized_lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tuple[Tensor, Tensor]: ... +def quantized_max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: + r""" + quantized_max_pool1d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor + + Applies a 1D max pooling over an input quantized tensor composed of several input planes. + + Arguments: + input (Tensor): quantized tensor + kernel_size (list of int): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + + Returns: + Tensor: A quantized tensor with max_pool1d applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool1d(qx, [2]) + tensor([[0.0000], + [1.5000]], size=(2, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) + """ + ... +def quantized_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: + r""" + quantized_max_pool2d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor + + Applies a 2D max pooling over an input quantized tensor composed of several input planes. + + Arguments: + input (Tensor): quantized tensor + kernel_size (``list of int``): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + + Returns: + Tensor: A quantized tensor with max_pool2d applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool2d(qx, [2,2]) + tensor([[[[1.5000]], + + [[1.5000]]], + + + [[[0.0000]], + + [[0.0000]]]], size=(2, 2, 1, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) + """ + ... +def quantized_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def quantized_rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def quantized_rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def rad2deg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + rad2deg(input, *, out=None) -> Tensor + + Returns a new tensor with each of the elements of :attr:`input` + converted from angles in radians to degrees. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]]) + >>> torch.rad2deg(a) + tensor([[ 180.0233, -180.0233], + [ 359.9894, -359.9894], + [ 89.9544, -89.9544]]) + """ + ... +def rad2deg_(input: Tensor) -> Tensor: ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +def rand_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` that is filled with + random numbers from a uniform distribution on the interval :math:`[0, 1)`. + ``torch.rand_like(input)`` is equivalent to + ``torch.rand(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randint(low: _int, high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint_like(input: Tensor, high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same shape as Tensor :attr:`input` filled with + random integers generated uniformly between :attr:`low` (inclusive) and + :attr:`high` (exclusive). + + .. note: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randint_like(input: Tensor, low: Union[_int, SymInt], high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same shape as Tensor :attr:`input` filled with + random integers generated uniformly between :attr:`low` (inclusive) and + :attr:`high` (exclusive). + + .. note: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +def randn_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` that is filled with + random numbers from a normal distribution with mean 0 and variance 1. Please refer to :func:`torch.randn` for the + sampling process of complex dtypes. ``torch.randn_like(input)`` is equivalent to + ``torch.randn(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randperm(n: Union[_int, SymInt], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a random permutation of integers from ``0`` to ``n - 1``. + + Args: + n (int): the upper bound (exclusive) + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) + """ + ... +@overload +def randperm(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a random permutation of integers from ``0`` to ``n - 1``. + + Args: + n (int): the upper bound (exclusive) + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) + """ + ... +def range(start: Number, end: Number, step: Number = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` + with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is + the gap between two values in the tensor. + + .. math:: + \text{out}_{i+1} = \text{out}_i + \text{step}. + + .. warning:: + This function is deprecated and will be removed in a future release because its behavior is inconsistent with + Python's range builtin. Instead, use :func:`torch.arange`, which produces values in [start, end). + + Args: + start (float): the starting value for the set of points. Default: ``0``. + end (float): the ending value for the set of points + step (float): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.range(1, 4) + tensor([ 1., 2., 3., 4.]) + >>> torch.range(1, 4, 0.5) + tensor([ 1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000, 4.0000]) + """ + ... +def ravel(input: Tensor) -> Tensor: + r""" + ravel(input) -> Tensor + + Return a contiguous flattened tensor. A copy is made only if needed. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.ravel(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + """ + ... +def real(input: Tensor) -> Tensor: + r""" + real(input) -> Tensor + + Returns a new tensor containing real values of the :attr:`self` tensor. + The returned tensor and :attr:`self` share the same underlying storage. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.real + tensor([ 0.3100, -0.5445, -1.6492, -0.0638]) + """ + ... +def reciprocal(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + reciprocal(input, *, out=None) -> Tensor + + Returns a new tensor with the reciprocal of the elements of :attr:`input` + + .. math:: + \text{out}_{i} = \frac{1}{\text{input}_{i}} + + .. note:: + Unlike NumPy's reciprocal, torch.reciprocal supports integral inputs. Integral + inputs to reciprocal are automatically :ref:`promoted ` to + the default scalar type. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.4595, -2.1219, -1.4314, 0.7298]) + >>> torch.reciprocal(a) + tensor([-2.1763, -0.4713, -0.6986, 1.3702]) + """ + ... +def reciprocal_(input: Tensor) -> Tensor: ... +def relu(input: Tensor) -> Tensor: ... +def relu_(input: Tensor) -> Tensor: ... +@overload +def remainder(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +@overload +def remainder(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +@overload +def remainder(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +def renorm(input: Tensor, p: Union[Number, _complex], dim: _int, maxnorm: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + renorm(input, p, dim, maxnorm, *, out=None) -> Tensor + + Returns a tensor where each sub-tensor of :attr:`input` along dimension + :attr:`dim` is normalized such that the `p`-norm of the sub-tensor is lower + than the value :attr:`maxnorm` + + .. note:: If the norm of a row is lower than `maxnorm`, the row is unchanged + + Args: + input (Tensor): the input tensor. + p (float): the power for the norm computation + dim (int): the dimension to slice over to get the sub-tensors + maxnorm (float): the maximum norm to keep each sub-tensor under + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.ones(3, 3) + >>> x[1].fill_(2) + tensor([ 2., 2., 2.]) + >>> x[2].fill_(3) + tensor([ 3., 3., 3.]) + >>> x + tensor([[ 1., 1., 1.], + [ 2., 2., 2.], + [ 3., 3., 3.]]) + >>> torch.renorm(x, 1, 0, 5) + tensor([[ 1.0000, 1.0000, 1.0000], + [ 1.6667, 1.6667, 1.6667], + [ 1.6667, 1.6667, 1.6667]]) + """ + ... +@overload +def repeat_interleave(input: Tensor, repeats: Tensor, dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +@overload +def repeat_interleave(repeats: Tensor, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +@overload +def repeat_interleave(input: Tensor, repeats: Union[_int, SymInt], dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +def reshape(input: Tensor, shape: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + reshape(input, shape) -> Tensor + + Returns a tensor with the same data and number of elements as :attr:`input`, + but with the specified shape. When possible, the returned tensor will be a view + of :attr:`input`. Otherwise, it will be a copy. Contiguous inputs and inputs + with compatible strides can be reshaped without copying, but you should not + depend on the copying vs. viewing behavior. + + See :meth:`torch.Tensor.view` on when it is possible to return a view. + + A single dimension may be -1, in which case it's inferred from the remaining + dimensions and the number of elements in :attr:`input`. + + Args: + input (Tensor): the tensor to be reshaped + shape (tuple of int): the new shape + + Example:: + + >>> a = torch.arange(4.) + >>> torch.reshape(a, (2, 2)) + tensor([[ 0., 1.], + [ 2., 3.]]) + >>> b = torch.tensor([[0, 1], [2, 3]]) + >>> torch.reshape(b, (-1,)) + tensor([ 0, 1, 2, 3]) + """ + ... +def resize_as_(input: Tensor, the_template: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: ... +def resize_as_sparse_(input: Tensor, the_template: Tensor) -> Tensor: ... +def resolve_conj(input: Tensor) -> Tensor: + r""" + resolve_conj(input) -> Tensor + + Returns a new tensor with materialized conjugation if :attr:`input`'s conjugate bit is set to `True`, + else returns :attr:`input`. The output tensor will always have its conjugate bit set to `False`. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> y.is_conj() + True + >>> z = y.resolve_conj() + >>> z + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + >>> z.is_conj() + False + """ + ... +def resolve_neg(input: Tensor) -> Tensor: + r""" + resolve_neg(input) -> Tensor + + Returns a new tensor with materialized negation if :attr:`input`'s negative bit is set to `True`, + else returns :attr:`input`. The output tensor will always have its negative bit set to `False`. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> z = y.imag + >>> z.is_neg() + True + >>> out = z.resolve_neg() + >>> out + tensor([-1., -2., 3.]) + >>> out.is_neg() + False + """ + ... +@overload +def result_type(tensor: Tensor, other: Tensor) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(scalar: Union[Number, _complex], tensor: Tensor) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(tensor: Tensor, other: Union[Number, _complex]) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(scalar1: Union[Number, _complex], scalar2: Union[Number, _complex]) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def rnn_relu(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def rnn_relu(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +@overload +def rnn_tanh(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def rnn_tanh(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +def roll(input: Tensor, shifts: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], dims: Union[_int, _size] = ()) -> Tensor: + r""" + roll(input, shifts, dims=None) -> Tensor + + Roll the tensor :attr:`input` along the given dimension(s). Elements that are + shifted beyond the last position are re-introduced at the first position. If + :attr:`dims` is `None`, the tensor will be flattened before rolling and then + restored to the original shape. + + Args: + input (Tensor): the input tensor. + shifts (int or tuple of ints): The number of places by which the elements + of the tensor are shifted. If shifts is a tuple, dims must be a tuple of + the same size, and each dimension will be rolled by the corresponding + value + dims (int or tuple of ints): Axis along which to roll + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2) + >>> x + tensor([[1, 2], + [3, 4], + [5, 6], + [7, 8]]) + >>> torch.roll(x, 1) + tensor([[8, 1], + [2, 3], + [4, 5], + [6, 7]]) + >>> torch.roll(x, 1, 0) + tensor([[7, 8], + [1, 2], + [3, 4], + [5, 6]]) + >>> torch.roll(x, -1, 0) + tensor([[3, 4], + [5, 6], + [7, 8], + [1, 2]]) + >>> torch.roll(x, shifts=(2, 1), dims=(0, 1)) + tensor([[6, 5], + [8, 7], + [2, 1], + [4, 3]]) + """ + ... +def rot90(input: Tensor, k: _int = 1, dims: _size = (0,1)) -> Tensor: + r""" + rot90(input, k=1, dims=[0,1]) -> Tensor + + Rotate an n-D tensor by 90 degrees in the plane specified by dims axis. + Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0. + + Args: + input (Tensor): the input tensor. + k (int): number of times to rotate. Default value is 1 + dims (a list or tuple): axis to rotate. Default value is [0, 1] + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.rot90(x, 1, [0, 1]) + tensor([[1, 3], + [0, 2]]) + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.rot90(x, 1, [1, 2]) + tensor([[[1, 3], + [0, 2]], + + [[5, 7], + [4, 6]]]) + """ + ... +@overload +def round(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + round(input, *, decimals=0, out=None) -> Tensor + + Rounds elements of :attr:`input` to the nearest integer. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + The return type of output is same as that of input's dtype. + + .. note:: + This function implements the "round half to even" to + break ties when a number is equidistant from two + integers (e.g. `round(2.5)` is 2). + + When the :attr:\`decimals\` argument is specified the + algorithm used is similar to NumPy's `around`. This + algorithm is fast but inexact and it can easily + overflow for low precision dtypes. + Eg. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`. + + .. seealso:: + :func:`torch.ceil`, which rounds up. + :func:`torch.floor`, which rounds down. + :func:`torch.trunc`, which rounds towards zero. + + Args: + input (Tensor): the input tensor. + decimals (int): Number of decimal places to round to (default: 0). + If decimals is negative, it specifies the number of positions + to the left of the decimal point. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7))) + tensor([ 5., -2., 9., -8.]) + + >>> # Values equidistant from two integers are rounded towards the + >>> # the nearest even value (zero is treated as even) + >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5])) + tensor([-0., 0., 2., 2.]) + + >>> # A positive decimals argument rounds to the to that decimal place + >>> torch.round(torch.tensor([0.1234567]), decimals=3) + tensor([0.1230]) + + >>> # A negative decimals argument rounds to the left of the decimal + >>> torch.round(torch.tensor([1200.1234567]), decimals=-3) + tensor([1000.]) + """ + ... +@overload +def round(input: Tensor, *, decimals: _int, out: Optional[Tensor] = None) -> Tensor: + r""" + round(input, *, decimals=0, out=None) -> Tensor + + Rounds elements of :attr:`input` to the nearest integer. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + The return type of output is same as that of input's dtype. + + .. note:: + This function implements the "round half to even" to + break ties when a number is equidistant from two + integers (e.g. `round(2.5)` is 2). + + When the :attr:\`decimals\` argument is specified the + algorithm used is similar to NumPy's `around`. This + algorithm is fast but inexact and it can easily + overflow for low precision dtypes. + Eg. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`. + + .. seealso:: + :func:`torch.ceil`, which rounds up. + :func:`torch.floor`, which rounds down. + :func:`torch.trunc`, which rounds towards zero. + + Args: + input (Tensor): the input tensor. + decimals (int): Number of decimal places to round to (default: 0). + If decimals is negative, it specifies the number of positions + to the left of the decimal point. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7))) + tensor([ 5., -2., 9., -8.]) + + >>> # Values equidistant from two integers are rounded towards the + >>> # the nearest even value (zero is treated as even) + >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5])) + tensor([-0., 0., 2., 2.]) + + >>> # A positive decimals argument rounds to the to that decimal place + >>> torch.round(torch.tensor([0.1234567]), decimals=3) + tensor([0.1230]) + + >>> # A negative decimals argument rounds to the left of the decimal + >>> torch.round(torch.tensor([1200.1234567]), decimals=-3) + tensor([1000.]) + """ + ... +@overload +def round_(input: Tensor) -> Tensor: ... +@overload +def round_(input: Tensor, *, decimals: _int) -> Tensor: ... +def row_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def row_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + row_stack(tensors, *, out=None) -> Tensor + + Alias of :func:`torch.vstack`. + """ + ... +def rrelu(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ... +def rrelu_(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ... +def rsqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + rsqrt(input, *, out=None) -> Tensor + + Returns a new tensor with the reciprocal of the square-root of each of + the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \frac{1}{\sqrt{\text{input}_{i}}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.0370, 0.2970, 1.5420, -0.9105]) + >>> torch.rsqrt(a) + tensor([ nan, 1.8351, 0.8053, nan]) + """ + ... +def rsqrt_(input: Tensor) -> Tensor: ... +@overload +def rsub(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def rsub(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number = 1, alpha: Number = 1, out: Optional[Tensor] = None) -> Tensor: ... +def scalar_tensor(s: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, reduce: str, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, reduce: str, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter_add(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter_add(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... +@overload +def scatter_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_add(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... +def scatter_reduce(input: Tensor, dim: _int, index: Tensor, src: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` + """ + ... +@overload +def searchsorted(sorted_sequence: Tensor, input: Tensor, *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + + Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the + corresponding values in :attr:`values` were inserted before the indices, when sorted, the order + of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. + Return a new tensor with the same size as :attr:`values`. More formally, + the returned index satisfies the following rules: + + .. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + + Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. + out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. + sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + + Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) + """ + ... +@overload +def searchsorted(sorted_sequence: Tensor, self: Union[Number, _complex], *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + + Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the + corresponding values in :attr:`values` were inserted before the indices, when sorted, the order + of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. + Return a new tensor with the same size as :attr:`values`. More formally, + the returned index satisfies the following rules: + + .. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + + Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. + out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. + sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + + Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) + """ + ... +def segment_reduce(data: Tensor, reduce: str, *, lengths: Optional[Tensor] = None, indices: Optional[Tensor] = None, offsets: Optional[Tensor] = None, axis: _int = 0, unsafe: _bool = False, initial: Optional[Union[Number, _complex]] = None) -> Tensor: ... +@overload +def select(input: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select(input, dim, index) -> Tensor + + Slices the :attr:`input` tensor along the selected dimension at the given index. + This function returns a view of the original tensor with the given dimension removed. + + .. note:: If :attr:`input` is a sparse tensor and returning a view of + the tensor is not possible, a RuntimeError exception is + raised. In this is the case, consider using + :func:`torch.select_copy` function. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to slice + index (int): the index to select with + + .. note:: + + :meth:`select` is equivalent to slicing. For example, + ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and + ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``. + """ + ... +@overload +def select(input: Tensor, dim: Union[str, ellipsis, None], index: _int) -> Tensor: + r""" + select(input, dim, index) -> Tensor + + Slices the :attr:`input` tensor along the selected dimension at the given index. + This function returns a view of the original tensor with the given dimension removed. + + .. note:: If :attr:`input` is a sparse tensor and returning a view of + the tensor is not possible, a RuntimeError exception is + raised. In this is the case, consider using + :func:`torch.select_copy` function. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to slice + index (int): the index to select with + + .. note:: + + :meth:`select` is equivalent to slicing. For example, + ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and + ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``. + """ + ... +def select_copy(input: Tensor, dim: _int, index: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.select`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def select_scatter(input: Tensor, src: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select_scatter(input, src, dim, index) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` at the given index. + This function returns a tensor with fresh storage; it does not create a view. + + + Args: + input (Tensor): the input tensor. + src (Tensor): The tensor to embed into :attr:`input` + dim (int): the dimension to insert the slice into. + index (int): the index to select with + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.select(input, dim, index)`` + + Example:: + + >>> a = torch.zeros(2, 2) + >>> b = torch.ones(2) + >>> a.select_scatter(b, 0, 0) + tensor([[1., 1.], + [0., 0.]]) + """ + ... +def selu(input: Tensor) -> Tensor: ... +def selu_(input: Tensor) -> Tensor: ... +def set_flush_denormal(mode: _bool) -> _bool: + r""" + set_flush_denormal(mode) -> bool + + Disables denormal floating numbers on CPU. + + Returns ``True`` if your system supports flushing denormal numbers and it + successfully configures flush denormal mode. :meth:`~torch.set_flush_denormal` + is supported on x86 architectures supporting SSE3 and AArch64 architecture. + + Args: + mode (bool): Controls whether to enable flush denormal mode or not + + Example:: + + >>> torch.set_flush_denormal(True) + True + >>> torch.tensor([1e-323], dtype=torch.float64) + tensor([ 0.], dtype=torch.float64) + >>> torch.set_flush_denormal(False) + True + >>> torch.tensor([1e-323], dtype=torch.float64) + tensor(9.88131e-324 * + [ 1.0000], dtype=torch.float64) + """ + ... +def set_num_interop_threads(num: _int) -> None: + r""" + set_num_interop_threads(int) + + Sets the number of threads used for interop parallelism + (e.g. in JIT interpreter) on CPU. + + .. warning:: + Can only be called once and before any inter-op parallel work + is started (e.g. JIT execution). + """ + ... +def set_num_threads(num: _int) -> None: + r""" + set_num_threads(int) + + Sets the number of threads used for intraop parallelism on CPU. + + .. warning:: + To ensure that the correct number of threads is used, set_num_threads + must be called before running eager, JIT or autograd code. + """ + ... +def sgn(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sgn(input, *, out=None) -> Tensor + + This function is an extension of torch.sign() to complex tensors. + It computes a new tensor whose elements have + the same angles as the corresponding elements of :attr:`input` and + absolute values (i.e. magnitudes) of one for complex tensors and + is equivalent to torch.sign() for non-complex tensors. + + .. math:: + \text{out}_{i} = \begin{cases} + 0 & |\text{{input}}_i| == 0 \\ + \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|} & \text{otherwise} + \end{cases} + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> t.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) + """ + ... +def sigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sigmoid(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.expit`. + """ + ... +def sigmoid_(input: Tensor) -> Tensor: ... +def sign(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sign(input, *, out=None) -> Tensor + + Returns a new tensor with the signs of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \operatorname{sgn}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> a + tensor([ 0.7000, -1.2000, 0.0000, 2.3000]) + >>> torch.sign(a) + tensor([ 1., -1., 0., 1.]) + """ + ... +def signbit(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + signbit(input, *, out=None) -> Tensor + + Tests if each element of :attr:`input` has its sign bit set or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> torch.signbit(a) + tensor([ False, True, False, False]) + >>> a = torch.tensor([-0.0, 0.0]) + >>> torch.signbit(a) + tensor([ True, False]) + + .. note:: + signbit handles signed zeros, so negative zero (-0) returns True. + """ + ... +def sin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sin(input, *, out=None) -> Tensor + + Returns a new tensor with the sine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sin(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5461, 0.1347, -2.7266, -0.2746]) + >>> torch.sin(a) + tensor([-0.5194, 0.1343, -0.4032, -0.2711]) + """ + ... +def sin_(input: Tensor) -> Tensor: ... +def sinc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sinc(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.sinc`. + """ + ... +def sinc_(input: Tensor) -> Tensor: ... +def sinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sinh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic sine of the elements of + :attr:`input`. + + .. math:: + \text{out}_{i} = \sinh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.5380, -0.8632, -0.1265, 0.9399]) + >>> torch.sinh(a) + tensor([ 0.5644, -0.9744, -0.1268, 1.0845]) + + .. note:: + When :attr:`input` is on the CPU, the implementation of torch.sinh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. + """ + ... +def sinh_(input: Tensor) -> Tensor: ... +def slice_copy(input: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.slice`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def slice_inverse(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1) -> Tensor: ... +def slice_scatter(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + slice_scatter(input, src, dim=0, start=None, end=None, step=1) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` at the given + dimension. + This function returns a tensor with fresh storage; it does not create a view. + + + Args: + input (Tensor): the input tensor. + src (Tensor): The tensor to embed into :attr:`input` + dim (int): the dimension to insert the slice into + start (Optional[int]): the start index of where to insert the slice + end (Optional[int]): the end index of where to insert the slice + step (int): the how many elements to skip in + + Example:: + + >>> a = torch.zeros(8, 8) + >>> b = torch.ones(2, 8) + >>> a.slice_scatter(b, start=6) + tensor([[0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [1., 1., 1., 1., 1., 1., 1., 1.], + [1., 1., 1., 1., 1., 1., 1., 1.]]) + + >>> b = torch.ones(8, 2) + >>> a.slice_scatter(b, dim=1, start=2, end=6, step=2) + tensor([[0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.]]) + """ + ... +def slogdet(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.slogdet: + r""" + slogdet(input) -> (Tensor, Tensor) + + Alias for :func:`torch.linalg.slogdet` + """ + ... +def smm(input: Tensor, mat2: Tensor) -> Tensor: + r""" + smm(input, mat) -> Tensor + + Performs a matrix multiplication of the sparse matrix :attr:`input` + with the dense matrix :attr:`mat`. + + Args: + input (Tensor): a sparse matrix to be matrix multiplied + mat (Tensor): a dense matrix to be matrix multiplied + """ + ... +@overload +def softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + softmax(input, dim, *, dtype=None) -> Tensor + + Alias for :func:`torch.nn.functional.softmax`. + """ + ... +@overload +def softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + softmax(input, dim, *, dtype=None) -> Tensor + + Alias for :func:`torch.nn.functional.softmax`. + """ + ... +@overload +def sort(input: Tensor, *, stable: Optional[_bool], dim: _int = -1, descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, dim: _int = -1, descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, *, stable: Optional[_bool], dim: Union[str, ellipsis, None], descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +def sparse_bsc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse + Column)) ` with specified 2-dimensional blocks at the + given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix + multiplication operations in BSC format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncolblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + index in values and row_indices depending on where the given + column starts. Each successive number in the tensor subtracted + by the number before it denotes the number of elements in a + given column. + row_indices (array_like): Row block co-ordinates of each block in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial blocks for the tensor. Can be a list, + tuple, NumPy ``ndarray``, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> ccol_indices = [0, 1, 2] + >>> row_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 1, 2]), + row_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsc) + """ + ... +def sparse_bsr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_bsr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)) + ` with specified 2-dimensional blocks at the given + :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix + multiplication operations in BSR format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrowblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + block index in values and col_indices depending on where the + given row block starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + blocks in a given row. + col_indices (array_like): Column block co-ordinates of each block + in values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize == + values.shape[1:3]``. If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> crow_indices = [0, 1, 2] + >>> col_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 1, 2]), + col_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsr) + """ + ... +def sparse_compressed_tensor(compressed_indices: Union[Tensor, List], plain_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, *, dtype=None, layout=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR, + CSC, BSR, or BSC - ` with specified values at + the given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse + matrix multiplication operations in Compressed Sparse format are + typically faster than that for sparse tensors in COO format. Make you + have a look at :ref:`the note on the data type of the indices + `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + compressed_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, compressed_dim_size + 1)``. The last element of + each batch is the number of non-zero elements or blocks. This + tensor encodes the index in ``values`` and ``plain_indices`` + depending on where the given compressed dimension (row or + column) starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements or blocks in a given compressed dimension. + plain_indices (array_like): Plain dimension (column or row) + co-ordinates of each element or block in values. (B+1)-dimensional + tensor with the same length as values. + + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. that + represents a (1+K)-dimensional (for CSR and CSC layouts) or + (1+2+K)-dimensional tensor (for BSR and BSC layouts) where + ``K`` is the number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize[0] == + blocksize[1] == 1`` for CSR and CSC formats. If not provided, + the size will be inferred as the minimum size big enough to + hold all non-zero elements or blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + layout (:class:`torch.layout`, required): the desired layout of + returned tensor: :attr:`torch.sparse_csr`, + :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or + :attr:`torch.sparse_bsc`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> compressed_indices = [0, 2, 4] + >>> plain_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64), + ... torch.tensor(plain_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) + """ + ... +def sparse_coo_tensor(indices: Tensor, values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None, is_coalesced: Optional[_bool] = None) -> Tensor: + r""" + sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None, is_coalesced=None) -> Tensor + + Constructs a :ref:`sparse tensor in COO(rdinate) format + ` with specified values at the given + :attr:`indices`. + + .. note:: + + This function returns an :ref:`uncoalesced tensor + ` when :attr:`is_coalesced` is + unspecified or ``None``. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + indices (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. Will be cast to a :class:`torch.LongTensor` + internally. The indices are the coordinates of the non-zero values in the matrix, and thus + should be two-dimensional where the first dimension is the number of tensor dimensions and + the second dimension is the number of non-zero values. + values (array_like): Initial values for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not + provided the size will be inferred as the minimum size big enough to hold all non-zero + elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if None, infers data type from :attr:`values`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + is_coalesced (bool, optional): When``True``, the caller is + responsible for providing tensor indices that correspond to a + coalesced tensor. If the :attr:`check_invariants` flag is + False, no error will be raised if the prerequisites are not + met and this will lead to silently incorrect results. To force + coalescion please use :meth:`coalesce` on the resulting + Tensor. + Default: None: except for trivial cases (e.g. nnz < 2) the + resulting Tensor has is_coalesced set to ``False```. + + Example:: + + >>> i = torch.tensor([[0, 1, 1], + ... [2, 0, 2]]) + >>> v = torch.tensor([3, 4, 5], dtype=torch.float32) + >>> torch.sparse_coo_tensor(i, v, [2, 4]) + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + size=(2, 4), nnz=3, layout=torch.sparse_coo) + + >>> torch.sparse_coo_tensor(i, v) # Shape inference + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + size=(2, 3), nnz=3, layout=torch.sparse_coo) + + >>> torch.sparse_coo_tensor(i, v, [2, 4], + ... dtype=torch.float64, + ... device=torch.device('cuda:0')) + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + device='cuda:0', size=(2, 4), nnz=3, dtype=torch.float64, + layout=torch.sparse_coo) + + # Create an empty sparse tensor with the following invariants: + # 1. sparse_dim + dense_dim = len(SparseTensor.shape) + # 2. SparseTensor._indices().shape = (sparse_dim, nnz) + # 3. SparseTensor._values().shape = (nnz, SparseTensor.shape[sparse_dim:]) + # + # For instance, to create an empty sparse tensor with nnz = 0, dense_dim = 0 and + # sparse_dim = 1 (hence indices is a 2D tensor of shape = (1, 0)) + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), [], [1]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0,)), + size=(1,), nnz=0, layout=torch.sparse_coo) + + # and to create an empty sparse tensor with nnz = 0, dense_dim = 1 and + # sparse_dim = 1 + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), torch.empty([0, 2]), [1, 2]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0, 2)), + size=(1, 2), nnz=0, layout=torch.sparse_coo) + + .. _torch.sparse: https://pytorch.org/docs/stable/sparse.html + """ + ... +def sparse_csc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_csc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column) + ` with specified values at the given + :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix + multiplication operations in CSC format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncols + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and row_indices depending on where the given column + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + column. + row_indices (array_like): Row co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> ccol_indices = [0, 2, 4] + >>> row_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) + """ + ... +def sparse_csr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_csr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in CSR (Compressed Sparse Row) ` with specified + values at the given :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix multiplication operations + in CSR format are typically faster than that for sparse tensors in COO format. Make you have a look + at :ref:`the note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrows + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and col_indices depending on where the given row + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + row. + col_indices (array_like): Column co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> crow_indices = [0, 2, 4] + >>> col_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) + """ + ... +def split_copy(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.split`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... +def split_with_sizes_copy(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def spmm(input: Tensor, mat2: Tensor) -> Tensor: ... +def sqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sqrt(input, *, out=None) -> Tensor + + Returns a new tensor with the square-root of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.sqrt(a) + tensor([ nan, 1.0112, 0.2883, 0.6933]) + """ + ... +def sqrt_(input: Tensor) -> Tensor: ... +def square(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + square(input, *, out=None) -> Tensor + + Returns a new tensor with the square of the elements of :attr:`input`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.square(a) + tensor([ 4.3077, 1.0457, 0.0069, 0.2310]) + """ + ... +def square_(input: Tensor) -> Tensor: ... +@overload +def squeeze(input: Tensor) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: _int) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: _size) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: Union[str, ellipsis, None]) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def squeeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def squeeze_copy(input: Tensor, dim: _size, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def sspaddmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +@overload +def sspaddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +@overload +def sspaddmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +def stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + stack(tensors, dim=0, *, out=None) -> Tensor + + Concatenates a sequence of tensors along a new dimension. + + All tensors need to be of the same size. + + .. seealso:: + + :func:`torch.cat` concatenates the given sequence along an existing dimension. + + Arguments: + tensors (sequence of Tensors): sequence of tensors to concatenate + dim (int, optional): dimension to insert. Has to be between 0 and the number + of dimensions of concatenated tensors (inclusive). Default: 0 + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]) + >>> x = torch.stack((x, x)) # same as torch.stack((x, x), dim=0) + >>> x + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]], + + [[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x.size() + torch.Size([2, 2, 3]) + >>> x = torch.stack((x, x), dim=1) + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.3367, 0.1288, 0.2345]], + + [[ 0.2303, -1.1229, -0.1863], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x = torch.stack((x, x), dim=2) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + >>> x = torch.stack((x, x), dim=-1) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + """ + ... +@overload +def std(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, unbiased: _bool = True) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def sub(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def subtract(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + subtract(input, other, *, alpha=1, out=None) -> Tensor + + Alias for :func:`torch.sub`. + """ + ... +@overload +def subtract(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract(input, other, *, alpha=1, out=None) -> Tensor + + Alias for :func:`torch.sub`. + """ + ... +@overload +def sum(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +@overload +def sum(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +@overload +def sum(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +def svd(input: Tensor, some: _bool = True, compute_uv: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.svd: + r""" + svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) + + Computes the singular value decomposition of either a matrix or batch of + matrices :attr:`input`. The singular value decomposition is represented as a + namedtuple `(U, S, V)`, such that :attr:`input` :math:`= U \text{diag}(S) V^{\text{H}}`. + where :math:`V^{\text{H}}` is the transpose of `V` for real inputs, + and the conjugate transpose of `V` for complex inputs. + If :attr:`input` is a batch of matrices, then `U`, `S`, and `V` are also + batched with the same batch dimensions as :attr:`input`. + + If :attr:`some` is `True` (default), the method returns the reduced singular + value decomposition. In this case, if the last two dimensions of :attr:`input` are + `m` and `n`, then the returned `U` and `V` matrices will contain only + `min(n, m)` orthonormal columns. + + If :attr:`compute_uv` is `False`, the returned `U` and `V` will be + zero-filled matrices of shape `(m, m)` and `(n, n)` + respectively, and the same device as :attr:`input`. The argument :attr:`some` + has no effect when :attr:`compute_uv` is `False`. + + Supports :attr:`input` of float, double, cfloat and cdouble data types. + The dtypes of `U` and `V` are the same as :attr:`input`'s. `S` will + always be real-valued, even if :attr:`input` is complex. + + .. warning:: + + :func:`torch.svd` is deprecated in favor of :func:`torch.linalg.svd` + and will be removed in a future PyTorch release. + + ``U, S, V = torch.svd(A, some=some, compute_uv=True)`` (default) should be replaced with + + .. code:: python + + U, S, Vh = torch.linalg.svd(A, full_matrices=not some) + V = Vh.mH + + ``_, S, _ = torch.svd(A, some=some, compute_uv=False)`` should be replaced with + + .. code:: python + + S = torch.linalg.svdvals(A) + + .. note:: Differences with :func:`torch.linalg.svd`: + + * :attr:`some` is the opposite of + :func:`torch.linalg.svd`'s :attr:`full_matrices`. Note that + default value for both is `True`, so the default behavior is + effectively the opposite. + * :func:`torch.svd` returns `V`, whereas :func:`torch.linalg.svd` returns + `Vh`, that is, :math:`V^{\text{H}}`. + * If :attr:`compute_uv` is `False`, :func:`torch.svd` returns zero-filled + tensors for `U` and `Vh`, whereas :func:`torch.linalg.svd` returns + empty tensors. + + .. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices, + then the singular values of each matrix in the batch are returned in descending order. + + .. note:: The `S` tensor can only be used to compute gradients if :attr:`compute_uv` is `True`. + + .. note:: When :attr:`some` is `False`, the gradients on `U[..., :, min(m, n):]` + and `V[..., :, min(m, n):]` will be ignored in the backward pass, as those vectors + can be arbitrary bases of the corresponding subspaces. + + .. note:: The implementation of :func:`torch.linalg.svd` on CPU uses LAPACK's routine `?gesdd` + (a divide-and-conquer algorithm) instead of `?gesvd` for speed. Analogously, + on GPU, it uses cuSOLVER's routines `gesvdj` and `gesvdjBatched` on CUDA 10.1.243 + and later, and MAGMA's routine `gesdd` on earlier versions of CUDA. + + .. note:: The returned `U` will not be contiguous. The matrix (or batch of matrices) will + be represented as a column-major matrix (i.e. Fortran-contiguous). + + .. warning:: The gradients with respect to `U` and `V` will only be finite when the input does not + have zero nor repeated singular values. + + .. warning:: If the distance between any two singular values is close to zero, the gradients with respect to + `U` and `V` will be numerically unstable, as they depends on + :math:`\frac{1}{\min_{i \neq j} \sigma_i^2 - \sigma_j^2}`. The same happens when the matrix + has small singular values, as these gradients also depend on `S^{-1}`. + + .. warning:: For complex-valued :attr:`input` the singular value decomposition is not unique, + as `U` and `V` may be multiplied by an arbitrary phase factor :math:`e^{i \phi}` on every column. + The same happens when :attr:`input` has repeated singular values, where one may multiply + the columns of the spanning subspace in `U` and `V` by a rotation matrix + and `the resulting vectors will span the same subspace`_. + Different platforms, like NumPy, or inputs on different device types, + may produce different `U` and `V` tensors. + + Args: + input (Tensor): the input tensor of size `(*, m, n)` where `*` is zero or more + batch dimensions consisting of `(m, n)` matrices. + some (bool, optional): controls whether to compute the reduced or full decomposition, and + consequently, the shape of returned `U` and `V`. Default: `True`. + compute_uv (bool, optional): controls whether to compute `U` and `V`. Default: `True`. + + Keyword args: + out (tuple, optional): the output tuple of tensors + + Example:: + + >>> a = torch.randn(5, 3) + >>> a + tensor([[ 0.2364, -0.7752, 0.6372], + [ 1.7201, 0.7394, -0.0504], + [-0.3371, -1.0584, 0.5296], + [ 0.3550, -0.4022, 1.5569], + [ 0.2445, -0.0158, 1.1414]]) + >>> u, s, v = torch.svd(a) + >>> u + tensor([[ 0.4027, 0.0287, 0.5434], + [-0.1946, 0.8833, 0.3679], + [ 0.4296, -0.2890, 0.5261], + [ 0.6604, 0.2717, -0.2618], + [ 0.4234, 0.2481, -0.4733]]) + >>> s + tensor([2.3289, 2.0315, 0.7806]) + >>> v + tensor([[-0.0199, 0.8766, 0.4809], + [-0.5080, 0.4054, -0.7600], + [ 0.8611, 0.2594, -0.4373]]) + >>> torch.dist(a, torch.mm(torch.mm(u, torch.diag(s)), v.t())) + tensor(8.6531e-07) + >>> a_big = torch.randn(7, 5, 3) + >>> u, s, v = torch.svd(a_big) + >>> torch.dist(a_big, torch.matmul(torch.matmul(u, torch.diag_embed(s)), v.mT)) + tensor(2.6503e-06) + + .. _the resulting vectors will span the same subspace: + (https://en.wikipedia.org/wiki/Singular_value_decomposition#Singular_values,_singular_vectors,_and_their_relation_to_the_SVD) + """ + ... +def swapaxes(input: Tensor, axis0: _int, axis1: _int) -> Tensor: + r""" + swapaxes(input, axis0, axis1) -> Tensor + + Alias for :func:`torch.transpose`. + + This function is equivalent to NumPy's swapaxes function. + + Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) + """ + ... +def swapdims(input: Tensor, dim0: _int, dim1: _int) -> Tensor: + r""" + swapdims(input, dim0, dim1) -> Tensor + + Alias for :func:`torch.transpose`. + + This function is equivalent to NumPy's swapaxes function. + + Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapdims(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapdims(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) + """ + ... +def sym_constrain_range(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ... +def sym_constrain_range_for_size(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ... +def t(input: Tensor) -> Tensor: + r""" + t(input) -> Tensor + + Expects :attr:`input` to be <= 2-D tensor and transposes dimensions 0 + and 1. + + 0-D and 1-D tensors are returned as is. When input is a 2-D tensor this + is equivalent to ``transpose(input, 0, 1)``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.randn(()) + >>> x + tensor(0.1995) + >>> torch.t(x) + tensor(0.1995) + >>> x = torch.randn(3) + >>> x + tensor([ 2.4320, -0.4608, 0.7702]) + >>> torch.t(x) + tensor([ 2.4320, -0.4608, 0.7702]) + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.4875, 0.9158, -0.5872], + [ 0.3938, -0.6929, 0.6932]]) + >>> torch.t(x) + tensor([[ 0.4875, 0.3938], + [ 0.9158, -0.6929], + [-0.5872, 0.6932]]) + + See also :func:`torch.transpose`. + """ + ... +def t_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.t`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def take(input: Tensor, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + take(input, index) -> Tensor + + Returns a new tensor with the elements of :attr:`input` at the given indices. + The input tensor is treated as if it were viewed as a 1-D tensor. The result + takes the same shape as the indices. + + Args: + input (Tensor): the input tensor. + index (LongTensor): the indices into tensor + + Example:: + + >>> src = torch.tensor([[4, 3, 5], + ... [6, 7, 8]]) + >>> torch.take(src, torch.tensor([0, 2, 5])) + tensor([ 4, 5, 8]) + """ + ... +def take_along_dim(input: Tensor, indices: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + take_along_dim(input, indices, dim=None, *, out=None) -> Tensor + + Selects values from :attr:`input` at the 1-dimensional indices from :attr:`indices` along the given :attr:`dim`. + + If :attr:`dim` is None, the input array is treated as if it has been flattened to 1d. + + Functions that return indices along a dimension, like :func:`torch.argmax` and :func:`torch.argsort`, + are designed to work with this function. See the examples below. + + .. note:: + This function is similar to NumPy's `take_along_axis`. + See also :func:`torch.gather`. + + Args: + input (Tensor): the input tensor. + indices (tensor): the indices into :attr:`input`. Must have long dtype. + dim (int, optional): dimension to select along. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([[10, 30, 20], [60, 40, 50]]) + >>> max_idx = torch.argmax(t) + >>> torch.take_along_dim(t, max_idx) + tensor([60]) + >>> sorted_idx = torch.argsort(t, dim=1) + >>> torch.take_along_dim(t, sorted_idx, dim=1) + tensor([[10, 20, 30], + [40, 50, 60]]) + """ + ... +def tan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tan(input, *, out=None) -> Tensor + + Returns a new tensor with the tangent of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \tan(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.2027, -1.7687, 0.4412, -1.3856]) + >>> torch.tan(a) + tensor([-2.5930, 4.9859, 0.4722, -5.3366]) + """ + ... +def tan_(input: Tensor) -> Tensor: ... +def tanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tanh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic tangent of the elements + of :attr:`input`. + + .. math:: + \text{out}_{i} = \tanh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.8986, -0.7279, 1.1745, 0.2611]) + >>> torch.tanh(a) + tensor([ 0.7156, -0.6218, 0.8257, 0.2553]) + """ + ... +def tanh_(input: Tensor) -> Tensor: ... +def tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`. + + .. warning:: + + When working with tensors prefer using :func:`torch.Tensor.clone`, + :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for + readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to + ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)`` + is equivalent to ``t.clone().detach().requires_grad_(True)``. + + .. seealso:: + + :func:`torch.as_tensor` preserves autograd history and avoids copies where possible. + :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array. + + Args: + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + + Example:: + + >>> torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) + tensor([[ 0.1000, 1.2000], + [ 2.2000, 3.1000], + [ 4.9000, 5.2000]]) + + >>> torch.tensor([0, 1]) # Type inference on data + tensor([ 0, 1]) + + >>> torch.tensor([[0.11111, 0.222222, 0.3333333]], + ... dtype=torch.float64, + ... device=torch.device('cuda:0')) # creates a double tensor on a CUDA device + tensor([[ 0.1111, 0.2222, 0.3333]], dtype=torch.float64, device='cuda:0') + + >>> torch.tensor(3.14159) # Create a zero-dimensional (scalar) tensor + tensor(3.1416) + + >>> torch.tensor([]) # Create an empty tensor (of size (0,)) + tensor([]) + """ + ... +@overload +def tensor_split(input: Tensor, tensor_indices_or_sections: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +@overload +def tensor_split(input: Tensor, sections: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +@overload +def tensor_split(input: Tensor, indices: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +def threshold(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +def threshold_(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex]) -> Tensor: ... +def tile(input: Tensor, dims: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + tile(input, dims) -> Tensor + + Constructs a tensor by repeating the elements of :attr:`input`. + The :attr:`dims` argument specifies the number of repetitions + in each dimension. + + If :attr:`dims` specifies fewer dimensions than :attr:`input` has, then + ones are prepended to :attr:`dims` until all dimensions are specified. + For example, if :attr:`input` has shape (8, 6, 4, 2) and :attr:`dims` + is (2, 2), then :attr:`dims` is treated as (1, 1, 2, 2). + + Analogously, if :attr:`input` has fewer dimensions than :attr:`dims` + specifies, then :attr:`input` is treated as if it were unsqueezed at + dimension zero until it has as many dimensions as :attr:`dims` specifies. + For example, if :attr:`input` has shape (4, 2) and :attr:`dims` + is (3, 3, 2, 2), then :attr:`input` is treated as if it had the + shape (1, 1, 4, 2). + + .. note:: + + This function is similar to NumPy's tile function. + + Args: + input (Tensor): the tensor whose elements to repeat. + dims (tuple): the number of repetitions per dimension. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.tile((2,)) + tensor([1, 2, 3, 1, 2, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.tile(y, (2, 2)) + tensor([[1, 2, 1, 2], + [3, 4, 3, 4], + [1, 2, 1, 2], + [3, 4, 3, 4]]) + """ + ... +def topk(input: Tensor, k: Union[_int, SymInt], dim: _int = -1, largest: _bool = True, sorted: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.topk: + r""" + topk(input, k, dim=None, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor) + + Returns the :attr:`k` largest elements of the given :attr:`input` tensor along + a given dimension. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`largest` is ``False`` then the `k` smallest elements are returned. + + A namedtuple of `(values, indices)` is returned with the `values` and + `indices` of the largest `k` elements of each row of the `input` tensor in the + given dimension `dim`. + + The boolean option :attr:`sorted` if ``True``, will make sure that the returned + `k` elements are themselves sorted + + Args: + input (Tensor): the input tensor. + k (int): the k in "top-k" + dim (int, optional): the dimension to sort along + largest (bool, optional): controls whether to return largest or + smallest elements + sorted (bool, optional): controls whether to return the elements + in sorted order + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be + optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.topk(x, 3) + torch.return_types.topk(values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2])) + """ + ... +def trace(input: Tensor) -> Tensor: + r""" + trace(input) -> Tensor + + Returns the sum of the elements of the diagonal of the input 2-D matrix. + + Example:: + + >>> x = torch.arange(1., 10.).view(3, 3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.], + [ 7., 8., 9.]]) + >>> torch.trace(x) + tensor(15.) + """ + ... +@overload +def transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: + r""" + transpose(input, dim0, dim1) -> Tensor + + Returns a tensor that is a transposed version of :attr:`input`. + The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + + If :attr:`input` is a strided tensor then the resulting :attr:`out` + tensor shares its underlying storage with the :attr:`input` tensor, so + changing the content of one would change the content of the other. + + If :attr:`input` is a :ref:`sparse tensor ` then the + resulting :attr:`out` tensor *does not* share the underlying storage + with the :attr:`input` tensor. + + If :attr:`input` is a :ref:`sparse tensor ` with compressed + layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments + :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must + both be sparse dimensions. The batch dimensions of a sparse tensor are the + dimensions preceding the sparse dimensions. + + .. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + + Args: + input (Tensor): the input tensor. + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + + See also :func:`torch.t`. + """ + ... +@overload +def transpose(input: Tensor, dim0: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None]) -> Tensor: + r""" + transpose(input, dim0, dim1) -> Tensor + + Returns a tensor that is a transposed version of :attr:`input`. + The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + + If :attr:`input` is a strided tensor then the resulting :attr:`out` + tensor shares its underlying storage with the :attr:`input` tensor, so + changing the content of one would change the content of the other. + + If :attr:`input` is a :ref:`sparse tensor ` then the + resulting :attr:`out` tensor *does not* share the underlying storage + with the :attr:`input` tensor. + + If :attr:`input` is a :ref:`sparse tensor ` with compressed + layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments + :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must + both be sparse dimensions. The batch dimensions of a sparse tensor are the + dimensions preceding the sparse dimensions. + + .. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + + Args: + input (Tensor): the input tensor. + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + + See also :func:`torch.t`. + """ + ... +def transpose_copy(input: Tensor, dim0: _int, dim1: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.transpose`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Computes the `trapezoidal rule `_ along + :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + + Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, + the default computation is + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`dx` is specified the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + + effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, + assuming :attr:`x` is also a one-dimensional tensor with + elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. + The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` + and :attr:`y`, the function computes the difference between consecutive elements along + dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have + the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. + After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. + See the examples below for details. + + .. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. The approximation becomes more accurate as + the resolution of the partition increases. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) + """ + ... +@overload +def trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: + r""" + trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Computes the `trapezoidal rule `_ along + :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + + Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, + the default computation is + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`dx` is specified the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + + effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, + assuming :attr:`x` is also a one-dimensional tensor with + elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. + The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` + and :attr:`y`, the function computes the difference between consecutive elements along + dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have + the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. + After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. + See the examples below for details. + + .. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. The approximation becomes more accurate as + the resolution of the partition increases. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) + """ + ... +@overload +def trapz(y: Tensor, *, dx: _float = 1, dim: _int = -1) -> Tensor: + r""" + trapz(y, x, *, dim=-1) -> Tensor + + Alias for :func:`torch.trapezoid`. + """ + ... +@overload +def trapz(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + trapz(y, x, *, dim=-1) -> Tensor + + Alias for :func:`torch.trapezoid`. + """ + ... +def triangular_solve(input: Tensor, A: Tensor, upper: _bool = True, transpose: _bool = False, unitriangular: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.triangular_solve: + r""" + triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor) + + Solves a system of equations with a square upper or lower triangular invertible matrix :math:`A` + and multiple right-hand sides :math:`b`. + + In symbols, it solves :math:`AX = b` and assumes :math:`A` is square upper-triangular + (or lower-triangular if :attr:`upper`\ `= False`) and does not have zeros on the diagonal. + + `torch.triangular_solve(b, A)` can take in 2D inputs `b, A` or inputs that are + batches of 2D matrices. If the inputs are batches, then returns + batched outputs `X` + + If the diagonal of :attr:`A` contains zeros or elements that are very close to zero and + :attr:`unitriangular`\ `= False` (default) or if the input matrix is badly conditioned, + the result may contain `NaN` s. + + Supports input of float, double, cfloat and cdouble data types. + + .. warning:: + + :func:`torch.triangular_solve` is deprecated in favor of :func:`torch.linalg.solve_triangular` + and will be removed in a future PyTorch release. + :func:`torch.linalg.solve_triangular` has its arguments reversed and does not return a + copy of one of the inputs. + + ``X = torch.triangular_solve(B, A).solution`` should be replaced with + + .. code:: python + + X = torch.linalg.solve_triangular(A, B) + + Args: + b (Tensor): multiple right-hand sides of size :math:`(*, m, k)` where + :math:`*` is zero of more batch dimensions + A (Tensor): the input triangular coefficient matrix of size :math:`(*, m, m)` + where :math:`*` is zero or more batch dimensions + upper (bool, optional): whether :math:`A` is upper or lower triangular. Default: ``True``. + transpose (bool, optional): solves `op(A)X = b` where `op(A) = A^T` if this flag is ``True``, + and `op(A) = A` if it is ``False``. Default: ``False``. + unitriangular (bool, optional): whether :math:`A` is unit triangular. + If True, the diagonal elements of :math:`A` are assumed to be + 1 and not referenced from :math:`A`. Default: ``False``. + + Keyword args: + out ((Tensor, Tensor), optional): tuple of two tensors to write + the output to. Ignored if `None`. Default: `None`. + + Returns: + A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient` + is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b` + (or whatever variant of the system of equations, depending on the keyword arguments.) + + Examples:: + + >>> A = torch.randn(2, 2).triu() + >>> A + tensor([[ 1.1527, -1.0753], + [ 0.0000, 0.7986]]) + >>> b = torch.randn(2, 3) + >>> b + tensor([[-0.0210, 2.3513, -1.5492], + [ 1.5429, 0.7403, -1.0243]]) + >>> torch.triangular_solve(b, A) + torch.return_types.triangular_solve( + solution=tensor([[ 1.7841, 2.9046, -2.5405], + [ 1.9320, 0.9270, -1.2826]]), + cloned_coefficient=tensor([[ 1.1527, -1.0753], + [ 0.0000, 0.7986]])) + """ + ... +def tril(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tril(input, diagonal=0, *, out=None) -> Tensor + + Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices + :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + + The lower triangular part of the matrix is defined as the elements on and + below the diagonal. + + The argument :attr:`diagonal` controls which diagonal to consider. If + :attr:`diagonal` = 0, all elements on and below the main diagonal are + retained. A positive value includes just as many diagonals above the main + diagonal, and similarly a negative value excludes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where + :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0813, -0.8619, 0.7105], + [ 0.0935, 0.1380, 2.2112], + [-0.3409, -0.9828, 0.0289]]) + >>> torch.tril(a) + tensor([[-1.0813, 0.0000, 0.0000], + [ 0.0935, 0.1380, 0.0000], + [-0.3409, -0.9828, 0.0289]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 1.2219, 0.5653, -0.2521, -0.2345, 1.2544, 0.3461], + [ 0.4785, -0.4477, 0.6049, 0.6368, 0.8775, 0.7145], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.3615, 0.6864], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0978]]) + >>> torch.tril(b, diagonal=1) + tensor([[ 1.2219, 0.5653, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, -0.4477, 0.6049, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0000]]) + >>> torch.tril(b, diagonal=-1) + tensor([[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, 0.0000, 0.0000, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, 0.0000, 0.0000, 0.0000]]) + """ + ... +def tril_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + + Returns the indices of the lower triangular part of a :attr:`row`-by- + :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row + coordinates of all indices and the second row contains column coordinates. + Indices are ordered based on rows and then columns. + + The lower triangular part of the matrix is defined as the elements on and + below the diagonal. + + The argument :attr:`offset` controls which diagonal to consider. If + :attr:`offset` = 0, all elements on and below the main diagonal are + retained. A positive value includes just as many diagonals above the main + diagonal, and similarly a negative value excludes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` + where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + .. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. + + Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + + Example:: + + >>> a = torch.tril_indices(3, 3) + >>> a + tensor([[0, 1, 1, 2, 2, 2], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, -1) + >>> a + tensor([[1, 2, 2, 3, 3, 3], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], + [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) + """ + ... +def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, margin: _float = 1.0, p: _float = 2, eps: _float = 1e-06, swap: _bool = False, reduction: _int = 1) -> Tensor: ... +def triu(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + triu(input, diagonal=0, *, out=None) -> Tensor + + Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices + :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + + The upper triangular part of the matrix is defined as the elements on and + above the diagonal. + + The argument :attr:`diagonal` controls which diagonal to consider. If + :attr:`diagonal` = 0, all elements on and above the main diagonal are + retained. A positive value excludes just as many diagonals above the main + diagonal, and similarly a negative value includes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where + :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.3480, -0.5211, -0.4573]]) + >>> torch.triu(a) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.0000, -1.0680, 0.6602], + [ 0.0000, 0.0000, -0.4573]]) + >>> torch.triu(a, diagonal=1) + tensor([[ 0.0000, 0.5207, 2.0049], + [ 0.0000, 0.0000, 0.6602], + [ 0.0000, 0.0000, 0.0000]]) + >>> torch.triu(a, diagonal=-1) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.0000, -0.5211, -0.4573]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.4333, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [-0.9888, 1.0679, -1.3337, -1.6556, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=1) + tensor([[ 0.0000, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [ 0.0000, 0.0000, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.0000, 0.0000, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=-1) + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, -1.3337, -1.6556, 0.4798, 0.2830]]) + """ + ... +def triu_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + + Returns the indices of the upper triangular part of a :attr:`row` by + :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row + coordinates of all indices and the second row contains column coordinates. + Indices are ordered based on rows and then columns. + + The upper triangular part of the matrix is defined as the elements on and + above the diagonal. + + The argument :attr:`offset` controls which diagonal to consider. If + :attr:`offset` = 0, all elements on and above the main diagonal are + retained. A positive value excludes just as many diagonals above the main + diagonal, and similarly a negative value includes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` + where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + .. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. + + Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + + Example:: + + >>> a = torch.triu_indices(3, 3) + >>> a + tensor([[0, 0, 0, 1, 1, 2], + [0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, -1) + >>> a + tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3], + [0, 1, 2, 0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1], + [1, 2, 2]]) + """ + ... +def true_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: + r""" + true_divide(dividend, divisor, *, out) -> Tensor + + Alias for :func:`torch.div` with ``rounding_mode=None``. + """ + ... +def trunc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + trunc(input, *, out=None) -> Tensor + + Returns a new tensor with the truncated integer values of + the elements of :attr:`input`. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 3.4742, 0.5466, -0.8008, -0.9079]) + >>> torch.trunc(a) + tensor([ 3., 0., -0., -0.]) + """ + ... +def trunc_(input: Tensor) -> Tensor: ... +@overload +def unbind(input: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unbind(input, dim=0) -> seq + + Removes a tensor dimension. + + Returns a tuple of all slices along a given dimension, already without it. + + Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + + Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) + """ + ... +@overload +def unbind(input: Tensor, dim: Union[str, ellipsis, None]) -> Tuple[Tensor, ...]: + r""" + unbind(input, dim=0) -> seq + + Removes a tensor dimension. + + Returns a tuple of all slices along a given dimension, already without it. + + Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + + Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) + """ + ... +def unbind_copy(input: Tensor, dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.unbind`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def unflatten(input: Tensor, dim: Union[str, ellipsis, None], sizes: Sequence[Union[_int, SymInt]], names: Sequence[Union[str, ellipsis, None]]) -> Tensor: + r""" + unflatten(input, dim, sizes) -> Tensor + + Expands a dimension of the input tensor over multiple dimensions. + + .. seealso:: + + :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one. + + Args: + input (Tensor): the input tensor. + dim (int): Dimension to be unflattened, specified as an index into + ``input.shape``. + sizes (Tuple[int]): New shape of the unflattened dimension. + One of its elements can be `-1` in which case the corresponding output + dimension is inferred. Otherwise, the product of ``sizes`` *must* + equal ``input.shape[dim]``. + + Returns: + A View of input with the specified dimension unflattened. + + Examples:: + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape + torch.Size([5, 2, 2, 3, 1, 1, 3]) + """ + ... +@overload +def unflatten(input: Tensor, dim: _int, sizes: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + unflatten(input, dim, sizes) -> Tensor + + Expands a dimension of the input tensor over multiple dimensions. + + .. seealso:: + + :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one. + + Args: + input (Tensor): the input tensor. + dim (int): Dimension to be unflattened, specified as an index into + ``input.shape``. + sizes (Tuple[int]): New shape of the unflattened dimension. + One of its elements can be `-1` in which case the corresponding output + dimension is inferred. Otherwise, the product of ``sizes`` *must* + equal ``input.shape[dim]``. + + Returns: + A View of input with the specified dimension unflattened. + + Examples:: + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape + torch.Size([5, 2, 2, 3, 1, 1, 3]) + """ + ... +def unfold_copy(input: Tensor, dimension: _int, size: _int, step: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.unfold`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def unique_dim(input: Tensor, dim: _int, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ... +def unsafe_chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_chunk(input, chunks, dim=0) -> List of Tensors + + Works like :func:`torch.chunk` but without enforcing the autograd restrictions + on inplace modification of the outputs. + + .. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. + """ + ... +def unsafe_split(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_split(tensor, split_size_or_sections, dim=0) -> List of Tensors + + Works like :func:`torch.split` but without enforcing the autograd restrictions + on inplace modification of the outputs. + + .. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. + """ + ... +def unsafe_split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... +def unsqueeze(input: Tensor, dim: _int) -> Tensor: + r""" + unsqueeze(input, dim) -> Tensor + + Returns a new tensor with a dimension of size one inserted at the + specified position. + + The returned tensor shares the same underlying data with this tensor. + + A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)`` + can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze` + applied at :attr:`dim` = ``dim + input.dim() + 1``. + + Args: + input (Tensor): the input tensor. + dim (int): the index at which to insert the singleton dimension + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4]) + >>> torch.unsqueeze(x, 0) + tensor([[ 1, 2, 3, 4]]) + >>> torch.unsqueeze(x, 1) + tensor([[ 1], + [ 2], + [ 3], + [ 4]]) + """ + ... +def unsqueeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.unsqueeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.values`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def vander(x: Tensor, N: Optional[_int] = None, increasing: _bool = False) -> Tensor: + r""" + vander(x, N=None, increasing=False) -> Tensor + + Generates a Vandermonde matrix. + + The columns of the output matrix are elementwise powers of the input vector :math:`x^{(N-1)}, x^{(N-2)}, ..., x^0`. + If increasing is True, the order of the columns is reversed :math:`x^0, x^1, ..., x^{(N-1)}`. Such a + matrix with a geometric progression in each row is named for Alexandre-Theophile Vandermonde. + + Arguments: + x (Tensor): 1-D input tensor. + N (int, optional): Number of columns in the output. If N is not specified, + a square array is returned :math:`(N = len(x))`. + increasing (bool, optional): Order of the powers of the columns. If True, + the powers increase from left to right, if False (the default) they are reversed. + + Returns: + Tensor: Vandermonde matrix. If increasing is False, the first column is :math:`x^{(N-1)}`, + the second :math:`x^{(N-2)}` and so forth. If increasing is True, the columns + are :math:`x^0, x^1, ..., x^{(N-1)}`. + + Example:: + + >>> x = torch.tensor([1, 2, 3, 5]) + >>> torch.vander(x) + tensor([[ 1, 1, 1, 1], + [ 8, 4, 2, 1], + [ 27, 9, 3, 1], + [125, 25, 5, 1]]) + >>> torch.vander(x, N=3) + tensor([[ 1, 1, 1], + [ 4, 2, 1], + [ 9, 3, 1], + [25, 5, 1]]) + >>> torch.vander(x, N=3, increasing=True) + tensor([[ 1, 1, 1], + [ 1, 2, 4], + [ 1, 3, 9], + [ 1, 5, 25]]) + """ + ... +@overload +def var(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, unbiased: _bool = True) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +def vdot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + vdot(input, other, *, out=None) -> Tensor + + Computes the dot product of two 1D vectors along a dimension. + + In symbols, this function computes + + .. math:: + + \sum_{i=1}^n \overline{x_i}y_i. + + where :math:`\overline{x_i}` denotes the conjugate for complex + vectors, and it is the identity for real vectors. + + .. note:: + + Unlike NumPy's vdot, torch.vdot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + + .. seealso:: + + :func:`torch.linalg.vecdot` computes the dot product of two batches of vectors along a dimension. + + Args: + input (Tensor): first tensor in the dot product, must be 1D. Its conjugate is used if it's complex. + other (Tensor): second tensor in the dot product, must be 1D. + + Keyword args: + + .. note:: out (Tensor, optional): the output tensor. + + + Example:: + + >>> torch.vdot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + >>> a = torch.tensor((1 +2j, 3 - 1j)) + >>> b = torch.tensor((2 +1j, 4 - 0j)) + >>> torch.vdot(a, b) + tensor([16.+1.j]) + >>> torch.vdot(b, a) + tensor([16.-1.j]) + """ + ... +def view_as_complex(input: Tensor) -> Tensor: + r""" + view_as_complex(input) -> Tensor + + Returns a view of :attr:`input` as a complex tensor. For an input complex + tensor of :attr:`size` :math:`m1, m2, \dots, mi, 2`, this function returns a + new complex tensor of :attr:`size` :math:`m1, m2, \dots, mi` where the last + dimension of the input tensor is expected to represent the real and imaginary + components of complex numbers. + + .. warning:: + :func:`view_as_complex` is only supported for tensors with + :class:`torch.dtype` ``torch.float64`` and ``torch.float32``. The input is + expected to have the last dimension of :attr:`size` 2. In addition, the + tensor must have a `stride` of 1 for its last dimension. The strides of all + other dimensions must be even numbers. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, 2) + >>> x + tensor([[ 1.6116, -0.5772], + [-1.4606, -0.9120], + [ 0.0786, -1.7497], + [-0.6561, -1.6623]]) + >>> torch.view_as_complex(x) + tensor([(1.6116-0.5772j), (-1.4606-0.9120j), (0.0786-1.7497j), (-0.6561-1.6623j)]) + """ + ... +def view_as_complex_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view_as_complex`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def view_as_real(input: Tensor) -> Tensor: + r""" + view_as_real(input) -> Tensor + + Returns a view of :attr:`input` as a real tensor. For an input complex tensor of + :attr:`size` :math:`m1, m2, \dots, mi`, this function returns a new + real tensor of size :math:`m1, m2, \dots, mi, 2`, where the last dimension of size 2 + represents the real and imaginary components of complex numbers. + + .. warning:: + :func:`view_as_real` is only supported for tensors with ``complex dtypes``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.4737-0.3839j), (-0.2098-0.6699j), (0.3470-0.9451j), (-0.5174-1.3136j)]) + >>> torch.view_as_real(x) + tensor([[ 0.4737, -0.3839], + [-0.2098, -0.6699], + [ 0.3470, -0.9451], + [-0.5174, -1.3136]]) + """ + ... +def view_as_real_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view_as_real`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def view_copy(input: Tensor, dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def view_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def vsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + vsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors + vertically according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) + (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.vsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + """ + ... +@overload +def vsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + vsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors + vertically according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) + (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.vsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + """ + ... +def vstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + vstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence vertically (row wise). + + This is equivalent to concatenation along the first axis after all 1-D tensors have been reshaped by :func:`torch.atleast_2d`. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.vstack((a,b)) + tensor([[1, 2, 3], + [4, 5, 6]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.vstack((a,b)) + tensor([[1], + [2], + [3], + [4], + [5], + [6]]) + """ + ... +@overload +def where(condition: Tensor) -> Tuple[Tensor, ...]: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, self: Union[Number, _complex], other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def xlogy(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy(self: Union[Number, _complex], other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def xlogy_(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +def zero_(input: Tensor) -> Tensor: ... +@overload +def zeros(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +def zeros_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the same size as + :attr:`input`. ``torch.zeros_like(input)`` is equivalent to + ``torch.zeros(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.zeros_like(input, out=output)`` is equivalent to + ``torch.zeros(input.size(), out=output)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> input = torch.empty(2, 3) + >>> torch.zeros_like(input) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... + +__all__ = ['__and__', '__lshift__', '__or__', '__rshift__', '__xor__', '_adaptive_avg_pool2d', + '_adaptive_avg_pool3d', '_add_batch_dim', '_add_relu', '_add_relu_', '_addmm_activation', + '_aminmax', '_amp_foreach_non_finite_check_and_unscale_', '_amp_update_scale_', '_assert_async', + '_assert_scalar', '_assert_tensor_metadata', '_batch_norm_impl_index', '_cast_Byte', '_cast_Char', + '_cast_Double', '_cast_Float', '_cast_Half', '_cast_Int', '_cast_Long', '_cast_Short', + '_choose_qparams_per_tensor', '_chunk_cat', '_coalesce', '_compute_linear_combination', '_conj', + '_conj_copy', '_conj_physical', '_convert_indices_from_coo_to_csr', + '_convert_indices_from_csr_to_coo', '_convert_weight_to_int4pack', '_convolution', + '_convolution_mode', '_copy_from', '_copy_from_and_resize', '_cslt_compress', '_cslt_sparse_mm', + '_cslt_sparse_mm_search', '_ctc_loss', '_cudnn_ctc_loss', '_cudnn_init_dropout_state', + '_cudnn_rnn', '_cudnn_rnn_flatten_weight', '_cufft_clear_plan_cache', + '_cufft_get_plan_cache_max_size', '_cufft_get_plan_cache_size', '_cufft_set_plan_cache_max_size', + '_cummax_helper', '_cummin_helper', '_debug_has_internal_overlap', '_dim_arange', + '_dirichlet_grad', '_disable_functionalization', '_efficientzerotensor', '_embedding_bag', + '_embedding_bag_forward_only', '_empty_affine_quantized', '_empty_per_channel_affine_quantized', + '_enable_functionalization', '_euclidean_dist', '_fake_quantize_learnable_per_channel_affine', + '_fake_quantize_learnable_per_tensor_affine', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', '_fft_c2c', '_fft_c2r', '_fft_r2c', + '_fill_mem_eff_dropout_mask_', '_foobar', '_foreach_abs', '_foreach_abs_', '_foreach_acos', + '_foreach_acos_', '_foreach_add', '_foreach_add_', '_foreach_addcdiv', '_foreach_addcdiv_', + '_foreach_addcmul', '_foreach_addcmul_', '_foreach_asin', '_foreach_asin_', '_foreach_atan', + '_foreach_atan_', '_foreach_ceil', '_foreach_ceil_', '_foreach_clamp_max', '_foreach_clamp_max_', + '_foreach_clamp_min', '_foreach_clamp_min_', '_foreach_copy_', '_foreach_cos', '_foreach_cos_', + '_foreach_cosh', '_foreach_cosh_', '_foreach_div', '_foreach_div_', '_foreach_erf', + '_foreach_erf_', '_foreach_erfc', '_foreach_erfc_', '_foreach_exp', '_foreach_exp_', + '_foreach_expm1', '_foreach_expm1_', '_foreach_floor', '_foreach_floor_', '_foreach_frac', + '_foreach_frac_', '_foreach_lerp', '_foreach_lerp_', '_foreach_lgamma', '_foreach_lgamma_', + '_foreach_log', '_foreach_log10', '_foreach_log10_', '_foreach_log1p', '_foreach_log1p_', + '_foreach_log2', '_foreach_log2_', '_foreach_log_', '_foreach_maximum', '_foreach_maximum_', + '_foreach_minimum', '_foreach_minimum_', '_foreach_mul', '_foreach_mul_', '_foreach_neg', + '_foreach_neg_', '_foreach_norm', '_foreach_pow', '_foreach_pow_', '_foreach_reciprocal', + '_foreach_reciprocal_', '_foreach_round', '_foreach_round_', '_foreach_sigmoid', + '_foreach_sigmoid_', '_foreach_sign', '_foreach_sign_', '_foreach_sin', '_foreach_sin_', + '_foreach_sinh', '_foreach_sinh_', '_foreach_sqrt', '_foreach_sqrt_', '_foreach_sub', + '_foreach_sub_', '_foreach_tan', '_foreach_tan_', '_foreach_tanh', '_foreach_tanh_', + '_foreach_trunc', '_foreach_trunc_', '_foreach_zero_', '_from_functional_tensor', + '_functional_assert_async', '_functional_assert_scalar', '_functional_sym_constrain_range', + '_functional_sym_constrain_range_for_size', + '_functionalize_are_all_mutations_hidden_from_autograd', + '_functionalize_are_all_mutations_under_no_grad_or_inference_mode', '_functionalize_commit_update', + '_functionalize_mark_mutation_hidden_from_autograd', '_functionalize_replace', + '_functionalize_sync', '_fused_adam_', '_fused_adamw_', '_fused_dropout', + '_fused_moving_avg_obs_fq_helper', '_fused_moving_avg_obs_fq_helper', '_fused_sdp_choice', + '_fused_sgd_', '_fw_primal_copy', '_grid_sampler_2d_cpu_fallback', + '_has_compatible_shallow_copy_type', '_histogramdd_bin_edges', '_histogramdd_from_bin_cts', + '_histogramdd_from_bin_tensors', '_index_put_impl_', '_indices_copy', '_int_mm', '_is_all_true', + '_is_any_true', '_is_functional_tensor', '_is_zerotensor', '_lazy_clone', '_linalg_check_errors', + '_linalg_det', '_linalg_det', '_linalg_eigh', '_linalg_eigh', '_linalg_slogdet', '_linalg_slogdet', + '_linalg_solve_ex', '_linalg_solve_ex', '_linalg_svd', '_linalg_svd', '_log_softmax', + '_log_softmax_backward_data', '_logcumsumexp', '_lstm_mps', '_lu_with_info', '_lu_with_info', + '_make_dep_token', '_make_dual', '_make_dual_copy', '_make_per_channel_quantized_tensor', + '_make_per_tensor_quantized_tensor', '_masked_scale', '_masked_softmax', '_mixed_dtypes_linear', + '_mkldnn_reshape', '_mkldnn_transpose', '_mkldnn_transpose_', '_mps_convolution', + '_mps_convolution_transpose', '_native_batch_norm_legit', '_native_batch_norm_legit_no_training', + '_native_multi_head_attention', '_neg_view', '_neg_view_copy', '_nested_from_padded', + '_nested_from_padded_and_nested_example', '_nested_get_jagged_dummy', '_nested_get_lengths', + '_nested_get_offsets', '_nested_get_ragged_idx', '_nested_get_values', '_nested_get_values_copy', + '_nested_tensor_from_mask', '_nested_tensor_from_mask_left_aligned', + '_nested_tensor_from_tensor_list', '_nested_tensor_softmax_with_shape', '_nested_view_from_buffer', + '_nested_view_from_buffer_copy', '_nested_view_from_jagged', '_nested_view_from_jagged_copy', + '_nnpack_available', '_nnpack_spatial_convolution', '_pack_padded_sequence', + '_pad_packed_sequence', '_pin_memory', '_prelu_kernel', '_print', '_propagate_xla_data', + '_remove_batch_dim', '_reshape_alias_copy', '_reshape_from_tensor', '_resize_output_', + '_rowwise_prune', '_sample_dirichlet', '_saturate_weight_to_fp16', + '_scaled_dot_product_attention_math', '_scaled_dot_product_cudnn_attention', + '_scaled_dot_product_cudnn_attention', '_scaled_dot_product_efficient_attention', + '_scaled_dot_product_efficient_attention', '_scaled_dot_product_flash_attention', + '_scaled_dot_product_flash_attention', '_scaled_dot_product_flash_attention_for_cpu', + '_scaled_dot_product_flash_attention_for_cpu', '_scaled_mm', '_shape_as_tensor', + '_sobol_engine_draw', '_sobol_engine_ff_', '_sobol_engine_initialize_state_', + '_sobol_engine_scramble_', '_softmax', '_softmax_backward_data', '_sparse_broadcast_to', + '_sparse_broadcast_to_copy', '_sparse_csr_prod', '_sparse_csr_sum', + '_sparse_log_softmax_backward_data', '_sparse_semi_structured_linear', + '_sparse_softmax_backward_data', '_sparse_sparse_matmul', '_sparse_sum', '_stack', + '_standard_gamma', '_standard_gamma_grad', '_sync', '_test_autograd_multiple_dispatch', + '_test_autograd_multiple_dispatch_view', '_test_autograd_multiple_dispatch_view_copy', + '_test_check_tensor', '_test_functorch_fallback', '_test_parallel_materialize', + '_test_serialization_subcmul', '_to_cpu', '_to_functional_tensor', '_to_sparse_semi_structured', + '_transform_bias_rescale_qkv', '_transformer_encoder_layer_fwd', '_trilinear', + '_triton_multi_head_attention', '_triton_scaled_dot_attention', '_unique', '_unique2', + '_unpack_dual', '_unpack_dual', '_unsafe_index', '_unsafe_index_put', '_use_cudnn_ctc_loss', + '_use_cudnn_rnn_flatten_weight', '_validate_compressed_sparse_indices', + '_validate_sparse_bsc_tensor_args', '_validate_sparse_bsr_tensor_args', + '_validate_sparse_compressed_tensor_args', '_validate_sparse_coo_tensor_args', + '_validate_sparse_csc_tensor_args', '_validate_sparse_csr_tensor_args', '_values_copy', + '_weight_int4pack_mm', '_weight_int8pack_mm', '_weight_norm', '_weight_norm_interface', 'abs', + 'abs_', 'absolute', 'acos', 'acos_', 'acosh', 'acosh_', 'adaptive_avg_pool1d', + 'adaptive_max_pool1d', 'add', 'addbmm', 'addcdiv', 'addcmul', 'addmm', 'addmv', 'addmv_', 'addr', + 'adjoint', 'affine_grid_generator', 'alias_copy', 'all', 'allclose', 'alpha_dropout', + 'alpha_dropout_', 'amax', 'amin', 'aminmax', 'aminmax', 'angle', 'any', 'arange', 'arccos', + 'arccos_', 'arccosh', 'arccosh_', 'arcsin', 'arcsin_', 'arcsinh', 'arcsinh_', 'arctan', 'arctan2', + 'arctan_', 'arctanh', 'arctanh_', 'argmax', 'argmin', 'argsort', 'argwhere', 'as_strided', + 'as_strided_', 'as_strided_copy', 'as_strided_scatter', 'as_tensor', 'asarray', 'asin', 'asin_', + 'asinh', 'asinh_', 'atan', 'atan2', 'atan_', 'atanh', 'atanh_', 'avg_pool1d', 'baddbmm', + 'bartlett_window', 'batch_norm', 'batch_norm_backward_elemt', 'batch_norm_backward_reduce', + 'batch_norm_elemt', 'batch_norm_gather_stats', 'batch_norm_gather_stats_with_counts', + 'batch_norm_stats', 'batch_norm_update_stats', 'bernoulli', 'bilinear', + 'binary_cross_entropy_with_logits', 'bincount', 'binomial', 'bitwise_and', 'bitwise_left_shift', + 'bitwise_not', 'bitwise_or', 'bitwise_right_shift', 'bitwise_xor', 'blackman_window', 'bmm', + 'broadcast_to', 'bucketize', 'can_cast', 'cat', 'ccol_indices_copy', 'ceil', 'ceil_', 'celu', + 'celu_', 'channel_shuffle', 'cholesky', 'cholesky_inverse', 'cholesky_solve', + 'choose_qparams_optimized', 'chunk', 'clamp', 'clamp_', 'clamp_max', 'clamp_max_', 'clamp_min', + 'clamp_min_', 'clip', 'clip_', 'clone', 'col_indices_copy', 'column_stack', 'combinations', + 'complex', 'concat', 'concatenate', 'conj', 'conj_physical', 'conj_physical_', 'constant_pad_nd', + 'conv1d', 'conv2d', 'conv3d', 'conv_tbc', 'conv_transpose1d', 'conv_transpose2d', + 'conv_transpose3d', 'convolution', 'copysign', 'corrcoef', 'cos', 'cos_', 'cosh', 'cosh_', + 'cosine_embedding_loss', 'cosine_similarity', 'count_nonzero', 'cov', 'cross', 'crow_indices_copy', + 'ctc_loss', 'cudnn_affine_grid_generator', 'cudnn_batch_norm', 'cudnn_convolution', + 'cudnn_convolution_add_relu', 'cudnn_convolution_relu', 'cudnn_convolution_transpose', + 'cudnn_grid_sampler', 'cudnn_is_acceptable', 'cummax', 'cummax', 'cummin', 'cummin', 'cumprod', + 'cumsum', 'cumulative_trapezoid', 'deg2rad', 'deg2rad_', 'dequantize', 'det', 'detach', 'detach_', + 'detach_copy', 'diag', 'diag_embed', 'diagflat', 'diagonal', 'diagonal_copy', 'diagonal_scatter', + 'diff', 'digamma', 'dist', 'div', 'divide', 'dot', 'dropout', 'dropout_', 'dsmm', 'dsplit', + 'dstack', 'embedding', 'embedding_bag', 'embedding_renorm_', 'empty', 'empty_like', + 'empty_permuted', 'empty_quantized', 'empty_strided', 'eq', 'equal', 'erf', 'erf_', 'erfc', + 'erfc_', 'erfinv', 'exp', 'exp2', 'exp2_', 'exp_', 'expand_copy', 'expm1', 'expm1_', 'eye', + 'fake_quantize_per_channel_affine', 'fake_quantize_per_tensor_affine', 'fbgemm_linear_fp16_weight', + 'fbgemm_linear_fp16_weight_fp32_activation', 'fbgemm_linear_int8_weight', + 'fbgemm_linear_int8_weight_fp32_activation', 'fbgemm_linear_quantize_weight', + 'fbgemm_pack_gemm_matrix_fp16', 'fbgemm_pack_quantized_matrix', 'feature_alpha_dropout', + 'feature_alpha_dropout_', 'feature_dropout', 'feature_dropout_', 'fill', 'fill_', 'fix', 'fix_', + 'flatten', 'flip', 'fliplr', 'flipud', 'float_power', 'floor', 'floor_', 'floor_divide', 'fmax', + 'fmin', 'fmod', 'frac', 'frac_', 'frexp', 'frexp', 'frobenius_norm', 'from_file', 'from_numpy', + 'frombuffer', 'full', 'full_like', 'fused_moving_avg_obs_fake_quant', 'gather', 'gcd', 'gcd_', + 'ge', 'geqrf', 'geqrf', 'ger', 'get_default_dtype', 'get_num_interop_threads', 'get_num_threads', + 'gradient', 'greater', 'greater_equal', 'grid_sampler', 'grid_sampler_2d', 'grid_sampler_3d', + 'group_norm', 'gru', 'gru_cell', 'gt', 'hamming_window', 'hann_window', 'hardshrink', 'heaviside', + 'hinge_embedding_loss', 'histc', 'histogram', 'histogram', 'histogramdd', 'histogramdd', 'hsmm', + 'hsplit', 'hspmm', 'hstack', 'hypot', 'i0', 'i0_', 'igamma', 'igammac', 'imag', 'index_add', + 'index_copy', 'index_fill', 'index_put', 'index_put_', 'index_reduce', 'index_select', + 'indices_copy', 'init_num_threads', 'inner', 'instance_norm', 'int_repr', 'inverse', 'is_complex', + 'is_conj', 'is_distributed', 'is_floating_point', 'is_grad_enabled', 'is_inference', + 'is_inference_mode_enabled', 'is_neg', 'is_nonzero', 'is_same_size', 'is_signed', + 'is_vulkan_available', 'isclose', 'isfinite', 'isin', 'isinf', 'isnan', 'isneginf', 'isposinf', + 'isreal', 'istft', 'kaiser_window', 'kl_div', 'kron', 'kthvalue', 'kthvalue', 'layer_norm', 'lcm', + 'lcm_', 'ldexp', 'ldexp_', 'le', 'lerp', 'less', 'less_equal', 'lgamma', 'linspace', 'log', + 'log10', 'log10_', 'log1p', 'log1p_', 'log2', 'log2_', 'log_', 'log_softmax', 'logaddexp', + 'logaddexp2', 'logcumsumexp', 'logdet', 'logical_and', 'logical_not', 'logical_or', 'logical_xor', + 'logit', 'logit_', 'logspace', 'logsumexp', 'lstm', 'lstm_cell', 'lt', 'lu_solve', 'lu_unpack', + 'lu_unpack', 'margin_ranking_loss', 'masked_fill', 'masked_scatter', 'masked_select', 'matmul', + 'matrix_exp', 'matrix_power', 'max', 'max', 'max_pool1d', 'max_pool1d_with_indices', 'max_pool2d', + 'max_pool3d', 'maximum', 'mean', 'median', 'median', 'min', 'min', 'minimum', 'miopen_batch_norm', + 'miopen_convolution', 'miopen_convolution_add_relu', 'miopen_convolution_relu', + 'miopen_convolution_transpose', 'miopen_depthwise_convolution', 'miopen_rnn', + 'mkldnn_adaptive_avg_pool2d', 'mkldnn_convolution', 'mkldnn_linear_backward_weights', + 'mkldnn_max_pool2d', 'mkldnn_max_pool3d', 'mkldnn_rnn_layer', 'mm', 'mode', 'mode', 'moveaxis', + 'movedim', 'msort', 'mul', 'multinomial', 'multiply', 'mv', 'mvlgamma', 'nan_to_num', + 'nan_to_num_', 'nanmean', 'nanmedian', 'nanmedian', 'nanquantile', 'nansum', 'narrow', + 'narrow_copy', 'native_batch_norm', 'native_channel_shuffle', 'native_dropout', + 'native_group_norm', 'native_layer_norm', 'native_norm', 'ne', 'neg', 'neg_', 'negative', + 'negative_', 'nextafter', 'nonzero', 'nonzero_static', 'norm_except_dim', 'normal', 'not_equal', + 'nuclear_norm', 'numel', 'ones', 'ones_like', 'orgqr', 'ormqr', 'outer', 'pairwise_distance', + 'pdist', 'permute', 'permute_copy', 'pinverse', 'pixel_shuffle', 'pixel_unshuffle', 'poisson', + 'poisson_nll_loss', 'polar', 'polygamma', 'positive', 'pow', 'prelu', 'prod', 'promote_types', + 'put', 'q_per_channel_axis', 'q_per_channel_scales', 'q_per_channel_zero_points', 'q_scale', + 'q_zero_point', 'qr', 'qr', 'quantile', 'quantize_per_channel', 'quantize_per_tensor', + 'quantize_per_tensor_dynamic', 'quantized_batch_norm', 'quantized_gru_cell', 'quantized_lstm_cell', + 'quantized_max_pool1d', 'quantized_max_pool2d', 'quantized_max_pool3d', 'quantized_rnn_relu_cell', + 'quantized_rnn_tanh_cell', 'rad2deg', 'rad2deg_', 'rand', 'rand_like', 'randint', 'randint_like', + 'randn', 'randn_like', 'randperm', 'range', 'ravel', 'real', 'reciprocal', 'reciprocal_', 'relu', + 'relu_', 'remainder', 'renorm', 'repeat_interleave', 'reshape', 'resize_as_', 'resize_as_sparse_', + 'resolve_conj', 'resolve_neg', 'result_type', 'rnn_relu', 'rnn_relu_cell', 'rnn_tanh', + 'rnn_tanh_cell', 'roll', 'rot90', 'round', 'round_', 'row_indices_copy', 'row_stack', 'rrelu', + 'rrelu_', 'rsqrt', 'rsqrt_', 'rsub', 'saddmm', 'scalar_tensor', 'scatter', 'scatter_add', + 'scatter_reduce', 'searchsorted', 'segment_reduce', 'select', 'select_copy', 'select_scatter', + 'selu', 'selu_', 'set_flush_denormal', 'set_num_interop_threads', 'set_num_threads', 'sgn', + 'sigmoid', 'sigmoid_', 'sign', 'signbit', 'sin', 'sin_', 'sinc', 'sinc_', 'sinh', 'sinh_', + 'slice_copy', 'slice_inverse', 'slice_scatter', 'slogdet', 'slogdet', 'smm', 'softmax', 'sort', + 'sort', 'sparse_bsc_tensor', 'sparse_bsr_tensor', 'sparse_compressed_tensor', 'sparse_coo_tensor', + 'sparse_csc_tensor', 'sparse_csr_tensor', 'split_copy', 'split_with_sizes', + 'split_with_sizes_copy', 'spmm', 'sqrt', 'sqrt_', 'square', 'square_', 'squeeze', 'squeeze_copy', + 'sspaddmm', 'stack', 'std', 'std_mean', 'sub', 'subtract', 'sum', 'svd', 'svd', 'swapaxes', + 'swapdims', 'sym_constrain_range', 'sym_constrain_range_for_size', 't', 't_copy', 'take', + 'take_along_dim', 'tan', 'tan_', 'tanh', 'tanh_', 'tensor', 'tensor_split', 'threshold', + 'threshold_', 'tile', 'topk', 'topk', 'trace', 'transpose', 'transpose_copy', 'trapezoid', 'trapz', + 'triangular_solve', 'triangular_solve', 'tril', 'tril_indices', 'triplet_margin_loss', 'triu', + 'triu_indices', 'true_divide', 'trunc', 'trunc_', 'unbind', 'unbind_copy', 'unflatten', + 'unfold_copy', 'unique_dim', 'unsafe_chunk', 'unsafe_split', 'unsafe_split_with_sizes', + 'unsqueeze', 'unsqueeze_copy', 'values_copy', 'vander', 'var', 'var_mean', 'vdot', + 'view_as_complex', 'view_as_complex_copy', 'view_as_real', 'view_as_real_copy', 'view_copy', + 'vsplit', 'vstack', 'where', 'xlogy', 'xlogy_', 'zero_', 'zeros', 'zeros_like'] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__config__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__config__.py new file mode 100644 index 0000000000000000000000000000000000000000..f7e3e209654a8846ddc42d31220101340043c276 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__config__.py @@ -0,0 +1,22 @@ +import torch + + +def show(): + """ + Return a human-readable string with descriptions of the + configuration of PyTorch. + """ + return torch._C._show_config() + + +# TODO: In principle, we could provide more structured version/config +# information here. For now only CXX_FLAGS is exposed, as Timer +# uses them. +def _cxx_flags(): + """Returns the CXX_FLAGS used when building PyTorch.""" + return torch._C._cxx_flags() + + +def parallel_info(): + r"""Returns detailed string with parallelization settings""" + return torch._C._parallel_info() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_compile.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..354d64e9ff9fddc9a1dc321241ce8bea7955b58a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_compile.py @@ -0,0 +1,30 @@ +""" +APIs related to torch.compile which lazily import torch._dynamo to avoid +circular dependencies. +""" +import functools + + +def _disable_dynamo(fn=None, recursive=True): + """ + This API should be only used inside torch, external users should still use + torch._dynamo.disable. The main goal of this API is to avoid circular + imports issues that is common while using _dynamo.disable inside torch + itself. + + This API avoids it by lazily importing torch._dynamo from the import time to + the invocation of the decorated function. + """ + if fn is not None: + + @functools.wraps(fn) + def inner(*args, **kwargs): + import torch._dynamo + + return torch._dynamo.disable(fn, recursive)(*args, **kwargs) + + return inner + else: + # decorator usage like @_disable_dynamo(recursive=False). The resulting + # object expects the original decorated function as the arg. + return functools.partial(_disable_dynamo, recursive=recursive) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_jit_internal.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_jit_internal.py new file mode 100644 index 0000000000000000000000000000000000000000..64509816e09cf029bce0663287d94661c6c4585c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_jit_internal.py @@ -0,0 +1,1510 @@ +""" +The weak_script annotation needs to be here instead of inside torch/jit/ so it +can be used in other places in torch/ (namely torch.nn) without running into +circular dependency problems +""" + +import ast +import builtins +import collections +import contextlib +import enum +import inspect +import io +import pickle +import sys +import threading +import types +import typing +import warnings +import weakref +from textwrap import dedent +from typing import ( # noqa: F401 + Any, + Callable, + Dict, + Final, + ForwardRef, + Generic, + get_args, # new in 3.8 + get_origin, # new in 3.8 + List, + Optional, + Tuple, + Type, + TypeVar, + Union, +) + +import torch + +# This is needed. `torch._jit_internal` is imported before `torch.distributed.__init__`. +# Explicitly ask to import `torch.distributed.__init__` first. +# Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised. +import torch.distributed.rpc +import torch.package._mangling as package_mangling +from torch._awaits import _Await +from torch._C import _Await as CAwait, Future as CFuture +from torch._sources import fake_range, get_source_lines_and_file, parse_def +from torch.futures import Future + +IS_PY39_PLUS: Final[bool] = sys.version_info >= (3, 9) +IS_PY310_PLUS: Final[bool] = sys.version_info >= (3, 10) + +BuiltinUnionType: Union[Type, Tuple[Type, ...]] +if sys.version_info >= (3, 10): + # NOTE: IS_PY310_PLUS doesn't work with mypy. + # cf. https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks + BuiltinUnionType = types.UnionType +else: + BuiltinUnionType = () # trick: this makes isinstance short circuit. + +LockType: Type +try: + import _thread + + LockType = _thread.LockType +except ImportError: + import _dummy_thread # type: ignore[import-not-found] + + LockType = _dummy_thread.LockType + +# Wrapper functions that can call either of 2 functions depending on a boolean +# argument +boolean_dispatched: "weakref.WeakKeyDictionary[Callable, Dict[str, Callable]]" = ( + weakref.WeakKeyDictionary() +) # noqa: T484 + + +FAKE_FILENAME_PREFIX = "__torch_jit_dataclass" + + +class SourceLoader: + def __init__(self): + self.content = {} + + def cache(self, fn, source): + self.content[fn] = source + + def get_source(self, fn): + return self.content.get(fn) + + +loader = SourceLoader() + + +def createResolutionCallbackFromEnv(lookup_base): + """ + Creates a resolution callback that will look up qualified names in an + environment, starting with `lookup_base` for the base of any qualified + names, then proceeding down the lookup chain with the resolved object. + + You should not use this directly, it should only be used from the other + createResolutionCallbackFrom* functions. + """ + + def lookupInModule(qualified_name, module): + if "." in qualified_name: + base, remaining_pieces = qualified_name.split(".", maxsplit=1) + module_value = getattr(module, base) + return lookupInModule(remaining_pieces, module_value) + else: + return getattr(module, qualified_name) + + def parseNestedExpr(expr, module) -> Tuple[Any, int]: + i = 0 + while i < len(expr) and expr[i] not in (",", "[", "]"): + i += 1 + + # Special case logic for the empty Tuple as a subscript (used + # in the type annotation `Tuple[()]`) + if expr[:i] == "()": + return (), i + + base = lookupInModule(expr[:i].strip(), module) + assert base is not None, f"Unresolvable type {expr[:i]}" + if i == len(expr) or expr[i] != "[": + return base, i + + assert expr[i] == "[" + parts = [] + while expr[i] != "]": + part_len = 0 + i += 1 + part, part_len = parseNestedExpr(expr[i:], module) + parts.append(part) + i += part_len + if len(parts) > 1: + return base[tuple(parts)], i + 1 + else: + return base[parts[0]], i + 1 + + def parseExpr(expr, module): + try: + value, len_parsed = parseNestedExpr(expr, module) + assert len_parsed == len( + expr + ), "whole expression was not parsed, falling back to c++ parser" + return value + except Exception: + """ + The python resolver fails in several cases in known unit tests, and is intended + to fall back gracefully to the c++ resolver in general. For example, python 2 style + annotations which are frequent in our unit tests often fail with types e.g. int not + resolvable from the calling frame. + """ + return None + + return lambda expr: parseExpr(expr, lookup_base) + + +def createResolutionCallbackFromFrame(frames_up: int = 0): + """ + Creates a function which, given a string variable name, + returns the value of the variable in the scope of the caller of + the function which called createResolutionCallbackFromFrame (by default). + + This is used to enable access in-scope Python variables inside + TorchScript fragments. + + frames_up is number of additional frames to go up on the stack. + The default value is 0, which correspond to the frame of the caller + of createResolutionCallbackFromFrame. Also for example, if frames_up is set + to 1, then the frame of the caller's caller of createResolutionCallbackFromFrame + will be taken. + + For example, the following program prints 2:: + + def bar(): + cb = createResolutionCallbackFromFrame(1) + print(cb("foo")) + + def baz(): + foo = 2 + bar() + + baz() + """ + frame = inspect.currentframe() + i = 0 + while i < frames_up + 1: + assert frame is not None + frame = frame.f_back + i += 1 + + assert frame is not None + f_locals = frame.f_locals + f_globals = frame.f_globals + + class env: + def __getattr__(self, key): + if key in f_locals: + return f_locals[key] + elif key in f_globals: + return f_globals[key] + elif key in dir(builtins): + return getattr(builtins, key) + + return createResolutionCallbackFromEnv(env()) + + +def get_closure(fn): + """ + Get a dictionary of closed over variables from a function + """ + captures = {} + captures.update(fn.__globals__) + + for index, captured_name in enumerate(fn.__code__.co_freevars): + captures[captured_name] = fn.__closure__[index].cell_contents + + return captures + + +# [local resolution in python] +# Depending on where a variable is defined, and where it is used, we may +# or may not be able to recover its value when recursively compiling a +# script function. Remember in the general case, a module or function is +# first defined and then later scripted. This means we do not have a +# chance to capture the active frames when the function is defined. Hence any +# name resolution has to happen later on the created closure. The way +# python captures type annotations restricts what we can recover. The +# follow example illustrates the different cases: +# +# class MyGlobalClass: +# ... +# def my_local_scope(): +# @torch.jit.script +# class MyClass: +# ... +# @torch.jit.script +# class MyClassUsedAsVar: +# ... +# def eg(x: MyClass, y: MyGlobalClass): +# a_local_capture : Foo +# return MyClassUsedAsVar(x) +# +# MyGlobalClass is defined in the __globals__ dictionary of function +# 'eg', so it is always recoverable. my_local_scope introduces a new local +# variable scope in the function. Classes defined here are only visible as +# local variables. For the case of MyClassUsedAsVar, it is captured +# because it is used as a variable inside the body of the function, and we +# can resolve it using the captures returned from `get_closure`. However, +# the type annotations are not captured by the closure. In Python +# 3.0--3.9, the _value_ of MyClass and MyGlobalClass will be available as +# annotations on `eg``, but starting in Python 4.0, they will represented as +# strings and no longer present. Furthermore, since the body of `eg` does +# not reference those names, they do not appear in the list of closed over +# variables. In Python 2.x, type annotations are in comments, leading to a +# similar situation where their definitions are not available. We anticipate +# that most users will not run into this issue because their modules and +# functions will be defined at a global scope like MyGlobalClass. In cases +# where they are not, it is possible to work around issues by declaring the +# values global in the function. +# In Python 3.9 declaring class as global will make it invisible to +# `inspect.getsource`, see https://bugs.python.org/issue42666 . +# This could be worked around by manualy adding it to `global()` dictionary. + + +def createResolutionCallbackFromClosure(fn): + """ + Create a resolutionCallback by introspecting the function instead of + looking up the stack for the enclosing scope + """ + closure = get_closure(fn) + + class closure_lookup: + # This is a class since `closure` is a dict and it's easier in + # `env_helper` if everything just works with `getattr` calls + def __getattr__(self, key): + if key in closure: + return closure[key] + elif hasattr(typing, key): + return getattr(typing, key) + elif hasattr(builtins, key): + return getattr(builtins, key) + return None + + return createResolutionCallbackFromEnv(closure_lookup()) + + +def can_compile_class(cls) -> bool: + # If any of the functions on a type don't have a code object, this type can't + # be compiled and is probably a builtin / bound from C + if is_ignored_fn(cls): + return False + + # Ignore the following list of built-in classes. + ignored_builtin_classes = (torch.nn.Module, tuple, list, Exception) + if issubclass(cls, ignored_builtin_classes): + return False + + names = cls.__dict__ + fns = [ + getattr(cls, name) + for name in names + if inspect.isroutine(getattr(cls, name, None)) + ] + has_code = [hasattr(fn, "__code__") for fn in fns] + return all(has_code) + + +def get_callable_argument_names(fn) -> List[str]: + """ + Gets names of all POSITIONAL_OR_KEYWORD arguments for callable `fn`. + Returns an empty list when other types of arguments are present. + + This is used by `torch.jit.trace` to assign meaningful argument names to + traced functions and modules. + + Args: + fn: A callable. + Returns: + Argument names: List[str] + """ + # inspect.signature may fail, give up in that case. + try: + callable_signature = inspect.signature(fn) + except Exception: + return [] + + argument_names = [] + for name, param in callable_signature.parameters.items(): + # All four other types of arguments do not map to individual values + # with a keyword as name. + if not param.kind == param.POSITIONAL_OR_KEYWORD: + continue + + argument_names.append(name) + + return argument_names + + +def get_annotation_str(annotation): + """ + Convert an AST node containing a type annotation to the string present in the source + that represents the same annotation. + """ + if isinstance(annotation, ast.Name): + return annotation.id + elif isinstance(annotation, ast.Attribute): + return ".".join([get_annotation_str(annotation.value), annotation.attr]) + elif isinstance(annotation, ast.Subscript): + # In Python3.9+ subscript indicies are not wrapped in ast.Index + subscript_slice = annotation.slice if IS_PY39_PLUS else annotation.slice.value # type: ignore[attr-defined] + return f"{get_annotation_str(annotation.value)}[{get_annotation_str(subscript_slice)}]" + elif isinstance(annotation, ast.Tuple): + return ",".join([get_annotation_str(elt) for elt in annotation.elts]) + elif isinstance(annotation, (ast.Constant, ast.NameConstant)): + return f"{annotation.value}" + + # If an AST node is not handled here, it's probably handled in ScriptTypeParser. + return None + + +def get_type_hint_captures(fn): + """ + Get a dictionary containing type resolution mappings necessary to resolve types + for the literal annotations on 'fn'. These are not considered to be closed-over by fn + and must be obtained separately (e.g. using this function). + + Args: + fn: A callable. + Returns: + A Dict[str, Any] containing a mapping from the literal annotations used on + fn to the Python objects they refer to. + """ + # First, try to get the source of the function. We'll need to parse it to find the actual string names + # that were used to annotate the types, since inspect.signature() will only return the class object that + # the annotation refers to, not the string name. If we can't get the source, simply return an empty dict. + # This may happen in cases where the function is synthesized dynamically at runtime. + src = loader.get_source(fn) + if src is None: + src = inspect.getsource(fn) + + # Gather a dictionary of parameter name -> type, skipping any parameters whose annotated + # types are strings. These are only understood by TorchScript in the context of a type annotation + # that refers to a class in its own definition, but trying to include a mapping for this in the result + # function would cause infinite recursion because the class is currently being compiled. + # In addition, there is logic in ScriptTypeParser to handle this. + signature = inspect.signature(fn) + name_to_type = { + name: parameter.annotation + for name, parameter in signature.parameters.items() + if parameter.annotation is not inspect.Parameter.empty + and not isinstance(parameter.annotation, str) + } + + # Then, get the literal type annotations from the function declaration + # by source inspection. This accounts for the case in which aliases are used + # to annotate the arguments (e.g device_t = torch.device, and then d: device_t). + # frontend.py cannot be used here because it includes _jit_internal, so use ast instead. + a = ast.parse(dedent(src)) + if len(a.body) != 1 or not isinstance(a.body[0], ast.FunctionDef): + raise RuntimeError(f"Expected {fn} to be a function") + f = a.body[0] + + # Prepare a dictionary of source annotation -> type, which will be the final result of this function, + # by using the parsed AST (f) to reconstruct source annotations as strings for each parameter and mapping + # them to the type object corresponding to the annotation via name_to_type using the parameter name. + annotation_to_type = {} + + for arg in f.args.args: + # Get the source type annotation string for this argument if possible. + arg_annotation_str = ( + get_annotation_str(arg.annotation) if arg.annotation else None + ) + + # If the argument has no annotation or get_annotation_str cannot convert it to a string, + # arg_annotation_str will be None. Skip this arg; ScriptTypeParser will probably handle + # this in the latter case. + if arg_annotation_str is None: + continue + + # Insert {arg_annotation_str: type} into annotation_to_type if possible. One reason arg_name may not + # be present in name_to_type is that the annotation itself is a string and not a type object + # (common for self-refential annotations in classes). Once again, let ScriptTypeParser handle this. + arg_name = arg.arg + if arg_name in name_to_type: + annotation_to_type[arg_annotation_str] = name_to_type[arg_name] + + # If there is a valid return annotation, include it in annotation_to_type. As with argument annotations, + # the literal annotation has to be convertible to a string by get_annotation_str, and the actual type + # of the annotation cannot be a string. + literal_return_annotation = get_annotation_str(f.returns) + valid_literal_annotation = literal_return_annotation is not None + return_annotation = signature.return_annotation + valid_return_annotation_type = ( + return_annotation is not inspect.Parameter.empty + and not isinstance(return_annotation, str) + ) + if valid_literal_annotation and valid_return_annotation_type: + annotation_to_type[literal_return_annotation] = return_annotation + + return annotation_to_type + + +def createResolutionCallbackForClassMethods(cls): + """ + This looks at all the methods defined in a class and pulls their closed-over + variables into a dictionary and uses that to resolve variables. + """ + # cls is a type here, so `ismethod` is false since the methods on the type + # aren't bound to anything, so Python treats them as regular functions + fns = [ + getattr(cls, name) + for name in cls.__dict__ + if inspect.isroutine(getattr(cls, name)) + ] + # Skip built-ins, as they do not have global scope nor type hints + # Needed to support `enum.Enum` derived classes in Python-3.11 + # That adds `_new_member_` property which is an alias to `__new__` + fns = [fn for fn in fns if not inspect.isbuiltin(fn) and hasattr(fn, "__globals__")] + captures = {} + + for fn in fns: + captures.update(get_closure(fn)) + captures.update(get_type_hint_captures(fn)) + + def lookup_in_class(key): + if key in captures: + return captures[key] + else: + return getattr(builtins, key, None) + + return lookup_in_class + + +def boolean_dispatch( + arg_name, arg_index, default, if_true, if_false, module_name, func_name +): + """ + Dispatches to either of 2 script functions based on a boolean argument. + In TorchScript, the boolean argument must be constant so that the correct + function to use can be determined at compile time. + """ + + def fn(*args, **kwargs): + dispatch_flag = default + if arg_name in kwargs: + dispatch_flag = kwargs[arg_name] + elif arg_index < len(args): + dispatch_flag = args[arg_index] + + if dispatch_flag: + return if_true(*args, **kwargs) + else: + return if_false(*args, **kwargs) + + if if_true.__doc__ is None and if_false.__doc__ is not None: + doc = if_false.__doc__ + if_true.__doc__ = doc + elif if_false.__doc__ is None and if_true.__doc__ is not None: + doc = if_true.__doc__ + if_false.__doc__ = doc + elif if_false.__doc__ is None and if_true.__doc__ is None: + # neither function has a docstring + doc = None + else: + raise RuntimeError("only one function can have a docstring") + fn.__doc__ = doc + + if module_name is not None: + fn.__module__ = module_name + if func_name is not None: + fn.__name__ = func_name + + boolean_dispatched[fn] = { + "if_true": if_true, + "if_false": if_false, + "index": arg_index, + "default": default, + "arg_name": arg_name, + } + return fn + + +class FunctionModifiers: + """ + Used to denote the behavior of a function in TorchScript. See export() and + ignore() for details. + """ + + UNUSED = "unused (ignored and replaced with raising of an exception)" + IGNORE = "ignore (leave as a call to Python, cannot be torch.jit.save'd)" + EXPORT = "export (compile this function even if nothing calls it)" + DEFAULT = "default (compile if called from a exported function / forward)" + COPY_TO_SCRIPT_WRAPPER = ( + "if this method is not scripted, copy the python method onto the scripted model" + ) + _DROP = "_drop (function is fully ignored, declaration can be unscriptable)" + + +def export(fn): + """ + This decorator indicates that a method on an ``nn.Module`` is used as an entry point into a + :class:`ScriptModule` and should be compiled. + + ``forward`` implicitly is assumed to be an entry point, so it does not need this decorator. + Functions and methods called from ``forward`` are compiled as they are seen + by the compiler, so they do not need this decorator either. + + Example (using ``@torch.jit.export`` on a method): + + .. testcode:: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + def implicitly_compiled_method(self, x): + return x + 99 + + # `forward` is implicitly decorated with `@torch.jit.export`, + # so adding it here would have no effect + def forward(self, x): + return x + 10 + + @torch.jit.export + def another_forward(self, x): + # When the compiler sees this call, it will compile + # `implicitly_compiled_method` + return self.implicitly_compiled_method(x) + + def unused_method(self, x): + return x - 20 + + # `m` will contain compiled methods: + # `forward` + # `another_forward` + # `implicitly_compiled_method` + # `unused_method` will not be compiled since it was not called from + # any compiled methods and wasn't decorated with `@torch.jit.export` + m = torch.jit.script(MyModule()) + """ + fn._torchscript_modifier = FunctionModifiers.EXPORT + return fn + + +def unused(fn): + """ + This decorator indicates to the compiler that a function or method should + be ignored and replaced with the raising of an exception. This allows you + to leave code in your model that is not yet TorchScript compatible and still + export your model. + + Example (using ``@torch.jit.unused`` on a method):: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + def __init__(self, use_memory_efficient): + super().__init__() + self.use_memory_efficient = use_memory_efficient + + @torch.jit.unused + def memory_efficient(self, x): + import pdb + pdb.set_trace() + return x + 10 + + def forward(self, x): + # Use not-yet-scriptable memory efficient mode + if self.use_memory_efficient: + return self.memory_efficient(x) + else: + return x + 10 + + m = torch.jit.script(MyModule(use_memory_efficient=False)) + m.save("m.pt") + + m = torch.jit.script(MyModule(use_memory_efficient=True)) + # exception raised + m(torch.rand(100)) + """ + if isinstance(fn, property): + prop = fn + setattr( # noqa: B010 + prop.fget, "_torchscript_modifier", FunctionModifiers.UNUSED + ) + + if prop.fset: + setattr( # noqa: B010 + prop.fset, "_torchscript_modifier", FunctionModifiers.UNUSED + ) + + return prop + + fn._torchscript_modifier = FunctionModifiers.UNUSED + return fn + + +# No op context manager from python side +class _IgnoreContextManager(contextlib.AbstractContextManager): + def __init__(self, **kwargs): + pass + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + pass + + +def ignore(drop=False, **kwargs): + """ + This decorator indicates to the compiler that a function or method should + be ignored and left as a Python function. This allows you to leave code in + your model that is not yet TorchScript compatible. If called from TorchScript, + ignored functions will dispatch the call to the Python interpreter. Models with ignored + functions cannot be exported; use :func:`@torch.jit.unused ` instead. + + Example (using ``@torch.jit.ignore`` on a method):: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + @torch.jit.ignore + def debugger(self, x): + import pdb + pdb.set_trace() + + def forward(self, x): + x += 10 + # The compiler would normally try to compile `debugger`, + # but since it is `@ignore`d, it will be left as a call + # to Python + self.debugger(x) + return x + + m = torch.jit.script(MyModule()) + + # Error! The call `debugger` cannot be saved since it calls into Python + m.save("m.pt") + + Example (using ``@torch.jit.ignore(drop=True)`` on a method): + + .. testcode:: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + @torch.jit.ignore(drop=True) + def training_method(self, x): + import pdb + pdb.set_trace() + + def forward(self, x): + if self.training: + self.training_method(x) + return x + + m = torch.jit.script(MyModule()) + + # This is OK since `training_method` is not saved, the call is replaced + # with a `raise`. + m.save("m.pt") + + .. testcleanup:: + + import os + os.remove('m.pt') + """ + + if callable(drop): + # used without any args, so drop is actually a function + # @torch.jit.ignore + # def fn(...): + fn = drop + fn._torchscript_modifier = FunctionModifiers.IGNORE + return fn + + if not isinstance(drop, bool): + raise RuntimeError( + "Argument to @torch.jit.ignore must be a bool or " + f"a function but got {drop}" + ) + + # for backwards compat + drop_on_export = kwargs.pop("drop_on_export", None) + if drop_on_export: + warnings.warn( + "ignore(drop_on_export=True) has been deprecated. TorchScript will now drop the function " + "call on compilation. Use torch.jit.unused now. {}", + category=FutureWarning, + ) + + drop = drop_on_export + elif drop: + warnings.warn( + "ignore(True) has been deprecated. TorchScript will now drop the function " + "call on compilation. Use torch.jit.unused now. {}", + category=FutureWarning, + ) + + def decorator(fn): + if drop: + fn._torchscript_modifier = FunctionModifiers.UNUSED + else: + fn._torchscript_modifier = FunctionModifiers.IGNORE + return fn + + return decorator + + +def _drop(fn): + fn._torchscript_modifier = FunctionModifiers._DROP + return fn + + +def _copy_to_script_wrapper(fn): + fn._torchscript_modifier = FunctionModifiers.COPY_TO_SCRIPT_WRAPPER + return fn + + +def module_has_exports(mod): + for name in dir(mod): + if hasattr(mod, name): + item = getattr(mod, name) + if callable(item): + if get_torchscript_modifier(item) is FunctionModifiers.EXPORT: + return True + return False + + +# WARNING: should_drop is currently being used by our JIT code coverage plug-in to mark JIT'd code as covered. If you +# rename this function, please update references in tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py to +# allow JIT'd code to still be covered. +def should_drop(fn) -> bool: + attr = get_torchscript_modifier(fn) + if attr is None: + return False + return attr is FunctionModifiers.UNUSED or attr is FunctionModifiers._DROP + + +def is_ignored_fn(fn) -> bool: + mod = get_torchscript_modifier(fn) + return ( + mod is FunctionModifiers.UNUSED + or mod is FunctionModifiers.IGNORE + or mod is FunctionModifiers._DROP + ) + + +def _is_drop_fn(fn) -> bool: + mod = get_torchscript_modifier(fn) + return mod is FunctionModifiers._DROP + + +def is_static_fn(cls, fn) -> bool: + return isinstance(inspect.getattr_static(cls, fn, default=None), staticmethod) + + +def get_static_fn(cls, fn): + return inspect.getattr_static(cls, fn).__func__ + + +def get_torchscript_modifier(fn): + if not callable(fn): + return None + if hasattr(fn, "__func__"): + fn = fn.__func__ + return getattr(fn, "_torchscript_modifier", FunctionModifiers.DEFAULT) + + +def copy_torchscript_modifier(orig, new) -> None: + attr = get_torchscript_modifier(orig) + if attr is None: + return + new._torchscript_modifier = attr + + +# overloading registration +# overloads get registered in this file, and compiled in torch/jit/__init__.py +# so that they can be imported in nn/functional.py without an import cycle + +# qualified_name => list[overload_functions] +_overloaded_fns: Dict[str, List[Callable]] = {} # noqa: T484 + + +_OVERLOAD_EXAMPLE = """ +Example usage of overload function: +@torch.jit._overload +def my_function(x: type0) -> type0: # decl 1 + pass + +@torch.jit._overload +def my_function(x: type1) -> type1: # decl 2 + pass + +def my_function(x): # implementation + if isinstance(x, type0): + return x + elif isinstance(x, type1): + return x +""" + + +def get_overload_no_implementation_error_message(kind, obj): + sourcelines, file_lineno, filename = get_source_lines_and_file(obj) + return ( + f'Implementation for the {kind} "{_qualified_name(obj)}" is missing. Please make ' + f"sure a definition is provided and defined after all overload declarations.\n" + f'File "{filename}", line {file_lineno}:\n' + + "".join(sourcelines) + + "\n" + + _OVERLOAD_EXAMPLE + ) + + +def _check_overload_body(func): + try: + parsed_def = parse_def(func) + except OSError as e: + # Parsing the function definition can raise an OSError if source is unavailable. + # Since this is just an initial check, just raise a warning if this is the case. + warnings.warn( + f"Unable to retrieve source for @torch.jit._overload function: {func}." + ) + return + + body = parsed_def.ast.body[0].body + + def is_pass(x): + return isinstance(x, ast.Pass) + + def is_ellipsis(x): + return isinstance(x, ast.Expr) and isinstance(x.value, ast.Ellipsis) + + if len(body) != 1 or not (is_pass(body[0]) or is_ellipsis(body[0])): + msg = ( + "Only `pass` statement or `...` can be the body of overload declaration:\n" + ) + msg += "\n".join(parsed_def.source.split("\n")[:3]) + msg += " <- Expecting `pass` or `...` here!\n" + _OVERLOAD_EXAMPLE + raise RuntimeError(msg) + + +def _overload(func): + _check_overload_body(func) + qual_name = _qualified_name(func) + global _overloaded_fns + fn_overload_list = _overloaded_fns.get(qual_name) + if fn_overload_list is None: + fn_overload_list = [] + _overloaded_fns[qual_name] = fn_overload_list + fn_overload_list.append(func) + return func + + +def _get_fn_overloads(qual_name): + return _overloaded_fns.get(qual_name) + + +def _clear_fn_overloads(qual_name) -> None: + del _overloaded_fns[qual_name] + + +def get_class_name_lineno(method) -> Tuple[str, int]: + current_frame = inspect.currentframe() + + # one for the get_class_name call, one for _overload_method call + for i in range(2): + assert ( + current_frame is not None + ) # assert current frame is not an Optional[FrameType] + current_frame = current_frame.f_back + + assert current_frame is not None # same here + class_name = current_frame.f_code.co_name + line_no = current_frame.f_code.co_firstlineno + return class_name, line_no + + +# At the point the decorator is applied to class methods the method +# has no reference to its owning class. _qualified_name would not include +# the class it is defined in, so any methods with the same name in the same file +# would have the same _qualified_name, even if they were defined in different +# classes. This problem only exists in python 2. +# We get around this problem by looking at the stack frame and identifying +# the class name, and throwing an error whenever overloads are used +# when modules of the same name are in the same file + +# qualified_name => class name => list[overload_functions] +_overloaded_methods: Dict[str, Dict[str, List[Callable]]] = {} # noqa: T484 + + +# (qualified_name, class name) => class_fileno +_overloaded_method_class_fileno: Dict[Tuple[str, str], int] = {} + + +def _overload_method(func): + _check_overload_body(func) + qual_name = _qualified_name(func) + global _overloaded_methods + class_name_map = _overloaded_methods.get(qual_name, None) + if class_name_map is None: + class_name_map = {} + _overloaded_methods[qual_name] = class_name_map + + class_name, line_no = get_class_name_lineno(func) + method_overloads = class_name_map.get(class_name, None) + if method_overloads is None: + method_overloads = [] + class_name_map[class_name] = method_overloads + _overloaded_method_class_fileno[(qual_name, class_name)] = line_no + else: + existing_lineno = _overloaded_method_class_fileno[(qual_name, class_name)] + if existing_lineno != line_no: + raise RuntimeError( + "Cannot currently overload the same method name in two different" + " classes with the same name in the same module" + ) + + method_overloads.append(func) + return func + + +def _get_overloaded_methods(method, mod_class): + # TODO: __name__ not set for submodules in recursive script + if not hasattr(method, "__name__"): + return None + qual_name = _qualified_name(method) + class_name_map = _overloaded_methods.get(qual_name, None) + if class_name_map is None: + return None + overloads = class_name_map.get(mod_class.__name__, None) + if overloads is None: + return None + + method_line_no = get_source_lines_and_file(method)[1] + mod_class_fileno = get_source_lines_and_file(mod_class)[1] + mod_end_fileno = mod_class_fileno + len(get_source_lines_and_file(mod_class)[0]) + if not (method_line_no >= mod_class_fileno and method_line_no <= mod_end_fileno): + raise Exception( + "Overloads are not useable when a module is redeclared within the same file: " + + str(method) + ) + return overloads + + +def is_tuple(ann) -> bool: + if ann is Tuple: + raise_error_container_parameter_missing("Tuple") + + # For some reason Python 3.7 violates the Type[A, B].__origin__ == Type rule + if not hasattr(ann, "__module__"): + return False + + ann_origin = get_origin(ann) + if IS_PY39_PLUS and ann.__module__ == "builtins" and ann_origin is tuple: + return True + return ann.__module__ == "typing" and (ann_origin is Tuple or ann_origin is tuple) + + +def is_list(ann) -> bool: + if ann is List: + raise_error_container_parameter_missing("List") + + if not hasattr(ann, "__module__"): + return False + + ann_origin = get_origin(ann) + if IS_PY39_PLUS and ann.__module__ == "builtins" and ann_origin is list: + return True + return ann.__module__ == "typing" and (ann_origin is List or ann_origin is list) + + +def is_dict(ann) -> bool: + if ann is Dict: + raise_error_container_parameter_missing("Dict") + + if not hasattr(ann, "__module__"): + return False + + ann_origin = get_origin(ann) + if IS_PY39_PLUS and ann.__module__ == "builtins" and ann_origin is dict: + return True + return ann.__module__ == "typing" and (ann_origin is Dict or ann_origin is dict) + + +def is_union(ann): + if ann is Union: + raise_error_container_parameter_missing("Union") + + return isinstance(ann, BuiltinUnionType) or ( + hasattr(ann, "__module__") + and ann.__module__ == "typing" + and (get_origin(ann) is Union) + ) + + +def is_optional(ann): + if ann is Optional: + raise_error_container_parameter_missing("Optional") + + def is_optional_as_optional(ann): + return ( + hasattr(ann, "__module__") + and ann.__module__ == "typing" + and (get_origin(ann) is Optional) + ) + + def is_union_as_optional(ann): + ann_args = get_args(ann) + return len(ann_args) == 2 and (None in ann_args or type(None) in ann_args) + + return is_optional_as_optional(ann) or (is_union(ann) and is_union_as_optional(ann)) + + +def is_future(ann) -> bool: + if ann is Future: + raise RuntimeError( + "Attempted to use Future without a " + "contained type. Please add a contained type, e.g. " + "Future[int]" + ) + return get_origin(ann) is Future + + +def is_await(ann) -> bool: + if ann is _Await: + return True + return get_origin(ann) is _Await + + +if torch.distributed.rpc.is_available(): + from torch._C._distributed_rpc import PyRRef + from torch.distributed.rpc import RRef + + def is_rref(ann) -> bool: + if ann is RRef: + raise RuntimeError( + "Attempted to use RRef without a " + "contained type. Please add a contained type, e.g. " + "RRef[int]" + ) + return get_origin(ann) is RRef + + def is_rref_instance(obj) -> bool: + return isinstance(obj, PyRRef) + +else: + + def is_rref_instance(obj) -> bool: + # If the RPC module doesn't exist then RRefs don't exist either. + return False + + +def is_final(ann) -> bool: + return ( + hasattr(ann, "__module__") + and ann.__module__ in {"typing", "typing_extensions"} + and (get_origin(ann) is Final or isinstance(ann, type(Final))) + ) + + +# allows BroadcastingList instance to be subscriptable +class BroadcastingListCls: + def __getitem__(self, types): + return + + +# mypy doesn't support parameters on types, so we have to explicitly type each +# list size +BroadcastingList1 = BroadcastingListCls() +for i in range(2, 7): + globals()[f"BroadcastingList{i}"] = BroadcastingList1 + + +def is_scripting() -> bool: + r""" + Function that returns True when in compilation and False otherwise. This + is useful especially with the @unused decorator to leave code in your + model that is not yet TorchScript compatible. + .. testcode:: + + import torch + + @torch.jit.unused + def unsupported_linear_op(x): + return x + + def linear(x): + if torch.jit.is_scripting(): + return torch.linear(x) + else: + return unsupported_linear_op(x) + """ + return False + + +# Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. +def _qualified_name(obj, mangle_name=True) -> str: + # This special case allows us to override the qualified name on a type. + # It's currently used in conjunction with tracing, where we create a + # fake module to filter only supported attributes. However, since this + # new type is defined as a local class, we need a mechanism to override + # its qualname so it appears correctly in the TorchScript system. This, + # we set '_jit_override_qualname' with the original traced module's + # qualified name, which is picked up here + if hasattr(obj, "_jit_override_qualname"): + return obj._jit_override_qualname + # short-circuit in cases where the object already has a known qualified name + if isinstance(obj, torch._C.ScriptFunction): + return obj.qualified_name + + if getattr(obj, "__name__", None): + name = obj.__name__ + # Enum classes do not have `__name__` attr, instead they have `name`. + elif isinstance(obj, enum.Enum): + name = obj.name + else: + raise RuntimeError("Could not get name of python class object") + + if name == "": + name = "_lambda" # make name a valid identifier + + module_name = obj.__module__ + + # If the module is actually a torchbind module, then we should short circuit + if module_name == "torch._classes": + return obj.qualified_name + + # The Python docs are very clear that `__module__` can be None, but I can't + # figure out when it actually would be. + if module_name is None: + raise RuntimeError( + f"Could not get qualified name for class '{name}': " + "__module__ can't be None." + ) + + # if getattr(sys.modules[module_name], name) is not obj: + # raise RuntimeError(f"Could not get qualified name for class '{name}': " + # f"the attr {name} on module {module_name} is not the class") + + # torch.package and TorchScript have separate mangling schemes to avoid + # name collisions from multiple packages. To avoid them interfering with + # each other, normalize the package manging here. + if package_mangling.is_mangled(module_name): + module_name = module_name.replace("<", "_") + module_name = module_name.replace(">", "_") + + # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h + # does not need mangle the python class name. + if mangle_name: + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name + + if "." in name: + raise RuntimeError( + f"Could not get qualified name for class '{name}': " + f"'{name}' is not a valid identifier" + ) + + return module_name + "." + name + + +def _try_get_dispatched_fn(fn): + if not callable(fn): + return None + return boolean_dispatched.get(fn) + + +def _get_named_tuple_properties( + obj, loc: Optional[torch._C._jit_tree_views.SourceRange] = None, rcb=None +): + if loc is None: + loc = fake_range() + + assert issubclass(obj, tuple) and hasattr(obj, "_fields") + if hasattr(obj, "_field_defaults"): + defaults = [ + obj._field_defaults[field] + for field in obj._fields + if field in obj._field_defaults + ] + else: + defaults = [] + # In 3.10 recommended way to get annotations is to call `inspect.get_annotations` function + # Also, annotations from base class are not inherited so they need to be queried explicitly + if sys.version_info[:2] < (3, 10): + obj_annotations = getattr(obj, "__annotations__", {}) + else: + obj_annotations = inspect.get_annotations(obj) + if len(obj_annotations) == 0 and hasattr(obj, "__base__"): + obj_annotations = inspect.get_annotations(obj.__base__) + + annotations = [] + for field in obj._fields: + if field in obj_annotations: + field_type = obj_annotations[field] + # [Note: ForwardRef annotations in NamedTuple attributes] + # NamedTuple types are slightly different from normal types. + # + # Normally, annotations are evaluted like this (during jit.script): + # 1. Load strings of python code into c++ and parse. + # 2. Get annotations as strings + # 3. Use the PythonResolver's resolution callback (rcb) to convert + # the string into a python object + # 4. We call into annotations.py:ann_to_type to convert python obj + # from step 3 into a type that torchscript understands. + # + # NamedTuples are more complicated, because it has sub-types. + # Normally, once we have the NamedTuple type object from #3, + # we can just look at the annotation literal values and use + # ann_to_type directly on them. + # + # But sometimes, users will annotate with string literals, e.g. + # x: 'int' + # This also happens with PEP563 (from __forward__ import annotations) + # + # These annotations appear in the annotation dict as ForwardRef('int'). + # + # Then, we need to convert the string into a python object. This + # requires having local context for custom objects or imported types. + # rcb() is what gives us this. So, we plumb rcb through the stack so + # it can be used in this context for the if block below. + # + # FAQ: + # - Why do we need this special handling for NamedTuple but string + # annotations work fine for normal types? Normally, we parse the + # string directly and then call rcb() directly from C++. + # - Why not use ForwardRef._evaluate? For that, we need globals() + # and locals() for the local context where the NamedTuple was defined. + # rcb is what lets us look up into these. So, basically rcb does the + # hard work for us. + if isinstance(field_type, ForwardRef) and rcb is not None: + rcb_type = rcb(field_type.__forward_arg__) + # rcb returns None if it can't find anything. + if rcb_type is None: + raise ValueError( + f"Unknown type annotation: '{field_type}' in NamedTuple {obj.__name__}." + f" Likely due to partial support for ForwardRef parameters in NamedTuples, see #95858." + f" Issue occurred at {loc.highlight()}" + ) + field_type = rcb_type + the_type = torch.jit.annotations.ann_to_type(field_type, loc, rcb) + annotations.append(the_type) + else: + annotations.append(torch._C.TensorType.getInferred()) + return type(obj).__name__, obj._fields, annotations, defaults + + +def _create_named_tuple( + t, unqual_name: str, field_names: List[str], defaults: Tuple[Any, ...] +): + TupleType = collections.namedtuple(unqual_name, field_names, defaults=defaults) # type: ignore[call-arg, no-redef, misc] + return TupleType(*t) + + +@contextlib.contextmanager +def _disable_emit_hooks(): + hooks = torch._C._jit_get_emit_hooks() + torch._C._jit_set_emit_hooks(None, None) + try: + yield + finally: + torch._C._jit_set_emit_hooks(hooks[0], hooks[1]) + + +def _disable_emit_hooks_decorator(_DecoratorContextManager) -> None: # noqa: F811 + def __enter__(self) -> None: + self.hooks = torch._C._jit_get_emit_hooks() + torch._C._jit_set_emit_hooks(None, None) + + def __exit__(self, *args) -> None: + torch._C._jit_set_emit_hooks(self.hooks[0], self.hooks[1]) + + +def _is_exception(obj) -> bool: + if not inspect.isclass(obj): + return False + return issubclass(obj, Exception) + + +def raise_error_container_parameter_missing(target_type) -> None: + if target_type == "Dict": + raise RuntimeError( + "Attempted to use Dict without " + "contained types. Please add contained type, e.g. " + "Dict[int, int]" + ) + raise RuntimeError( + f"Attempted to use {target_type} without a " + "contained type. Please add a contained type, e.g. " + f"{target_type}[int]" + ) + + +def check_args_exist(target_type) -> None: + if target_type is List or target_type is list: + raise_error_container_parameter_missing("List") + elif target_type is Tuple or target_type is tuple: + raise_error_container_parameter_missing("Tuple") + elif target_type is Dict or target_type is dict: + raise_error_container_parameter_missing("Dict") + elif target_type is None or target_type is Optional: + raise_error_container_parameter_missing("Optional") + + +def check_empty_containers(obj) -> None: + if obj == [] or obj == {} or obj == (): + warnings.warn( + "The inner type of a container is lost when " + "calling torch.jit.isinstance in eager mode. For " + "example, List[int] would become list and " + "therefore falsely return True for List[float] or" + " List[str]." + ) + + +# supports List/Dict/Tuple and Optional types +# TODO support future +def container_checker(obj, target_type) -> bool: + origin_type = get_origin(target_type) + check_args_exist(target_type) + if origin_type is None: + return False + elif origin_type is list or origin_type is List: + check_empty_containers(obj) + if not isinstance(obj, list): + return False + arg_type = get_args(target_type)[0] + arg_origin = get_origin(arg_type) + for el in obj: + # check if nested container, ex: List[List[str]] + if arg_origin: # processes nested container, ex: List[List[str]] + if not container_checker(el, arg_type): + return False + elif not isinstance(el, arg_type): + return False + return True + elif origin_type is Dict or origin_type is dict: + check_empty_containers(obj) + if not isinstance(obj, dict): + return False + key_type = get_args(target_type)[0] + val_type = get_args(target_type)[1] + for key, val in obj.items(): + # check if keys are of right type + if not isinstance(key, key_type): + return False + val_origin = get_origin(val_type) + if val_origin: + if not container_checker(val, val_type): + return False + elif not isinstance(val, val_type): + return False + return True + elif origin_type is Tuple or origin_type is tuple: + check_empty_containers(obj) + if not isinstance(obj, tuple): + return False + arg_types = get_args(target_type) + if len(obj) != len(arg_types): + return False + for el, el_type in zip(obj, arg_types): + el_origin = get_origin(el_type) + if el_origin: + if not container_checker(el, el_type): + return False + elif not isinstance(el, el_type): + return False + return True + elif origin_type is Union or issubclass( + origin_type, BuiltinUnionType + ): # also handles Optional + if obj is None: # check before recursion because None is always fine + return True + inner_types = get_args(target_type) + for t in inner_types: + t_origin = get_origin(t) + if t_origin: + return container_checker(obj, t) + elif isinstance(obj, t): + return True + return False + + +def _isinstance(obj, target_type) -> bool: + if isinstance(target_type, collections.abc.Container): + if not isinstance(target_type, tuple): + raise RuntimeError( + "The second argument to " + "`torch.jit.isinstance` must be a type " + "or a tuple of types" + ) + for t_type in target_type: + if _isinstance(obj, t_type): + return True + return False + + origin_type = get_origin(target_type) + if origin_type: + return container_checker(obj, target_type) + + # Check to handle non-typed optional origin returns as none instead + # of as optional in 3.7-3.8 + check_args_exist(target_type) + + # handle non-containers + return isinstance(obj, target_type) + + +class _TensorExtractor(pickle.Pickler): + def __init__(self, *args, tensors: List[torch.Tensor], **kwargs): + super().__init__(*args, **kwargs) + self.tensors = tensors + + def persistent_id(self, obj): + if isinstance(obj, torch.Tensor): + self.tensors.append(obj) + return "" + # Since we just want to extract tensors, we don't mind if an object is + # unpicklable if it doesn't contain tensors, as we can just ignore/skip + # it. To play it safe, we only do so for common objects that we're sure + # don't contain tensors. Feel free to add new types here. Note also that + # even if a type isn't listed here this won't block users, since thet + # can just add a __getstate__ or __reduce__ method to their class. + if isinstance(obj, LockType): + return "" + # Futures and RRefs don't technically contain a value, they just offer + # the means to access a value. + if isinstance(obj, CFuture) or is_rref_instance(obj): + return "" + if isinstance(obj, CAwait): + return "" + if isinstance(obj, torch.cuda.Event): + return "" + if isinstance(obj, threading.Thread): + return "" + return None + + +def _extract_tensors(obj): + r""" + This function is exclusively called from C++. + See ``torch/csrc/jit/python/python_ivalue.h``. + + It extracts the tensors contained in the given object, through pickling. + """ + tensors: List[torch.Tensor] = [] + extractor = _TensorExtractor(io.BytesIO(), protocol=-1, tensors=tensors) + extractor.dump(obj) + return tensors + + +# In Python-3.11+ typed enums (i.e. IntEnum for example) retain number of base class methods in subclass +# that were previously dropped. To preserve the behavior, explicitly drop them there + +if sys.version_info > (3, 10): + _drop(enum.Enum.__new__) + _drop(enum.Enum.__format__) + _drop(enum.Enum.__repr__) + _drop(enum.Enum.__str__) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_meta_registrations.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_meta_registrations.py new file mode 100644 index 0000000000000000000000000000000000000000..e63e990d95141a64c7e8bd753b59a2d3219d9c2b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_meta_registrations.py @@ -0,0 +1,6253 @@ +import math +from enum import Enum +from functools import partial +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch._prims_common as utils +from torch import SymBool, SymFloat, Tensor +from torch._decomp import ( + _add_op_to_registry, + _convert_out_params, + global_decomposition_table, + meta_table, +) +from torch._ops import OpOverload +from torch._prims import _prim_elementwise_meta, ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND +from torch._prims_common import ( + corresponding_complex_dtype, + corresponding_real_dtype, + elementwise_dtypes, + ELEMENTWISE_TYPE_PROMOTION_KIND, + IntLike, + make_contiguous_strides_for, + TensorLike, +) + +from torch._prims_common.wrappers import ( + _maybe_convert_to_dtype, + _maybe_resize_out, + _resize_output_check, + _safe_copy_out, + out_wrapper, +) +from torch._refs import _broadcast_shapes, _maybe_broadcast +from torch.utils import _pytree as pytree + + +aten = torch.ops.aten + +_meta_lib_dont_use_me_use_register_meta = torch.library.Library("aten", "IMPL", "Meta") + + +def register_meta(op): + def wrapper(fn): + fn = _convert_out_params(fn) + + def register(op): + _add_op_to_registry(meta_table, op, fn) + + pytree.tree_map_(register, op) + return fn + + return wrapper + + +def elementwise_meta( + *args, + type_promotion: ELEMENTWISE_TYPE_PROMOTION_KIND, +): + # Perform type promotion, as this is expected from prim_metafunction + _, result_dtype = utils.elementwise_dtypes( + *args, + type_promotion_kind=type_promotion, + ) + args = [_maybe_convert_to_dtype(x, result_dtype) for x in args] + + # Broadcast + args = _maybe_broadcast(*args) + + # Perform prim checks + return _prim_elementwise_meta( + *args, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT + ) + + +def toRealValueType(dtype): + from_complex = { + torch.complex32: torch.half, + torch.cfloat: torch.float, + torch.cdouble: torch.double, + } + return from_complex.get(dtype, dtype) + + +def check_inplace_broadcast(self_shape, *args_shape): + broadcasted_shape = tuple(_broadcast_shapes(self_shape, *args_shape)) + torch._check( + broadcasted_shape == self_shape, + lambda: f"output with shape {self_shape} doesn't match the broadcast shape {broadcasted_shape}", + ) + + +@register_meta([aten.linspace, aten.logspace]) +@out_wrapper() +def meta_linspace_logspace( + start, + end, + steps, + base=None, + dtype=None, + device=None, + layout=torch.strided, + pin_memory=False, + requires_grad=False, +): + if isinstance(start, torch.Tensor): + torch._check( + start.dim() == 0, + lambda: "linspace only supports 0-dimensional start and end tensors", + ) + if isinstance(end, torch.Tensor): + torch._check( + end.dim() == 0, + lambda: "linspace only supports 0-dimensional start and end tensors", + ) + + if any(isinstance(arg, complex) for arg in (start, end, steps)): + default_complex_dtype = utils.corresponding_complex_dtype( + torch.get_default_dtype() + ) + if dtype is None: + dtype = default_complex_dtype + else: + torch._check( + utils.is_complex_dtype(dtype), + lambda: f"linspace(): inferred dtype {default_complex_dtype} can't be safely cast to passed dtype {dtype}", + ) + else: + dtype = dtype or torch.get_default_dtype() + assert isinstance(dtype, torch.dtype) + + # steps does not participate in the computation of the dtype + torch._check_type( + isinstance(steps, IntLike), + lambda: f"received an invalid combination of arguments - got \ +({type(start).__name__}, {type(end).__name__}, {type(steps).__name__})", + ) + assert isinstance(steps, IntLike) # for mypy + torch._check(steps >= 0, lambda: "number of steps must be non-negative") + + return torch.empty( + (steps,), # type: ignore[arg-type] + dtype=dtype, + layout=layout, + device="meta", + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_meta([aten.take.default, aten.take.out]) +@out_wrapper() +def meta_take(self, index): + # Type and device checks + torch._check( + index.dtype == torch.long, + lambda: f"take(): Expected a long tensor for index, but got {index.dtype}", + ) + # Index checks + torch._check_index( + not (self.numel() == 0 and index.numel() != 0), + lambda: "take(): tried to take from an empty tensor", + ) + return self.new_empty(index.shape) + + +@register_meta([aten.linalg_cross.default, aten.linalg_cross.out]) +@out_wrapper() +def linalg_cross(self, other, *, dim=-1): + x_d = self.ndim + y_d = other.ndim + torch._check( + x_d == y_d, + lambda: "linalg.cross: inputs must have the same number of dimensions.", + ) + torch._check( + self.size(dim) == 3 and other.size(dim) == 3, + lambda: ( + f"linalg.cross: inputs dimension {dim} must have length 3. " + f"Got {self.size(dim)} and {other.size(dim)}" + ), + ) + out_shape = _broadcast_shapes(self.shape, other.shape) + return self.new_empty(out_shape) + + +@register_meta(aten.linalg_matrix_exp) +@out_wrapper() +def linalg_matrix_exp(self): + squareCheckInputs(self, "linalg.matrix_exp") + checkFloatingOrComplex(self, "linalg.matrix_exp") + return torch.empty_like(self, memory_format=torch.contiguous_format) + + +@register_meta( + [aten.cummax.default, aten.cummax.out, aten.cummin.default, aten.cummin.out] +) +@out_wrapper("values", "indices") +def cummaxmin(self, dim): + values = torch.empty(self.shape, device=self.device, dtype=self.dtype) + indices = torch.empty(self.shape, device=self.device, dtype=torch.int64) + if self.numel() != 0 and self.ndim != 0: + # Checks that dim is within bounds + maybe_wrap_dim(dim, self.ndim) + return values, indices + + +@register_meta([aten.logcumsumexp.default, aten.logcumsumexp.out]) +@out_wrapper() +def logcumsumexp(self, dim): + # Checks that dim is within bounds + maybe_wrap_dim(dim, self.ndim) + return torch.empty_like(self).contiguous() + + +# Stride-related code from _exec_fft in aten/src/ATen/native/cuda/SpectralOps.cpp +def _exec_fft(out, self, out_sizes, dim, forward): + ndim = self.ndim + signal_ndim = len(dim) + batch_dims = ndim - signal_ndim + + # Permute dimensions so batch dimensions come first, and in stride order + dim_permute = list(range(ndim)) + + is_transformed_dim = [False for _ in range(ndim)] + for d in dim: + is_transformed_dim[d] = True + + # std::partition + left, right = [], [] + for d in dim_permute: + if not is_transformed_dim[d]: + left.append(d) + else: + right.append(d) + dim_permute = left + right + batch_end = len(left) + + self_strides = self.stride() + tmp = dim_permute[:batch_end] + tmp.sort(key=lambda x: self_strides[x], reverse=True) + dim_permute = tmp + dim_permute[batch_end:] + input = self.permute(dim_permute) + + # Collapse batch dimensions into a single dimension + batched_sizes = [-1] + list(input.shape[batch_dims:]) + input = input.reshape(batched_sizes) + + batch_size = input.size(0) + batched_sizes[0] = batch_size + batched_out_sizes = batched_sizes + for i in range(len(dim)): + batched_out_sizes[i + 1] = out_sizes[dim[i]] + out = out.reshape(batched_out_sizes) + + # Reshaping to original batch shape and inverting the dimension permutation + out_strides = [0 for _ in range(ndim)] + batch_numel = 1 + i = batch_dims - 1 + while i >= 0: + out_strides[dim_permute[i]] = batch_numel * out.stride(0) + batch_numel *= out_sizes[dim_permute[i]] + i -= 1 + for i in range(batch_dims, ndim): + out_strides[dim_permute[i]] = out.stride(1 + (i - batch_dims)) + return out.as_strided(out_sizes, out_strides, out.storage_offset()) + + +# See _fft_c2c_cufft in aten/src/ATen/native/cuda/SpectralOps.cpp +# and _fft_c2c_mkl in aten/src/ATen/native/mkl/SpectralOps.cpp +@register_meta([aten._fft_c2c.default, aten._fft_c2c.out]) +@out_wrapper() +def meta_fft_c2c(self, dim, normalization, forward): + assert self.dtype.is_complex + + out_sizes = self.shape + output = self.new_empty(out_sizes) + + if not dim: + return output + + sorted_dims = dim[:] + self_strides = self.stride() + sorted_dims.sort(key=lambda x: self_strides[x], reverse=True) + output = _exec_fft(output, self, out_sizes, sorted_dims, forward) + + return output + + +@register_meta([aten._fft_r2c.default, aten._fft_r2c.out]) +@out_wrapper() +def meta_fft_r2c(self, dim, normalization, onesided): + assert self.dtype.is_floating_point + output_sizes = list(self.size()) + + if onesided: + last_dim = dim[-1] + last_dim_halfsize = (output_sizes[last_dim] // 2) + 1 + output_sizes[last_dim] = last_dim_halfsize + + return self.new_empty( + output_sizes, dtype=utils.corresponding_complex_dtype(self.dtype) + ) + + +@register_meta(aten.randperm.generator_out) +def meta_randperm(n, *, generator=None, out): + return _maybe_resize_out(out, torch.Size([n])) + + +@register_meta(aten.randperm.default) +def meta_randperm_default( + n, *, dtype=torch.long, layout=None, device=None, pin_memory=None +): + return torch.empty( + n, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.randint.default) +def meta_randint( + high, size, *, dtype=torch.long, layout=None, device=None, pin_memory=None +): + return torch.empty( + size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.randint.low) +def meta_randint_low( + low, + high, + size, + *, + dtype=torch.long, + layout=None, + device=None, + pin_memory=None, +): + return torch.empty( + size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.rand.default) +def meta_rand_default(size, *, dtype=None, layout=None, device=None, pin_memory=None): + return torch.empty( + size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta([aten._fft_c2r.default, aten._fft_c2r.out]) +@out_wrapper() +def meta_fft_c2r(self, dim, normalization, lastdim): + assert self.dtype.is_complex + output_sizes = list(self.size()) + output_sizes[dim[-1]] = lastdim + return self.new_empty(output_sizes, dtype=toRealValueType(self.dtype)) + + +@register_meta(aten.copy_.default) +def meta_copy_(self, src, non_blocking=False): + # This code simulates the original decomp from inductor, + # which runs most of the meta checks that we care about. + # In theory, we should make this more robust by carefully + # auditing our C++ copy_() kernel and copying the checks here. + + if torch._debug_has_internal_overlap(self) == 1: # 1 == MemOverlap::Yes + raise RuntimeError( + "more than one element of the written-to tensor refers to a single memory location" + ) + + if isinstance(src, Tensor): + intermediate = src.to(self, non_blocking) + if self.size() != intermediate.size(): + aten.expand_copy.default(intermediate, self.size()) + return self + + +def inferUnsqueezeGeometry(tensor, dim): + result_sizes = list(tensor.size()) + result_strides = list(tensor.stride()) + new_stride = 1 if dim >= tensor.dim() else result_sizes[dim] * result_strides[dim] + result_sizes.insert(dim, 1) + result_strides.insert(dim, new_stride) + return result_sizes, result_strides + + +@register_meta(aten.unsqueeze_.default) +def meta_unsqueeze_(self, dim): + dim = maybe_wrap_dim(dim, self.dim() + 1) + g_sizes, g_strides = inferUnsqueezeGeometry(self, dim) + self.as_strided_(g_sizes, g_strides) + return self + + +@register_meta(aten._sparse_semi_structured_linear) +def meta_sparse_structured_linear( + input: Tensor, + weight: Tensor, + _meta: Tensor, + bias: Optional[Tensor] = None, + _activation_opt: Optional[str] = None, + out_dtype: Optional[torch.dtype] = None, +): + output_sizes = list(input.shape) + if bias is not None: + assert weight.size(0) == bias.size(0), "output size mismatch" + assert weight.size(1) == input.size(-1) / 2 + output_sizes[-1] = weight.size(0) + + # see: https://github.com/pytorch/pytorch/pull/114477#issuecomment-1830121375 + # We assume that we have already squashed the inputs into a 2-D tensor + # Then, as the output is transposed, we need to propagate the transposed + # stride information to the output tensor + assert len(input.shape) == 2, "we can only handle the squashed input case" + transposed_strides = (1, input.size(0)) + + if out_dtype is not None: + assert ( + input.dtype == torch.int8 and out_dtype == torch.int32 + ), "out_dtype is only supported for i8i8->i32 linear operator" + output = input.new_empty( + output_sizes, + dtype=input.dtype if out_dtype is None else out_dtype, + ).as_strided(output_sizes, transposed_strides) + + return output + + +@register_meta(aten._cslt_sparse_mm) +def meta__cslt_sparse_mm( + compressed_A: torch.Tensor, + dense_B: torch.Tensor, + bias: Optional[Tensor] = None, + alpha: Optional[Tensor] = None, + out_dtype: Optional[torch.dtype] = None, + transpose_result: bool = False, +): + assert dense_B.dtype in { + torch.float32, + torch.float16, + torch.bfloat16, + torch.int8, + }, "_cslt_sparse_mm only supports fp16, bf16, and int8" + assert compressed_A.dtype == dense_B.dtype, "inputs must have the same dtype" + assert len(dense_B.shape) == 2, "_cslt_sparse_mm only supports 2d inputs" + + is_int8_input_type = compressed_A.dtype == torch.int8 + compression_factor = 10 if is_int8_input_type else 9 + k = dense_B.size(0) + n = dense_B.size(1) + m = (compressed_A.numel() * 16) // (compression_factor * k) + if bias is not None: + assert m == bias.size(0) + + if out_dtype is not None: + assert is_int8_input_type and out_dtype in { + torch.float16, + torch.bfloat16, + torch.int32, + }, "out_dtype is only supported for i8i8->fp16, bf16, or i32 matmul" + output_shape = (n, m) if transpose_result else (m, n) + result = dense_B.new_empty(output_shape, dtype=out_dtype) + return result + + +@register_meta(aten.index_reduce.default) +def meta_index_reduce( + self: Tensor, + dim: int, + index: Tensor, + source: torch.Tensor, + reduce: str, + *, + include_self: bool = True, +) -> Tensor: + return torch.empty_like(self, memory_format=torch.contiguous_format) + + +@register_meta(aten.index_reduce_.default) +def meta_index_reduce_( + self: Tensor, + dim: int, + index: Tensor, + source: torch.Tensor, + reduce: str, + *, + include_self: bool = True, +) -> Tensor: + return self + + +# Implementations below are taken from https://github.com/albanD/subclass_zoo/blob/main/python_meta_tensor.py +@out_wrapper() +@register_meta(aten.index_select.default) +def meta_index_select(self, dim, index): + result_size = list(self.size()) + if self.dim() > 0: + result_size[dim] = index.numel() + return self.new_empty(result_size) + + +@register_meta(aten.segment_reduce.default) +def meta_segment_reduce( + data: Tensor, + reduce: str, + *, + lengths: Optional[Tensor] = None, + indices: Optional[Tensor] = None, + offsets: Optional[Tensor] = None, + axis: int = 0, + unsafe: bool = False, + initial=None, +) -> Tensor: + if indices is not None: + raise NotImplementedError( + "segment_reduce(): indices based reduction is not supported yet." + ) + + def segment_reduce_lengths_tensor(lengths_shape): + return torch.empty( + lengths_shape + data.shape[axis + 1 :], + dtype=data.dtype, + device="meta", + memory_format=torch.contiguous_format, + ) + + if lengths is not None: + return segment_reduce_lengths_tensor(lengths.shape) + # FIXME should probably check that lengths and offset aren't both set, but + # the ATen implementation neglects this too + if offsets is not None: + # lengths == torch.diff(offsets) + lengths_shape = offsets.shape[:-1] + (offsets.shape[-1] - 1,) + return segment_reduce_lengths_tensor(lengths_shape) + raise RuntimeError("segment_reduce(): Either lengths or offsets must be defined.") + + +@register_meta([aten.max.default, aten.max.unary_out]) +@out_wrapper() +def meta_max(self): + return self.new_empty(()) + + +@register_meta(aten.max.dim) +def meta_max_dim(self, dim, keepdim=False): + dim = utils.reduction_dims(self.shape, (dim,)) + output_shape = _compute_reduction_shape(self, dim, keepdim) + return ( + self.new_empty(output_shape), + self.new_empty(output_shape, dtype=torch.long), + ) + + +@register_meta([aten.min.default, aten.min.unary_out]) +@out_wrapper() +def meta_min(self): + return self.new_empty(()) + + +@register_meta(aten.min.dim) +def meta_min_dim(self, dim, keepdim=False): + dim = utils.reduction_dims(self.shape, (dim,)) + output_shape = _compute_reduction_shape(self, dim, keepdim) + return ( + self.new_empty(output_shape), + self.new_empty(output_shape, dtype=torch.long), + ) + + +@register_meta(aten.angle.default) +def meta_angle(self): + if self.is_complex(): + result_dtype = corresponding_real_dtype(self.dtype) + else: + _, result_dtype = elementwise_dtypes( + self, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + return torch.empty_like(self, dtype=result_dtype) + + +@register_meta(aten.angle.out) +def meta_angle_out(self, out): + torch._resize_output_(out, self.size(), self.device) + return out.copy_(torch.angle(self)) + + +@register_meta(aten._assert_async.default) +def assert_async(val): + return + + +@register_meta(aten._assert_async.msg) +def assert_async_meta(val, assert_msg): + return + + +@register_meta(aten._print.default) +def print_meta(s): + return + + +@register_meta(aten._make_dep_token.default) +def make_dep_token( + *, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, +): + return torch.empty([], device="meta") + + +@register_meta(aten.sym_constrain_range.default) +def sym_constrain_range(size, min=None, max=None): + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import constrain_range + + if isinstance(size, (SymFloat, SymBool)): + raise ValueError("Constraining SymFloat or Symbool is nyi") + constrain_range(size, min=min, max=max) + + +@register_meta(aten._functional_sym_constrain_range.default) +def functional_sym_constrain_range(size, min=None, max=None, dep_token=None): + aten.sym_constrain_range(size, min=min, max=max) + return dep_token + + +@register_meta(aten.sym_constrain_range_for_size.default) +def sym_constrain_range_for_size(size, min=None, max=None): + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import _constrain_range_for_size + + if isinstance(size, (SymFloat, SymBool)): + raise ValueError("Constraining SymFloat or Symbool is nyi") + _constrain_range_for_size(size, min=min, max=max) + + +@register_meta(aten._functional_sym_constrain_range_for_size.default) +def functional_sym_constrain_range_for_size(size, min, max, dep_token): + aten.sym_constrain_range_for_size(size, min=min, max=max) + return dep_token + + +@register_meta(aten._functional_assert_async.msg) +def functional_assert_async_meta(val, assert_msg, dep_token): + return dep_token + + +# From aten/src/ATen/native/LinearAlgebraUtils.h +def squareCheckInputs(self: Tensor, f_name: str): + assert ( + self.dim() >= 2 + ), f"{f_name}: The input tensor must have at least 2 dimensions." + assert self.size(-1) == self.size( + -2 + ), f"{f_name}: A must be batches of square matrices, but they are {self.size(-2)} by {self.size(-1)} matrices" + + +# Validates input shapes and devices +# for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve) +# From aten/src/ATen/native/LinearAlgebraUtils.h +def linearSolveCheckInputs( + self: Tensor, + A: Tensor, + name: str, +): + torch._check( + self.device == A.device, + lambda: ( + f"Expected b and A to be on the same device, but found b on " + f"{self.device} and A on {A.device} instead." + ), + ) + + torch._check( + self.dtype == A.dtype, + lambda: ( + f"Expected b and A to have the same dtype, but found b of type " + f"{self.dtype} and A of type {A.dtype} instead." + ), + ) + + torch._check( + A.size(-1) == A.size(-2), + lambda: ( + f"A must be batches of square matrices, " + f"but they are {A.size(-2)} by {A.size(-1)} matrices" + ), + ) + + torch._check( + A.size(-1) == self.size(-2), + lambda: ( + f"Incompatible matrix sizes for {name}: each A " + f"matrix is {A.size(-1)} by {A.size(-1)}" + f" but each b matrix is {self.size(-2)} by {self.size(-1)}" + ), + ) + + +# From aten/src/ATen/native/LinearAlgebraUtils.h +def checkFloatingOrComplex( + t: Tensor, f_name: str, allow_low_precision_dtypes: bool = True +): + dtype = t.dtype + torch._check( + t.is_floating_point() or t.is_complex(), + lambda: f"{f_name}: Expected a floating point or complex tensor as input. Got {dtype}", + ) + if not allow_low_precision_dtypes: + torch._check( + dtype in (torch.float, torch.double, torch.cfloat, torch.cdouble), + lambda: f"{f_name}: Low precision dtypes not supported. Got {dtype}", + ) + + +# From aten/src/ATen/native/LinearAlgebraUtils.h +def checkIsMatrix(A: Tensor, f_name: str, arg_name: str = "A"): + torch._check( + A.dim() >= 2, + lambda: f"{f_name}: The input tensor {arg_name} must have at least 2 dimensions.", + ) + + +def checkInputsSolver( + A: Tensor, + B: Tensor, + left: bool, + f_name: str, +): + squareCheckInputs(A, f_name) + checkIsMatrix(B, f_name) + torch._check( + A.size(-2) == B.size(-2) if left else A.size(-1) == B.size(-1), + lambda: ( + f"{f_name}: Incompatible shapes of A and B for the equation " + f"{'AX = B' if left else 'XA = B'}" + f" ({A.size(-2)}x{A.size(-1)} and {B.size(-2)}x{B.size(-1)})" + ), + ) + + +def checkSameDevice( + fn_name: str, result: Tensor, input: Tensor, result_name: str = "result" +): + torch._check( + result.device == input.device, + lambda: ( + f"{fn_name}: Expected {result_name} and input tensors to be on the same device, but got " + f"{result_name} on {result.device} and input on {input.device}" + ), + ) + + +def checkUplo(UPLO: str): + UPLO_uppercase = UPLO.upper() + torch._check( + len(UPLO) == 1 and (UPLO_uppercase == "U" or UPLO_uppercase == "L"), + lambda: f"Expected UPLO argument to be 'L' or 'U', but got {UPLO}", + ) + + +@register_meta([aten._linalg_eigh.default, aten._linalg_eigh.eigenvalues]) +@out_wrapper("eigenvalues", "eigenvectors") +def meta__linalg_eigh( + A: Tensor, + UPLO: str = "L", + compute_v: bool = True, +): + squareCheckInputs(A, "linalg.eigh") + checkUplo(UPLO) + + shape = list(A.shape) + if compute_v: + vecs = A.new_empty(shape) + vecs.as_strided_(shape, make_contiguous_strides_for(shape, row_major=False)) + else: + vecs = A.new_empty([0]) + + shape.pop() + vals = A.new_empty(shape, dtype=toRealValueType(A.dtype)) + + return vals, vecs + + +@register_meta([aten._linalg_eigvals.default, aten.linalg_eigvals.out]) +@out_wrapper() +def meta__linalg_eigvals(input: Tensor) -> Tensor: + squareCheckInputs(input, "linalg.eigvals") + complex_dtype = ( + input.dtype + if utils.is_complex_dtype(input.dtype) + else utils.corresponding_complex_dtype(input.dtype) + ) + return input.new_empty(input.shape[:-1], dtype=complex_dtype) + + +@register_meta([aten.linalg_eig]) +@out_wrapper("eigenvalues", "eigenvectors") +def meta_linalg_eig(input: Tensor): + squareCheckInputs(input, "linalg.eig") + complex_dtype = ( + input.dtype + if utils.is_complex_dtype(input.dtype) + else utils.corresponding_complex_dtype(input.dtype) + ) + values = input.new_empty(input.shape[:-1], dtype=complex_dtype) + vectors = input.new_empty(input.shape, dtype=complex_dtype) + return values, vectors + + +def cloneBatchedColumnMajor(src: Tensor) -> Tensor: + return src.mT.clone(memory_format=torch.contiguous_format).transpose(-2, -1) + + +@register_meta(aten._cholesky_solve_helper) +@out_wrapper() +def _cholesky_solve_helper(self: Tensor, A: Tensor, upper: bool) -> Tensor: + return cloneBatchedColumnMajor(self) + + +@register_meta(aten.cholesky_solve) +@out_wrapper() +def cholesky_solve(self: Tensor, A: Tensor, upper: bool = False) -> Tensor: + torch._check( + self.ndim >= 2, + lambda: f"b should have at least 2 dimensions, but has {self.ndim} dimensions instead", + ) + torch._check( + A.ndim >= 2, + lambda: f"u should have at least 2 dimensions, but has {A.ndim} dimensions instead", + ) + self_broadcasted, A_broadcasted = _linalg_broadcast_batch_dims_name( + self, A, "cholesky_solve" + ) + return _cholesky_solve_helper(self_broadcasted, A_broadcasted, upper) + + +@register_meta(aten.cholesky) +@out_wrapper() +def cholesky(self: Tensor, upper: bool = False) -> Tensor: + if self.numel() == 0: + return torch.empty_like(self, memory_format=torch.legacy_contiguous_format) + squareCheckInputs(self, "cholesky") + return cloneBatchedColumnMajor(self) + + +@register_meta(aten.cholesky_inverse) +@out_wrapper() +def cholesky_inverse(self: Tensor, upper: bool = False) -> Tensor: + squareCheckInputs(self, "cholesky_inverse") + return cloneBatchedColumnMajor(self) + + +# From aten/src/ATen/native/BatchLinearAlgebra.cpp +@register_meta(aten.linalg_cholesky_ex.default) +def linalg_cholesky_ex(A: Tensor, upper: bool = False, check_errors: bool = False): + squareCheckInputs(A, "linalg.cholesky") + checkFloatingOrComplex(A, "linalg.cholesky") + + A_shape = A.shape + ndim = len(A_shape) + + # L + L_strides = make_contiguous_strides_for(A_shape, False) + L = A.new_empty(A_shape) + L.as_strided_(A_shape, L_strides) + + # infos + infos = A.new_empty(A_shape[0 : ndim - 2], dtype=torch.int32) + return L, infos + + +@register_meta( + [aten.linalg_householder_product.default, aten.linalg_householder_product.out] +) +@out_wrapper() +def linalg_householder_product(input: Tensor, tau: Tensor) -> Tensor: + torch._check( + input.ndim >= 2, + lambda: "torch.linalg.householder_product: input must have at least 2 dimensions.", + ) + torch._check( + input.size(-2) >= input.size(-1), + lambda: "torch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]", + ) + torch._check( + input.size(-1) >= tau.size(-1), + lambda: "torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]", + ) + + torch._check( + input.ndim - tau.ndim == 1, + lambda: ( + f"torch.linalg.householder_product: Expected tau to have one dimension less than input, " + f"but got tau.ndim equal to {tau.ndim} and input.ndim is equal to {input.ndim}" + ), + ) + if input.ndim > 2: + expected_batch_tau_shape = input.shape[:-2] + actual_batch_tau_shape = tau.shape[:-1] + torch._check( + actual_batch_tau_shape == expected_batch_tau_shape, + lambda: ( + f"torch.linalg.householder_product: Expected batch dimensions of tau to be " + f"equal to input.shape[:-2], but got {actual_batch_tau_shape}" + ), + ) + + torch._check( + tau.dtype == input.dtype, + lambda: ( + f"torch.linalg.householder_product: tau dtype {tau.dtype}" + f" does not match input dtype {input.dtype}" + ), + ) + checkSameDevice("torch.linalg.householder_product", tau, input, "tau") + + return torch.empty_strided( + size=input.shape, + stride=make_contiguous_strides_for(input.shape, row_major=False), + dtype=input.dtype, + device=input.device, + ) + + +# From aten/src/ATen/native/BatchLinearAlgebra.cpp +@register_meta(aten.linalg_inv_ex.default) +def linalg_inv_ex_meta(A: Tensor, check_errors: bool = False): + squareCheckInputs(A, "linalg.inv_ex") + checkFloatingOrComplex(A, "linalg.inv_ex", allow_low_precision_dtypes=False) + + L = A.new_empty(A.shape) + L.as_strided_(A.shape, make_contiguous_strides_for(A.shape, row_major=False)) + + infos = A.new_empty(A.shape[:-2], dtype=torch.int32) + return L, infos + + +@register_meta([aten.linalg_ldl_factor_ex.default, aten.linalg_ldl_factor_ex.out]) +@out_wrapper("LD", "pivots", "info") +def linalg_ldl_factor_ex_meta( + self: Tensor, + *, + hermitian: bool = False, + check_errors: bool = False, +) -> Tuple[Tensor, Tensor, Tensor]: + squareCheckInputs(self, "torch.linalg.ldl_factor_ex") + checkFloatingOrComplex(self, "torch.linalg.ldl_factor_ex") + LD = torch.empty_strided( + size=self.shape, + stride=make_contiguous_strides_for(self.shape, row_major=False), + dtype=self.dtype, + device=self.device, + ) + pivots = self.new_empty(self.shape[:-1], dtype=torch.int) + info = self.new_empty(self.shape[:-2], dtype=torch.int) + return LD, pivots, info + + +@register_meta([aten.linalg_ldl_solve.default, aten.linalg_ldl_solve.out]) +@out_wrapper() +def linalg_ldl_solve_meta( + LD: Tensor, pivots: Tensor, B: Tensor, *, hermitian: bool = False +) -> Tensor: + squareCheckInputs(LD, "torch.linalg.ldl_solve") + checkFloatingOrComplex(LD, "torch.linalg.ldl_solve") + linearSolveCheckInputs(B, LD, "torch.linalg.ldl_solve") + torch._check( + B.ndim >= 2, + lambda: ( + f"torch.linalg.ldl_solve: Expected B to have at least 2 dimensions, " + f"but it has {B.ndim} dimensions instead" + ), + ) + expected_pivots_shape = LD.shape[:-1] + torch._check( + expected_pivots_shape == pivots.shape, + lambda: ( + f"torch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, " + f"but got pivots with shape {pivots.shape} instead" + ), + ) + torch._check( + utils.is_integer_dtype(pivots.dtype), + lambda: f"torch.linalg.ldl_solve: Expected pivots to be integers. Got {pivots.dtype}", + ) + torch._check( + LD.dtype == B.dtype, + lambda: f"torch.linalg.ldl_solve: LD dtype {LD.dtype} does not match b dtype {B.dtype}", + ) + B_broadcast_size, _ = _linalg_broadcast_batch_dims(B, LD) + return torch.empty_strided( + size=B_broadcast_size, + stride=make_contiguous_strides_for(B_broadcast_size, row_major=False), + dtype=B.dtype, + device=B.device, + ) + + +@register_meta([aten.linalg_lu.default, aten.linalg_lu.out]) +@out_wrapper("P", "L", "U") +def linalg_lu_meta(A: Tensor, *, pivot: bool = True) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + A.ndim >= 2, + lambda: f"linalg.lu: Expected tensor with 2 or more dimensions. Got size: {A.shape} instead", + ) + + sizes = list(A.shape) + m = sizes[-2] + n = sizes[-1] + k = min(m, n) + + sizes[-1] = m + if pivot: + P = A.new_empty(sizes) + else: + P = A.new_empty([0]) + + sizes[-1] = k + L = A.new_empty(sizes) + + sizes[-2] = k + sizes[-1] = n + U = A.new_empty(sizes) + return P, L, U + + +@register_meta([aten.linalg_lu_factor_ex.default, aten.linalg_lu_factor_ex.out]) +@out_wrapper("LU", "pivots", "info") +def linalg_lu_factor_ex_meta( + A: Tensor, *, pivot: bool = True, check_errors: bool = False +) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + A.ndim >= 2, + lambda: f"torch.lu_factor: Expected tensor with 2 or more dimensions. Got size: {A.shape} instead", + ) + + sizes = list(A.shape) + m = sizes[-2] + n = sizes[-1] + + LU = torch.empty_strided( + size=sizes, + stride=make_contiguous_strides_for(sizes, row_major=False), + dtype=A.dtype, + device=A.device, + ) + + # Sets sizes to the size of pivots + sizes.pop() + sizes[-1] = min(m, n) + pivots = A.new_empty(sizes, dtype=torch.int) + + # Sets sizes to the size of info + sizes.pop() + info = A.new_empty(sizes, dtype=torch.int) + + return LU, pivots, info + + +@register_meta([aten.linalg_lu_solve.default, aten.linalg_lu_solve.out]) +@out_wrapper() +def linalg_lu_solve_meta( + LU: Tensor, + pivots: Tensor, + B: Tensor, + *, + left: bool = True, + adjoint: bool = False, +) -> Tensor: + # dtype + checkFloatingOrComplex(LU, "torch.linalg.lu_solve") + torch._check( + LU.dtype == B.dtype, + lambda: ( + f"linalg.lu_solve: Expected LU and B to have the same dtype, " + f"but found LU of type {LU.dtype} and B of type {B.dtype} instead" + ), + ) + torch._check( + pivots.dtype == torch.int, + lambda: "linalg.lu_solve: pivots should be a Tensor of scalar type torch.int32", + ) + + # matrix shapes + squareCheckInputs(LU, "torch.linalg.lu_solve") + checkInputsSolver(LU, B, left, "linalg.lu_solve") + torch._check( + LU.size(-1) == pivots.size(-1), + lambda: "linalg.lu_solve: Number of pivots per batch should be same as the dimension of the matrix", + ) + + # batches + torch._check( + LU.shape[:-1] == pivots.shape, + lambda: ( + f"linalg.lu_solve: Expected LU.shape[:-1] and pivots.shape to be the same, " + f"but got pivots with shape {pivots.shape} instead" + ), + ) + + B_broadcast_size, _ = _linalg_broadcast_batch_dims(B, LU) + + result = torch.empty_strided( + size=B_broadcast_size, + stride=make_contiguous_strides_for(B_broadcast_size, row_major=not left), + dtype=B.dtype, + device=B.device, + ) + + if result.numel() != 0 and not left: + if result.is_complex(): + result = result.conj() + + return result + + +@register_meta(aten.lu_unpack) +@out_wrapper("P", "L", "U") +def lu_unpack_meta( + LU: Tensor, + pivots: Tensor, + unpack_data: bool = True, + unpack_pivots: bool = True, +) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + LU.ndim >= 2, + lambda: f"torch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: {LU.shape} instead", + ) + if unpack_pivots: + torch._check( + pivots.dtype == torch.int32, + lambda: ( + "torch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.\n" + "Note: this function is intended to be used with the output produced by torch.linalg.lu_factor" + ), + ) + sizes = list(LU.shape) + m = sizes[-2] + n = sizes[-1] + k = min(m, n) + sizes[-1] = m + if unpack_pivots: + P = LU.new_empty(sizes) + else: + P = LU.new_empty([0]) + if unpack_data: + sizes[-1] = k + L = LU.new_empty(sizes) + sizes[-2] = k + sizes[-1] = n + U = LU.new_empty(sizes) + else: + L = LU.new_empty([0]) + U = LU.new_empty([0]) + return P, L, U + + +# parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced) +def _parse_qr_mode(mode: str) -> Tuple[bool, bool]: + if mode == "reduced": + compute_q = True + reduced = True + elif mode == "complete": + compute_q = True + reduced = False + elif mode == "r": + compute_q = False + reduced = True # this is actually irrelevant in this mode + else: + torch._check( + False, + lambda: ( + f"qr received unrecognized mode '{mode}' " + f"but expected one of 'reduced' (default), 'r', or 'complete'" + ), + ) + return compute_q, reduced # type: ignore[possibly-undefined] + + +@register_meta([aten.linalg_qr.default, aten.linalg_qr.out]) +@out_wrapper("Q", "R") +def linalg_qr_meta( + A: Tensor, + mode: str = "reduced", +) -> Tuple[Tensor, Tensor]: + checkIsMatrix(A, "linalg.qr") + checkFloatingOrComplex(A, "linalg.qr") + + compute_q, reduced_mode = _parse_qr_mode(mode) + + m = A.shape[-2] + n = A.shape[-1] + k = min(m, n) + + if compute_q: + Q_shape = list(A.shape) + Q_shape[-1] = k if reduced_mode else m + Q = A.new_empty(Q_shape) + Q.as_strided_(Q_shape, make_contiguous_strides_for(Q_shape, row_major=False)) + else: + Q = A.new_empty([0]) + + # For readability + R_shape = list(A.shape) + R_shape[-2] = k if reduced_mode or not compute_q else m + R = A.new_empty(R_shape) + R.as_strided_(R_shape, make_contiguous_strides_for(R_shape, row_major=False)) + return Q, R + + +@register_meta([aten._linalg_slogdet.default, aten._linalg_slogdet.sign]) +@out_wrapper("sign", "logabsdet", "LU", "pivots") +def _linalg_slogdet(A: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + squareCheckInputs(A, "linalg.slogdet") + checkFloatingOrComplex(A, "linalg.slogdet", False) + shape = A.shape + sign = A.new_empty(shape[:-2]) + logabsdet = A.new_empty(shape[:-2], dtype=toRealValueType(A.dtype)) + LU = torch.empty_strided( + size=shape, + stride=make_contiguous_strides_for(shape, False), + dtype=A.dtype, + device=A.device, + ) + pivots = A.new_empty(shape[:-1], dtype=torch.int32) + return sign, logabsdet, LU, pivots + + +# From aten/src/ATen/native/BatchLinearAlgebra.cpp +# NOTE: matching defaults in aten/src/ATen/native/native_functions.yaml +@register_meta(aten._linalg_svd.default) +def _linalg_svd_meta( + A: Tensor, + full_matrices: bool = False, + compute_uv: bool = True, + driver: Optional[str] = None, +): + checkIsMatrix(A, "linalg.svd") + checkFloatingOrComplex(A, "linalg.svd") + + batch_dims = list(A.shape[:-2]) + m = A.shape[-2] + n = A.shape[-1] + k = min(m, n) + + if compute_uv: + U_shape = batch_dims + [m, m if full_matrices else k] + U = A.new_empty(U_shape) + U.as_strided_(U_shape, make_contiguous_strides_for(U_shape, row_major=False)) + + V_shape = batch_dims + [n if full_matrices else k, n] + V = A.new_empty(V_shape) + # NB: This checks for CUDA since there is no way to check for cuSolver. + # Also, this might not work correctly on CPU when fake_device is not + # available as device_hint just defaults to CUDA in that case. See + # _linalg_svd meta in core. + is_cuda = device_hint(A) == "cuda" + V.as_strided_(V_shape, make_contiguous_strides_for(V_shape, row_major=is_cuda)) + else: + # doesn't matter + U = A.new_empty([0]) + V = A.new_empty([0]) + + # S is always real, even when A is complex. + S = A.new_empty(batch_dims + [k], dtype=toRealValueType(A.dtype)) + return U, S, V + + +def _linalg_broadcast_batch_dims( + arg1: Tensor, arg2: Tensor +) -> Tuple[List[int], List[int]]: + # broadcast the batch dimensions of arg1 and arg2. + arg1_batch_sizes = arg1.shape[:-2] + arg2_batch_sizes = arg2.shape[:-2] + expand_batch_portion = _broadcast_shapes(arg1_batch_sizes, arg2_batch_sizes) + + arg1_expand_size = list(expand_batch_portion) + arg1_expand_size += [arg1.size(-2), arg1.size(-1)] + + arg2_expand_size = list(expand_batch_portion) + arg2_expand_size += [arg2.size(-2), arg2.size(-1)] + return arg1_expand_size, arg2_expand_size + + +def _linalg_broadcast_batch_dims_name( + arg1: Tensor, arg2: Tensor, name: Optional[str] +) -> Tuple[Tensor, Tensor]: + # If there's no name we assume we don't want to check the errors + if name: + linearSolveCheckInputs(arg1, arg2, name) + + arg1_expand_size, arg2_expand_size = _linalg_broadcast_batch_dims(arg1, arg2) + + arg1_broadcasted = ( + arg1 if arg1_expand_size == arg1.shape else arg1.expand(arg1_expand_size) + ) + arg2_broadcasted = ( + arg2 if arg2_expand_size == arg2.shape else arg2.expand(arg2_expand_size) + ) + return arg1_broadcasted, arg2_broadcasted + + +def linalg_solve_is_vector_rhs(input: Tensor, other: Tensor) -> bool: + expected_batched_rhs_shape = input.shape[:-1] + vector_case = other.ndim == 1 or ( + input.ndim - 1 == other.ndim and other.shape == expected_batched_rhs_shape + ) + return vector_case + + +@register_meta(aten._linalg_solve_ex) +def _linalg_solve_ex( + A: Tensor, + B: Tensor, + *, + left: bool = True, + check_errors: bool = False, + result: Optional[Tensor] = None, + LU: Optional[Tensor] = None, + pivots: Optional[Tensor] = None, + info: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + checkFloatingOrComplex(A, "linalg.solve") + torch._check( + A.dtype == B.dtype, + lambda: ( + f"linalg.solve: Expected A and B to have the same dtype, but found A of type " + f"{A.dtype} and B of type {B.dtype} instead" + ), + ) + vector_case = linalg_solve_is_vector_rhs(A, B) + B_ = B.unsqueeze(-1) if vector_case else B + checkInputsSolver(A, B_, left, "linalg.solve") + B_broad_shape, _ = _linalg_broadcast_batch_dims(B_, A) + torch._check( + left or not vector_case, + lambda: ( + "linalg.solve: Vector broadcasting of the left hand side is not supported for left=False. " + "In this case linalg.solve is equivalent to B / A.squeeze(-1)" + ), + ) + result_shape = B_broad_shape[:-1] if vector_case else B_broad_shape + result_ = torch.empty_strided( + size=result_shape, + stride=make_contiguous_strides_for(result_shape, not left), + dtype=B.dtype, + device=B.device, + ) + shape = A.shape + ndim = A.ndim + LU_ = torch.empty_strided( + size=shape, + stride=make_contiguous_strides_for(shape, False), + dtype=A.dtype, + device=A.device, + ) + pivots_ = A.new_empty(shape[:-1], dtype=torch.int32) + info_ = A.new_empty(shape[:-2], dtype=torch.int32) + out = (result, LU, pivots, info) + res = (result_, LU_, pivots_, info_) + if all(x is not None for x in out): + for r, o in zip(res, out): + # resize and copy operations are done in-place + _maybe_resize_out(o, r.shape) # type: ignore[arg-type] + # strides are not copied in out_wrapper + o.as_strided_(r.shape, r.stride()) # type: ignore[union-attr] + _safe_copy_out(copy_from=r, copy_to=o, exact_dtype=False) # type: ignore[arg-type] + return res + + +@register_meta([aten.linalg_solve_triangular.default, aten.linalg_solve_triangular.out]) +def linalg_solve_triangular_meta( + A: Tensor, + B: Tensor, + *, + upper: bool, + left: bool = True, + unitriangular: bool = False, + out: Optional[Tensor] = None, +) -> Tensor: + if out is None: + out = A.new_empty([0]) + assert isinstance(out, TensorLike) + checkInputsSolver(A, B, left, "linalg.solve_triangular") + B_, A_ = _linalg_broadcast_batch_dims_name(B, A, None) + avoid_copy_A = A_.transpose(-2, -1).is_contiguous() and A_.is_conj() + if avoid_copy_A: + out = _maybe_resize_out(out, B_.shape) + else: + # reimplementation of resize_output with result F-contig + if _resize_output_check(out, B_.shape): + out.resize_(B_.transpose(-2, -1).shape) + out.transpose_(-2, -1) + return out # type: ignore[return-value] + + +@register_meta(aten.triangular_solve) +@out_wrapper("solution", "cloned_coefficient") +def triangular_solve_meta( + self: Tensor, + A: Tensor, + upper: bool = True, + transpose: bool = False, + unitriangular: bool = False, +) -> Tuple[Tensor, Tensor]: + torch._check( + self.ndim >= 2, + lambda: ( + f"torch.triangular_solve: Expected b to have at least 2 dimensions, " + f"but it has {self.ndim} dimensions instead" + ), + ) + torch._check( + A.ndim >= 2, + lambda: ( + f"torch.triangular_solve: Expected A to have at least 2 dimensions, " + f"but it has {A.ndim} dimensions instead" + ), + ) + + linearSolveCheckInputs(self, A, "triangular_solve") + + if A.layout == torch.strided: + self_broadcast_size, A_broadcast_size = _linalg_broadcast_batch_dims(self, A) + solution = torch.empty_strided( + size=self_broadcast_size, + stride=make_contiguous_strides_for(self_broadcast_size, row_major=False), + dtype=self.dtype, + device=self.device, + ) + cloned_coefficient = torch.empty_strided( + size=A_broadcast_size, + stride=make_contiguous_strides_for(A_broadcast_size, row_major=False), + dtype=A.dtype, + device=A.device, + ) + elif A.layout == torch.sparse_csr or A.layout == torch.sparse_bsr: + solution = torch.empty_like(self) + cloned_coefficient = self.new_empty([0]) + else: + torch._check(False, lambda: "triangular_solve: Got an unexpected layout.") + return solution, cloned_coefficient # type: ignore[possibly-undefined] + + +# From aten/src/ATen/native/LinearAlgebra.cpp +@register_meta(aten._linalg_det.default) +def _linalg_det_meta(A): + squareCheckInputs(A, "linalg.det") + checkFloatingOrComplex(A, "linalg.det") + + det = A.new_empty(A.shape[:-2]) + + LU = A.new_empty(A.shape) + LU.as_strided_(A.shape, make_contiguous_strides_for(A.shape, row_major=False)) + + pivots = A.new_empty(A.shape[:-1], dtype=torch.int32) + return det, LU, pivots + + +@register_meta(aten.ormqr) +@out_wrapper() +def ormqr( + input: Tensor, + tau: Tensor, + other: Tensor, + left: bool = True, + transpose: bool = False, +) -> Tensor: + torch._check( + input.ndim >= 2, lambda: "torch.ormqr: input must have at least 2 dimensions." + ) + torch._check( + other.ndim >= 2, lambda: "torch.ormqr: other must have at least 2 dimensions." + ) + + left_size_condition = -2 if left else -1 + torch._check( + other.shape[left_size_condition] >= tau.shape[-1], + lambda: f"torch.ormqr: other.shape[{left_size_condition}] must be greater than or equal to tau.shape[-1]", + ) + torch._check( + other.shape[left_size_condition] == input.shape[-2], + lambda: f"torch.ormqr: other.shape[{left_size_condition}] must be equal to input.shape[-2]", + ) + + torch._check( + tau.shape[-1] <= input.shape[-1], + lambda: "torch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]", + ) + + torch._check( + input.ndim - tau.ndim == 1, + lambda: ( + f"torch.ormqr: Expected tau to have one dimension less than input, " + f"but got tau.ndim equal to {tau.ndim} and input.ndim is equal to {input.ndim}" + ), + ) + torch._check( + input.ndim == other.ndim, + lambda: ( + f"torch.ormqr: Expected other to have the same number of dimensions as input, " + f"but got other.ndim equal to {other.ndim} and input.ndim is equal to {input.ndim}" + ), + ) + + if input.ndim > 2: + expected_batch_shape = input.shape[:-2] + actual_batch_tau_shape = tau.shape[:-1] + torch._check( + actual_batch_tau_shape == expected_batch_shape, + lambda: ( + f"torch.ormqr: Expected batch dimensions of tau to be " + f"equal to input.shape[:-2], but got {actual_batch_tau_shape}" + ), + ) + + actual_batch_other_shape = other.shape[:-2] + torch._check( + actual_batch_other_shape == expected_batch_shape, + lambda: ( + f"torch.ormqr: Expected batch dimensions of other to be " + f"equal to input.shape[:-2], but got {actual_batch_other_shape}" + ), + ) + + torch._check( + tau.dtype == input.dtype, + lambda: ( + f"torch.ormqr: Expected input and tau to have the same dtype, " + f"but input has dtype {input.dtype} and tau has dtype {tau.dtype}" + ), + ) + torch._check( + other.dtype == input.dtype, + lambda: ( + f"torch.ormqr: Expected input and other to have the same dtype, " + f"but input has dtype {input.dtype} and other has dtype {other.dtype}" + ), + ) + + checkSameDevice("torch.ormqr", tau, input, "tau") + checkSameDevice("torch.ormqr", other, input, "other") + + return torch.empty_strided( + size=other.shape, + stride=make_contiguous_strides_for(other.shape, row_major=False), + dtype=other.dtype, + device=other.device, + ) + + +def _padding_check_valid_input(input, padding, *, dim): + torch._check( + len(padding) == 2 * dim, + lambda: f"padding size is expected to be {2 * dim}, but got: {len(padding)}", + ) + + input_dim = input.ndim + + is_batch_mode = input_dim == (dim + 2) + + valid_batch_mode = is_batch_mode + valid_non_batch_mode = not is_batch_mode + + if is_batch_mode: + # allow batch size of 0-dim. + for d in range(1, input_dim): + valid_batch_mode = valid_batch_mode and input.size(d) != 0 + else: + for d in range(0, input_dim): + valid_non_batch_mode = valid_non_batch_mode and input.size(d) != 0 + + # allow empty batch size but not other dimensions. + torch._check( + valid_batch_mode or valid_non_batch_mode, + lambda: ( + f"Expected {dim + 1}D or {dim + 2}D (batch mode) tensor with possibly 0 batch size " + f"and other non-zero dimensions for input, but got: {input.shape}" + ), + ) + + +def _pad1d_common(input, padding, *, is_reflection): + dim_plane = 0 + dim_w = 1 + nbatch = 1 + + if input.ndim == 3: + nbatch = input.size(0) + dim_w += 1 + dim_plane += 1 + + _padding_check_valid_input(input, padding, dim=1) + + pad_l, pad_r = padding + + nplane = input.size(dim_plane) + input_w = input.size(dim_w) + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + + torch._check( + output_w >= 1, + lambda: f"input (W: {input_w}) is too small. Calculated output W: {output_w}", + ) + + if input.ndim == 2: + return input.new_empty((nplane, output_w)) + else: + return input.new_empty((nbatch, nplane, output_w)) + + +@register_meta(aten.reflection_pad1d) +@out_wrapper() +def meta_reflection_pad1d(input, padding): + return _pad1d_common(input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad1d) +@out_wrapper() +def meta_replication_pad1d(input, padding): + return _pad1d_common(input, padding, is_reflection=False) + + +def _pad1d_backward_common(grad_output, input, padding, *, is_reflection): + dim_w = 1 + if not is_reflection: + torch._check(len(padding) == 2, lambda: "padding size is expected to be 2") + + if input.ndim == 3: + dim_w += 1 + + pad_l, pad_r = padding + + input_w = input.size(dim_w) + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + + torch._check( + output_w == grad_output.size(dim_w), + lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}", + ) + + return input.new_empty(input.shape) + + +@register_meta(aten.reflection_pad1d_backward) +@out_wrapper("grad_input") +def meta_reflection_pad1d_backward(grad_output, input, padding): + return _pad1d_backward_common(grad_output, input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad1d_backward) +@out_wrapper("grad_input") +def meta_replication_pad1d_backward(grad_output, input, padding): + return _pad1d_backward_common(grad_output, input, padding, is_reflection=False) + + +def _pad2d_common(input, padding, *, is_reflection): + dim_w = 2 + dim_h = 1 + dim_slices = 0 + nbatch = 1 + + _padding_check_valid_input(input, padding, dim=2) + + ndim = input.ndim + if ndim == 4: + nbatch = input.size(0) + dim_w += 1 + dim_h += 1 + dim_slices += 1 + + pad_l, pad_r, pad_t, pad_b = padding + + nplane = input.size(dim_slices) + input_h = input.size(dim_h) + input_w = input.size(dim_w) + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + torch._check( + pad_t < input_h and pad_b < input_h, + lambda: ( + f"Argument #6: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_t}, {pad_b}) at dimension {dim_h} of input {input.shape}" + ), + ) + + torch._check( + output_w >= 1 or output_h >= 1, + lambda: ( + f"input (H: {input_h} W: {input_w}) is too small. " + f"Calculated output H: {output_h} W: {output_w}" + ), + ) + + if input.ndim == 3: + return input.new_empty((nplane, output_h, output_w)) + else: + return input.new_empty((nbatch, nplane, output_h, output_w)) + + +@register_meta(aten.reflection_pad2d) +@out_wrapper() +def meta_reflection_pad2d(input, padding): + return _pad2d_common(input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad2d) +@out_wrapper() +def meta_replication_pad2d(input, padding): + return _pad2d_common(input, padding, is_reflection=False) + + +@register_meta( + [ + aten.reflection_pad2d_backward.default, + aten.reflection_pad2d_backward.grad_input, + aten.replication_pad2d_backward.default, + aten.replication_pad2d_backward.grad_input, + ] +) +@out_wrapper("grad_input") +def meta_pad2d_backward(grad_output, self, padding): + dim_w = 2 + dim_h = 1 + dim_plane = 0 + nbatch = 1 + + self_shape = self.shape + if self.dim() == 4: + nbatch = self_shape[0] + dim_w += 1 + dim_h += 1 + dim_plane += 1 + + pad_l, pad_r, pad_t, pad_b = padding + + nplane = self_shape[dim_plane] + input_h = self_shape[dim_h] + input_w = self_shape[dim_w] + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + torch._check( + output_w == grad_output.size(dim_w), + lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}", + ) + torch._check( + output_h == grad_output.size(dim_h), + lambda: f"grad_output height unexpected. Expected: {output_h}, Got: {grad_output.size(dim_h)}", + ) + return self.new_empty(self.shape) + + +def _pad3d_common(input, padding, *, is_reflection): + dim_w = 3 + dim_h = 2 + dim_d = 1 + dim_plane = 0 + + _padding_check_valid_input(input, padding, dim=3) + + batch_mode = input.ndim == 5 + if batch_mode: + nbatch = input.size(0) + dim_w += 1 + dim_h += 1 + dim_d += 1 + dim_plane += 1 + + pad_l, pad_r, pad_t, pad_b, pad_f, pad_bk = padding + + nplane = input.size(dim_plane) + input_d = input.size(dim_d) + input_h = input.size(dim_h) + input_w = input.size(dim_w) + output_d = input_d + pad_f + pad_bk + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + torch._check( + pad_t < input_h and pad_b < input_h, + lambda: ( + f"Argument #6: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_t}, {pad_b}) at dimension {dim_h} of input {input.shape}" + ), + ) + torch._check( + pad_f < input_d and pad_bk < input_d, + lambda: ( + f"Argument #8: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_f}, {pad_bk}) at dimension {dim_d} of input {input.shape}" + ), + ) + + torch._check( + output_w >= 1 or output_h >= 1 or output_d >= 1, + lambda: ( + f"input (D: {input_d} H: {input_h} W: {input_w}) is too small. " + f"Calculated output D: {output_d} H: {output_h} W: {output_w}" + ), + ) + + if batch_mode: + return input.new_empty((nbatch, nplane, output_d, output_h, output_w)) # type: ignore[possibly-undefined] + else: + return input.new_empty((nplane, output_d, output_h, output_w)) + + +@register_meta(aten.reflection_pad3d) +@out_wrapper() +def meta_reflection_pad3d(input, padding): + return _pad3d_common(input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad3d) +@out_wrapper() +def meta_replication_pad3d(input, padding): + return _pad3d_common(input, padding, is_reflection=False) + + +@register_meta( + [ + aten.reflection_pad3d_backward.default, + aten.reflection_pad3d_backward.grad_input, + aten.replication_pad3d_backward.default, + aten.replication_pad3d_backward.grad_input, + ] +) +@out_wrapper("grad_input") +def meta_pad3d_backward(grad_output, input, padding): + torch._check(len(padding) == 6, lambda: "padding size is expected to be 6") + assert input.ndim > 3 + assert grad_output.ndim == input.ndim + + dim_w = 3 + dim_h = 2 + dim_d = 1 + + if input.ndim == 5: + dim_w += 1 + dim_h += 1 + dim_d += 1 + + pad_l, pad_r, pad_t, pad_b, pad_f, pad_bk = padding + + input_d = input.size(dim_d) + input_h = input.size(dim_h) + input_w = input.size(dim_w) + output_d = input_d + pad_f + pad_bk + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + torch._check( + output_w == grad_output.size(dim_w), + lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}", + ) + torch._check( + output_h == grad_output.size(dim_h), + lambda: f"grad_output height unexpected. Expected: {output_h}, Got: {grad_output.size(dim_h)}", + ) + torch._check( + output_d == grad_output.size(dim_d), + lambda: f"grad_output depth unexpected. Expected: {output_d}, Got: {grad_output.size(dim_d)}", + ) + + return input.new_empty(input.shape) + + +@register_meta(aten._pdist_forward) +@out_wrapper() +def meta__pdist_forward(self: Tensor, p: float = 2) -> Tensor: + torch._check( + self.is_contiguous(), lambda: "_pdist_forward requires contiguous input" + ) + n = self.size(0) + if n <= 1: + return self.new_empty([0]).to(memory_format=torch.legacy_contiguous_format) # type: ignore[call-overload] + else: + return self.new_empty((n * (n - 1) // 2,)).to( + memory_format=torch.legacy_contiguous_format + ) # type: ignore[call-overload] + + +@register_meta(aten._pdist_backward) +@out_wrapper() +def meta__pdist_backward(grad: Tensor, self: Tensor, p: float, pdist: Tensor) -> Tensor: + torch._check( + self.is_contiguous(), lambda: "_pdist_backward requires self to be contiguous" + ) + torch._check( + pdist.is_contiguous(), lambda: "_pdist_backward requires pdist to be contiguous" + ) + return torch.empty_like(self, memory_format=torch.legacy_contiguous_format) + + +@register_meta([aten.baddbmm.default, aten.baddbmm.out]) +@out_wrapper() +def meta_baddbmm(self, batch1, batch2, *, beta=1, alpha=1): + dim1 = batch1.size(0) + dim2 = batch1.size(1) + dim3 = batch2.size(2) + self = self.expand((dim1, dim2, dim3)) + torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor") + torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor") + torch._check( + self.dtype == batch1.dtype == batch2.dtype, + lambda: f"Input dtypes must be the same, got: input: {self.dtype}, batch1: {batch1.dtype}, batch2: {batch2.dtype}", + ) + batch1_sizes = batch1.shape + batch2_sizes = batch2.shape + bs = batch1_sizes[0] + contraction_size = batch1_sizes[2] + torch._check( + batch2_sizes[0] == bs and batch2_sizes[1] == contraction_size, + lambda: ( + f"Expected size for first two dimensions of batch2 tensor to be: " + f"[{bs}, {contraction_size}] but got: [{batch2_sizes[0]}, {batch2_sizes[1]}]." + ), + ) + return self.new_empty(self.size()) + + +@register_meta([aten.bernoulli.default, aten.bernoulli.out]) +@out_wrapper() +def meta_bernoulli(self, *, generator=None): + # https://github.com/pytorch/pytorch/issues/88612 + return torch.empty_like(self).contiguous() + + +@register_meta(aten.bernoulli_.float) +def meta_bernoulli_(self, p=0.5, generator=None): + return self + + +@register_meta(aten.bernoulli.p) +def meta_bernoulli_p(self, p=0.5, generator=None): + # https://github.com/pytorch/pytorch/issues/88612 + return torch.empty_like(self).contiguous() + + +@register_meta(aten._fused_moving_avg_obs_fq_helper.default) +def meta__fused_moving_avg_obs_fq_helper( + self, + observer_on, + fake_quant_on, + running_min, + running_max, + scale, + zero_point, + averaging_const, + quant_min, + quant_max, + ch_axis, + per_row_fake_quant=False, + symmetric_quant=False, +): + torch._check( + ch_axis < self.dim(), + lambda: "Error in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()", + ) + mask = torch.empty_like(self, dtype=torch.bool) + return (torch.empty_like(self), mask) + + +@register_meta(aten.mm) +@out_wrapper() +def meta_mm(a, b): + torch._check(a.dim() == 2, lambda: "a must be 2D") + torch._check(b.dim() == 2, lambda: "b must be 2D") + N, M1 = a.shape + M2, P = b.shape + torch._check( + M1 == M2, + lambda: f"a and b must have same reduction dim, but got [{N}, {M1}] X [{M2}, {P}].", + ) + return a.new_empty(N, P) + + +def _compute_reduction_shape(self, dims, keepdim): + if keepdim: + return tuple(self.shape[i] if i not in dims else 1 for i in range(self.ndim)) + + return utils.compute_reduction_output_shape(self.shape, dims) + + +# FakeTensors (meta tensors with a device) will report device as meta +# when running meta kernels. Here, access the "fake device" of FakeTensor if it +# exists so meta kernels which have diverge per device will be more +# accurate when run with FakeTensors +def device_hint(tensor) -> "str": + if isinstance(tensor, torch._subclasses.FakeTensor): + return tensor.fake_device.type + else: + return "cuda" # default to cuda + + +def calc_conv_nd_return_shape( + input_tensor: torch.Tensor, + weight: torch.Tensor, + stride: Union[List[int], int], + padding: Union[List[int], int], + dilation: Union[List[int], int], + is_transposed: bool, + groups: int, + output_padding: Optional[Union[List[int], int]] = None, +): + def _formula(ln: int, p: int, d: int, k: int, s: int) -> int: + """ + Formula to apply to calculate the length of some dimension of the output + + See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + + Args: + ln: length of the dimension + p: padding in that dim + d: dilation in that dim + k: kernel size in that dim + s: stride in that dim + Returns: + The output length + """ + return (ln + 2 * p - d * (k - 1) - 1) // s + 1 + + def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int: + """ + Formula to apply to calculate the length of some dimension of the output + if transposed convolution is used. + See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html + + Args: + ln: length of the dimension + p: padding in that dim + d: dilation in that dim + k: kernel size in that dim + s: stride in that dim + op: output padding in that dim + + Returns: + The output length + """ + return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1 + + kernel_size = weight.shape[2:] + dims = input_tensor.shape[2:] + if is_transposed: + out_channels = groups * weight.shape[1] + else: + out_channels = weight.shape[0] + if weight.shape[1] * groups != input_tensor.shape[1]: + raise RuntimeError("Invalid channel dimensions") + + ret_shape = [input_tensor.shape[0], out_channels] + if isinstance(stride, IntLike): + stride = [stride] * len(dims) + elif len(stride) == 1: + stride = [stride[0]] * len(dims) + + if isinstance(padding, IntLike): + padding = [padding] * len(dims) + elif len(padding) == 1: + padding = [padding[0]] * len(dims) + + if isinstance(dilation, IntLike): + dilation = [dilation] * len(dims) + elif len(dilation) == 1: + dilation = [dilation[0]] * len(dims) + + output_padding_list: Optional[List[int]] = None + if output_padding: + if isinstance(output_padding, IntLike): + output_padding_list = [output_padding] * len(dims) + elif len(output_padding) == 1: + output_padding_list = [output_padding[0]] * len(dims) + else: + output_padding_list = output_padding + + for i in range(len(dims)): + # If output_padding is present, we are dealing with a transposed convolution + if output_padding_list: + ret_shape.append( + _formula_transposed( + dims[i], + padding[i], + dilation[i], + kernel_size[i], + stride[i], + output_padding_list[i], + ) + ) + else: + ret_shape.append( + _formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i]) + ) + + return ret_shape + + +def is_channels_last(ten): + return torch._prims_common.suggest_memory_format(ten) == torch.channels_last + + +@register_meta(aten.convolution.default) +def meta_conv( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: List[int], + padding: List[int], + dilation: List[int], + is_transposed: bool, + output_padding: List[int], + groups: int, +): + def pick_memory_format(): + if device_hint(input_tensor) == "cuda": + if is_channels_last(input_tensor) or is_channels_last(weight): + return torch.channels_last + else: + if is_channels_last(input_tensor): + return torch.channels_last + if input_tensor.is_contiguous(memory_format=torch.contiguous_format): + return torch.contiguous_format + elif input_tensor.is_contiguous(memory_format=torch.preserve_format): + return torch.preserve_format + + shape_out = calc_conv_nd_return_shape( + input_tensor, + weight, + stride, + padding, + dilation, + is_transposed, + groups, + output_padding if is_transposed else None, + ) + + input_channels_dim = 1 + output_channels_dim = 1 + if input_tensor.size(input_channels_dim) == 0: + shape_out[output_channels_dim] = 0 + + out = input_tensor.new_empty(shape_out) + out = out.to(memory_format=pick_memory_format()) # type: ignore[call-overload] + return out + + +if torch._C._has_mkldnn: + _meta_lib_dont_use_me_use_register_meta_for_mkldnn = torch.library.Library( + "mkldnn", "IMPL", "Meta" + ) + + @register_meta(torch.ops.mkldnn._convolution_pointwise.default) + def meta_mkldnn_convolution_default( + input_tensor, + weight, + bias, + padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ): + shape_out = calc_conv_nd_return_shape( + input_tensor, weight, stride, padding, dilation, False, groups, [] + ) + out = input_tensor.new_empty(shape_out) + out_memory_format = torch.channels_last + out = out.to(memory_format=out_memory_format) # type: ignore[call-overload] + return out + + @register_meta(torch.ops.mkldnn._linear_pointwise.default) + def meta_linear_pointwise_default( + input_tensor, weight, bias, attr, scalars, algorithm + ): + return input_tensor.new_empty((*input_tensor.shape[:-1], weight.shape[0])) + + if torch._C.has_mkl: + _meta_lib_dont_use_me_use_register_meta_for_mkl = torch.library.Library( + "mkl", "IMPL", "Meta" + ) + + @register_meta(torch.ops.mkl._mkl_linear) + def meta_mkl_linear( + input_tensor, + packed_weight, + orig_weight, + bias, + batch_size, + ): + return input_tensor.new_empty( + (*input_tensor.shape[:-1], orig_weight.shape[0]) + ) + + _meta_lib_dont_use_me_use_register_meta_for_onednn = torch.library.Library( + "onednn", "IMPL", "Meta" + ) + + @register_meta(torch.ops.onednn.qconv2d_pointwise.default) + def meta_qconv2d_pointwise( + x, + x_scale, + x_zp, + w, # prepacked_weight + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + attr, + scalars, + algorithm, + ): + shape_out = calc_conv_nd_return_shape( + x, + w, + stride, + padding, + dilation, + False, + groups, + None, + ) + assert output_dtype in [torch.float32, torch.bfloat16] + out = x.new_empty(shape_out, dtype=output_dtype) + out = out.to(memory_format=torch.channels_last) + return out + + @register_meta(torch.ops.onednn.qlinear_pointwise.default) + @register_meta(torch.ops.onednn.qlinear_pointwise.tensor) + def meta_qlinear_pointwise( + x, + x_scale, + x_zp, + w, + w_scale, + w_zp, + bias, + output_scale, + output_zero_point, + output_dtype, + post_op_name, + post_op_args, + post_op_algorithm, + ): + output_shape = list(x.shape) + # The weight has been transposed during the qlinear weight prepack process. + output_shape[-1] = w.shape[1] + assert output_dtype in [torch.float32, torch.bfloat16] + out = x.new_empty(output_shape, dtype=output_dtype) + return out + + _meta_lib_dont_use_me_use_register_meta_for_quantized = torch.library.Library( + "quantized", "IMPL", "Meta" + ) + + @register_meta(torch.ops.quantized.max_pool2d) + def meta_quantized_max_pool2d( + input, + kernel_size, + stride=(), + padding=(0,), + dilation=(1,), + ceil_mode=False, + ): + ( + nInputPlane, + outputHeight, + outputWidth, + ) = max_pool2d_checks_and_compute_shape( + input, kernel_size, stride, padding, dilation, ceil_mode + ) + nbatch = input.size(-4) if input.dim() == 4 else 1 + memory_format = torch.channels_last + if input.dim() == 3: + size = [nInputPlane, outputHeight, outputWidth] + else: + size = [nbatch, nInputPlane, outputHeight, outputWidth] + return torch.empty( + size, + dtype=input.dtype, + device=input.device, + memory_format=memory_format, + ) + + +# from check_dim_size() in aten/src/ATen/TensorUtils.cpp. +def check_dim_size(tensor, dim, dim_size, size): + torch._check( + tensor.dim() == dim and tensor.shape[dim_size] == size, + lambda: f"Expected a tensor of dimension {dim} and tensor.size[{dim_size}] == {size}, " + + f"but got : dimension {tensor.dim()} and tensor.size[{dim_size}] = {tensor.shape[dim_size]}", + ) + + +@register_meta(aten.avg_pool2d.default) +def meta_avg_pool2d( + input, + kernel_size, + stride=(), + padding=(0,), + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + def unpack(name, val): + torch._check( + len(val) in [1, 2], + lambda: f"avg_pool2d: {name} must either be a single int, or a tuple of two ints", + ) + H = val[0] + W = H if len(val) == 1 else val[1] + return H, W + + kH, kW = unpack("kernel_size", kernel_size) + torch._check( + len(stride) in [0, 1, 2], + lambda: "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints", + ) + if len(stride) == 0: + dH, dW = kH, kW + elif len(stride) == 1: + dH, dW = stride[0], stride[0] + else: + dH, dW = unpack("stride", stride) + + padH, padW = unpack("padding", padding) + + torch._check( + divisor_override is None or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + nbatch = input.size(-4) if input.dim() == 4 else 1 + nInputPlane = input.size(-3) + inputHeight = input.size(-2) + inputWidth = input.size(-1) + + outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode) + outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode) + + memory_format = utils.suggest_memory_format(input) + pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + 1, + 1, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format, + ) + + if input.dim() == 3: + size = [nInputPlane, outputHeight, outputWidth] + else: + size = [nbatch, nInputPlane, outputHeight, outputWidth] + return torch.empty( + size, + dtype=input.dtype, + device=input.device, + memory_format=memory_format, + ) + + +# from avg_pool2d_backward_shape_check() in aten/src/ATen/native/Pool.h. +def avg_pool2d_backward_shape_check( + input, + gradOutput, + nbatch, + kH, + kW, + dH, + dW, + padH, + padW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + mem_format, +): + pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + 1, + 1, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + mem_format, + ) + + ndim = input.dim() + nOutputPlane = nInputPlane + + check_dim_size(gradOutput, ndim, ndim - 3, nOutputPlane) + check_dim_size(gradOutput, ndim, ndim - 2, outputHeight) + check_dim_size(gradOutput, ndim, ndim - 1, outputWidth) + + +# Don't override the C++ registration. +@register_meta(aten.avg_pool2d_backward.default) +def meta_avg_pool2d_backward( + gradOutput_, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, +): + # From aten/src/ATen/native/AveragePool2d.cpp structured kernel meta func. + torch._check( + len(kernel_size) == 1 or len(kernel_size) == 2, + lambda: "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints", + ) + kH = kernel_size[0] + kW = kH if len(kernel_size) == 1 else kernel_size[1] + torch._check( + len(stride) == 0 or len(stride) == 1 or len(stride) == 2, + lambda: "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints", + ) + dH = kH if len(stride) == 0 else stride[0] + dW = kW if len(stride) == 0 else dH if len(stride) == 1 else stride[1] + torch._check( + len(padding) == 1 or len(padding) == 2, + lambda: "avg_pool2d: padding must either be a single int, or a tuple of two ints", + ) + padH = padding[0] + padW = padH if len(padding) == 1 else padding[1] + + torch._check( + divisor_override is None or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + input_size = input.shape + nbatch = input_size[-4] if input.dim() == 4 else 1 + nInputPlane = input_size[-3] + inputHeight = input_size[-2] + inputWidth = input_size[-1] + + outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode) + outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode) + + mem_format = utils.suggest_memory_format(input) + + avg_pool2d_backward_shape_check( + input, + gradOutput_, + nbatch, + kH, + kW, + dH, + dW, + padH, + padW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + mem_format, + ) + + return torch.empty( + input_size, + dtype=input.dtype, + device=input.device, + memory_format=mem_format, + ) + + +@register_meta(aten.avg_pool3d) +@out_wrapper() +def meta_avg_pool3d( + input, + kernel_size, + stride=(), + padding=(0,), + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "avg_pool3d: kernel_size must be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "avg_pool3d: padding must be a single int, or a tuple of three ints", + ) + padT = padding[0] + padH = padT if len(padding) == 1 else padding[1] + padW = padT if len(padding) == 1 else padding[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + torch._check( + not divisor_override or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + nbatch = input.size(0) + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime = pooling_output_shape(itime, kT, padT, dT, 1, ceil_mode) + oheight = pooling_output_shape(iheight, kH, padH, dH, 1, ceil_mode) + owidth = pooling_output_shape(iwidth, kW, padW, dW, 1, ceil_mode) + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + padT, + padH, + padW, + 1, + 1, + 1, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + "avg_pool3d()", + check_input_size=True, + ) + + if input.ndim == 4: + return input.new_empty((nslices, otime, oheight, owidth)) + else: + return input.new_empty((nbatch, nslices, otime, oheight, owidth)) + + +@register_meta(aten.avg_pool3d_backward) +@out_wrapper("grad_input") +def meta_avg_pool3d_backward( + grad_output, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "avg_pool3d: kernel_size must be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "avg_pool3d: padding must be a single int, or a tuple of three ints", + ) + padT = padding[0] + padH = padT if len(padding) == 1 else padding[1] + padW = padT if len(padding) == 1 else padding[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + torch._check( + not divisor_override or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime_for_shape_check = pooling_output_shape(itime, kT, padT, dT, 1, ceil_mode) + oheight_for_shape_check = pooling_output_shape(iheight, kH, padH, dH, 1, ceil_mode) + owidth_for_shape_check = pooling_output_shape(iwidth, kW, padW, dW, 1, ceil_mode) + + avg_pool3d_backward_shape_check( + input, + grad_output, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + padT, + padH, + padW, + itime, + iheight, + iwidth, + otime_for_shape_check, + oheight_for_shape_check, + owidth_for_shape_check, + "avg_pool3d_backward()", + ) + + return input.new_empty(input.shape) + + +@register_meta(aten._adaptive_avg_pool2d.default) +def meta_adaptive_avg_pool2d(self, output_size): + torch._check( + self.ndim == 3 or self.ndim == 4, + lambda: f"Expected 3D or 4D tensor, but got {self.shape}", + ) + output_shape = self.shape[:-2] + tuple(output_size) + memory_format = utils.suggest_memory_format(self) + # need to set memory_format to preserve the memory format of the input + # channel last input should have channel last output + return torch.empty( + output_shape, + dtype=self.dtype, + device=self.device, + memory_format=memory_format, + ) + + +@register_meta(aten._adaptive_avg_pool3d.default) +def meta_adaptive_avg_pool3d(self, output_size): + torch._check( + self.ndim == 4 or self.ndim == 5, + lambda: f"Expected 4D or 5D tensor, but got {self.shape}", + ) + return self.new_empty(self.shape[:-3] + tuple(output_size)) + + +@register_meta(aten._adaptive_avg_pool2d_backward.default) +def meta__adaptive_avg_pool2d_backward(grad_out, self): + ndim = grad_out.ndim + for i in range(1, ndim): + torch._check( + grad_out.size(i) > 0, + lambda: f"adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero \ + size for non-batch dimensions, {grad_out.shape} with dimension {i} being empty", + ) + torch._check( + ndim == 3 or ndim == 4, + lambda: f"adaptive_avg_pool2d_backward(): Expected 3D or 4D tensor, but got {self.shape}", + ) + torch._check( + self.dtype == grad_out.dtype, + lambda: f"expected dtype {self.dtype} for `grad_output` but got dtype {grad_out.dtype}", + ) + memory_format = torch.contiguous_format + if is_channels_last(self): + memory_format = torch.channels_last + return self.new_empty(self.shape).to(memory_format=memory_format) + + +@register_meta(aten._adaptive_avg_pool3d_backward) +@out_wrapper("grad_input") +def meta__adaptive_avg_pool3d_backward(grad_output, self): + _adaptive_pool_empty_output_check(grad_output, "adaptive_avg_pool3d_backward") + return torch.empty_like(self, memory_format=torch.legacy_contiguous_format) + + +def _adaptive_pool_empty_output_check(grad_output: Tensor, arg_name: str): + ndim = grad_output.ndim + for i in range(1, ndim): + torch._check( + grad_output.size(i) > 0, + lambda: ( + f"{arg_name}(): Expected grad_output to have non-zero size for non-batch dimensions, " + f"but grad_output has sizes {grad_output.shape} with dimension {i} being empty" + ), + ) + + +@register_meta(aten.adaptive_max_pool2d) +@out_wrapper("out", "indices") +def meta_adaptive_max_pool2d(input, output_size): + ndim = input.ndim + torch._check( + ndim in (3, 4), + lambda: f"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: {input.shape}", + ) + for i in range(1, ndim): + torch._check( + input.size(i) > 0, + lambda: ( + f"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, " + f"but input has sizes {input.shape} with dimension {i} being empty" + ), + ) + + torch._check( + len(output_size) == 2, + lambda: "adaptive_max_pool2d(): internal error: output_size.size() must be 2", + ) + + dimH = 1 + sizeB = 1 + sizeD = 0 + + if input.ndim == 4: + sizeB = input.size(0) + dimH += 1 + + sizeD = input.size(dimH - 1) + osizeH, osizeW = output_size + + if input.ndim == 3: + out_shape = (sizeD, osizeH, osizeW) + out = input.new_empty(out_shape) + indices = input.new_empty(out_shape, dtype=torch.int64) + return out, indices + else: + out_shape = (sizeB, sizeD, osizeH, osizeW) # type: ignore[assignment] + memory_format = utils.suggest_memory_format(input) + out = input.new_empty(out_shape).to(memory_format=memory_format) + indices = input.new_empty(out_shape, dtype=torch.int64).to( + memory_format=memory_format + ) + return out, indices + + +@register_meta(aten.adaptive_max_pool2d_backward) +@out_wrapper("grad_input") +def meta_adaptive_max_pool2d_backward(grad_output, input, indices): + ndim = grad_output.ndim + torch._check( + ndim in (3, 4), + lambda: f"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: {grad_output.shape}", + ) + + _adaptive_pool_empty_output_check(grad_output, "adaptive_max_pool2d_backward") + + torch._check( + input.dtype == grad_output.dtype, + lambda: f"expected dtype {input.dtype} for `grad_output` but got dtype {grad_output.dtype}", + ) + + memory_format = utils.suggest_memory_format(input) + return input.new_empty(input.shape).to(memory_format=memory_format) + + +@register_meta(aten.adaptive_max_pool3d) +@out_wrapper("out", "indices") +def meta_adaptive_max_pool3d(input, output_size): + ndim = input.ndim + torch._check( + ndim in (4, 5), + lambda: f"adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: {input.shape}", + ) + for i in range(1, ndim): + torch._check( + input.size(i) > 0, + lambda: ( + f"adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, " + f"but input has sizes {input.shape} with dimension {i} being empty" + ), + ) + + torch._check( + len(output_size) == 3, + lambda: "adaptive_max_pool3d(): internal error: output_size.size() must be 3", + ) + + dimD = 0 + sizeB = 1 + sizeD = 0 + + if ndim == 5: + sizeB = input.size(0) + dimD += 1 + + sizeD = input.size(dimD) + osizeT, osizeH, osizeW = output_size + + if ndim == 4: + out_shape = (sizeD, osizeT, osizeH, osizeW) + else: + out_shape = (sizeB, sizeD, osizeT, osizeH, osizeW) # type: ignore[assignment] + + out = input.new_empty(out_shape) + indices = input.new_empty(out_shape, dtype=torch.int64) + + return out, indices + + +@register_meta(aten.adaptive_max_pool3d_backward) +@out_wrapper("grad_input") +def meta_adaptive_max_pool3d_backward(grad_output, input, indices): + _adaptive_pool_empty_output_check(grad_output, "adaptive_max_pool3d_backward") + return input.new_empty(input.shape) + + +@register_meta(aten.repeat_interleave.Tensor) +def meta_repeat_interleave_Tensor(repeats, output_size=None): + if output_size is None: + raise RuntimeError("cannot repeat_interleave a meta tensor without output_size") + return repeats.new_empty(output_size) + + +@register_meta([aten.complex.default, aten.complex.out]) +@out_wrapper() +def meta_complex(real, imag): + assert real.dtype.is_floating_point + assert imag.dtype.is_floating_point + out_shape = _broadcast_shapes(real.shape, imag.shape) + return real.new_empty(out_shape, dtype=corresponding_complex_dtype(real.dtype)) + + +@register_meta([aten.nonzero_static.default, aten.nonzero_static.out]) +@out_wrapper() +def nonzero_static(self, *, size: int, fill_value: int = -1): + return self.new_empty((size, self.dim()), dtype=torch.long) + + +@register_meta([aten.index.Tensor, aten._unsafe_index.Tensor]) +def meta_index_Tensor(self, indices): + torch._check(bool(indices), lambda: "at least one index must be provided") + # aten::index is the internal advanced indexing implementation + # checkIndexTensorTypes and expandTensors + result: List[Optional[Tensor]] = [] + for i, index in enumerate(indices): + if index is not None: + torch._check( + index.dtype in [torch.long, torch.int, torch.int8, torch.bool], + lambda: "tensors used as indices must be long, int, byte or bool tensors", + ) + if index.dtype in [torch.int8, torch.bool]: + nonzero = index.nonzero() + k = len(result) + torch._check_index( + k + index.ndim <= self.ndim, + lambda: f"too many indices for tensor of dimension {self.ndim}", + ) + for j in range(index.ndim): + torch._check_index( + index.shape[j] == self.shape[k + j], + lambda: f"The shape of the mask {index.shape} at index {i} " + f"does not match the shape of the indexed tensor {self.shape} at index {k + j}", + ) + result.append(nonzero.select(1, j)) + else: + result.append(index) + else: + result.append(index) + indices = result + torch._check( + len(indices) <= self.ndim, + lambda: f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})", + ) + # expand_outplace + import torch._refs as refs # avoid import cycle in mypy + + indices = list(refs._maybe_broadcast(*indices)) + # add missing null tensors + while len(indices) < self.ndim: + indices.append(None) + + # hasContiguousSubspace + # true if all non-null tensors are adjacent + # See: + # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing + # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency + state = 0 + has_contiguous_subspace = False + for index in indices: + if state == 0: + if index is not None: + state = 1 + elif state == 1: + if index is None: + state = 2 + else: + if index is not None: + break + else: + has_contiguous_subspace = True + + # transposeToFront + # This is the logic that causes the newly inserted dimensions to show up + # at the beginning of the tensor, if they're not contiguous + if not has_contiguous_subspace: + dims = [] + transposed_indices = [] + for i, index in enumerate(indices): + if index is not None: + dims.append(i) + transposed_indices.append(index) + for i, index in enumerate(indices): + if index is None: + dims.append(i) + transposed_indices.append(index) + self = self.permute(dims) + indices = transposed_indices + + # AdvancedIndex::AdvancedIndex + # Now we can assume the indices have contiguous subspace + # This is simplified from AdvancedIndex which goes to more effort + # to put the input and indices in a form so that TensorIterator can + # take them. If we write a ref for this, probably that logic should + # get implemented + before_shape: List[int] = [] + after_shape: List[int] = [] + replacement_shape: List[int] = [] + for dim, index in enumerate(indices): + if index is None: + if replacement_shape: + after_shape.append(self.shape[dim]) + else: + before_shape.append(self.shape[dim]) + else: + replacement_shape = list(index.shape) + return self.new_empty(before_shape + replacement_shape + after_shape) + + +@register_meta([aten.convolution_backward.default]) +def meta_convolution_backward( + grad_output_, + input_, + weight_, + bias_sizes_opt, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_mask, +): + # High level logic taken from slow_conv3d_backward_cpu which should + # be representative of all convolution_backward impls + backend_grad_input = None + backend_grad_weight = None + backend_grad_bias = None + + if output_mask[0]: + backend_grad_input = grad_output_.new_empty(input_.size()) + if output_mask[1]: + backend_grad_weight = grad_output_.new_empty(weight_.size()) + if output_mask[2]: + backend_grad_bias = grad_output_.new_empty(bias_sizes_opt) + + return (backend_grad_input, backend_grad_weight, backend_grad_bias) + + +@register_meta([aten.addbmm.default, aten.addbmm.out]) +@out_wrapper() +def meta_addbmm(self, batch1, batch2, *, beta=1, alpha=1): + dim1 = batch1.size(1) + dim2 = batch2.size(2) + self = self.expand((dim1, dim2)) + torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor") + torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor") + torch._check( + batch1.size(0) == batch2.size(0), + lambda: f"batch1 and batch2 must have same number of batches, got {batch1.size(0)} and {batch2.size(0)}", + ) + torch._check( + batch1.size(2) == batch2.size(1), + lambda: ( + f"Incompatible matrix sizes for bmm ({batch1.size(1)}x{batch1.size(2)} " + f"and {batch2.size(1)}x{batch2.size(2)})" + ), + ) + torch._check( + self.size(0) == dim1 and self.size(1) == dim2, + lambda: "self tensor does not match matmul output shape", + ) + return self.new_empty(self.size()) + + +def register_meta_foreach(ops): + def wrapper(fn): + def register(op): + op_name = str(op).split(".")[1] + scalar_op = getattr(aten, op_name.replace("_foreach_", "")) + + _add_op_to_registry( + meta_table, + op, + partial( + fn, + _scalar_op=scalar_op, + ), + ) + + pytree.tree_map_(register, ops) + return fn + + return wrapper + + +@register_meta_foreach( + [ + aten._foreach_abs, + aten._foreach_acos, + aten._foreach_asin, + aten._foreach_atan, + aten._foreach_ceil, + aten._foreach_cos, + aten._foreach_cosh, + aten._foreach_erf, + aten._foreach_erfc, + aten._foreach_exp, + aten._foreach_expm1, + aten._foreach_frac, + aten._foreach_floor, + aten._foreach_lgamma, + aten._foreach_log, + aten._foreach_log10, + aten._foreach_log1p, + aten._foreach_log2, + aten._foreach_neg, + aten._foreach_norm, + aten._foreach_reciprocal, + aten._foreach_round, + aten._foreach_sigmoid, + aten._foreach_sign, + aten._foreach_sin, + aten._foreach_sinh, + aten._foreach_sqrt, + aten._foreach_tan, + aten._foreach_tanh, + aten._foreach_trunc, + aten._foreach_zero, + aten._foreach_add, + aten._foreach_sub, + aten._foreach_mul, + aten._foreach_div, + aten._foreach_clamp_min, + aten._foreach_clamp_max, + aten._foreach_lerp, + ], +) +def _meta_foreach_out_of_place(*args, _scalar_op=None, **kwargs): + torch._check( + isinstance(args[0], list), + lambda: (f"The first argument must be List[Tensor], but got {type(args[0])}."), + ) + + nelem = len(args[0]) + torch._check( + nelem > 0, + lambda: ("Tensor list must have at least one tensor."), + ) + + nlists = 1 + for iarg, arg in enumerate(args[1:]): + if isinstance(arg, list): + nlists += 1 + torch._check( + len(arg) == nelem, + lambda: ( + f"self and argument-{iarg+2} must match in length, " + f"but got {nelem} and {len(arg)}." + ), + ) + elif isinstance(arg, Tensor): + torch._check( + arg.dim() == 0 and arg.numel() == 1, + lambda: ( + "scalar tensor expected to be 0 dim but it has " + f"{arg.dim()} dimensions and {arg.numel()} elements." + ), + ) + else: + break + + result = [] + for elem in range(nelem): + each_args = [args[i][elem] for i in range(nlists)] + result.append(_scalar_op(*each_args, *args[nlists:], **kwargs)) + + return result + + +@register_meta_foreach( + [ + aten._foreach_abs_, + aten._foreach_acos_, + aten._foreach_asin_, + aten._foreach_atan_, + aten._foreach_ceil_, + aten._foreach_cos_, + aten._foreach_cosh_, + aten._foreach_erf_, + aten._foreach_erfc_, + aten._foreach_exp_, + aten._foreach_expm1_, + aten._foreach_frac_, + aten._foreach_floor_, + aten._foreach_lgamma_, + aten._foreach_log_, + aten._foreach_log10_, + aten._foreach_log1p_, + aten._foreach_log2_, + aten._foreach_neg_, + aten._foreach_reciprocal_, + aten._foreach_round_, + aten._foreach_sigmoid_, + aten._foreach_sign_, + aten._foreach_sin_, + aten._foreach_sinh_, + aten._foreach_sqrt_, + aten._foreach_tan_, + aten._foreach_tanh_, + aten._foreach_trunc_, + aten._foreach_zero_, + aten._foreach_add_, + aten._foreach_sub_, + aten._foreach_mul_, + aten._foreach_div_, + aten._foreach_clamp_min_, + aten._foreach_clamp_max_, + aten._foreach_lerp_, + aten._foreach_copy_, + ] +) +def _meta_foreach_inplace(*args, _scalar_op=None, **kwargs): + _meta_foreach_out_of_place(*args, _scalar_op=_scalar_op, **kwargs) + return + + +@register_meta([aten._foreach_pow.ScalarAndTensor]) +def meta__foreach_pow_scalar_and_tensor(self, exponent): + # Only foreach_pow has a ScalarAndTensor method and needs special + # handling because it does not work with _meta_foreach_out_of_place. + torch._check( + isinstance(exponent, List), + lambda: f"exponent must be a tensor list but got {type(exponent)}", + ) + return [torch.empty_like(e) for e in exponent] + + +def _check_foreach_binop_tensor_lists(self, other): + torch._check( + isinstance(self, List) and isinstance(other, List), + lambda: ( + "The first two arguments of must be List[Tensor], " + f"but got {type(self)} and {type(other)}." + ), + ) + torch._check( + len(self) > 0 and len(self) == len(other), + lambda: ( + "self and other must be non-empty and match in length, " + f"but got {len(self)} and {len(other)}." + ), + ) + + +@register_meta( + [ + aten._foreach_maximum, + aten._foreach_minimum, + ] +) +def meta__foreach_binop_scalar(*args): + # aten.maximum(Tensor, Scalar) does not exist. + return _meta_foreach_out_of_place(*args, _scalar_op=aten.clamp_min) + + +@register_meta( + [ + aten._foreach_maximum_, + aten._foreach_minimum_, + ] +) +def meta__foreach_binop__scalar(*args): + # aten.maximum(Tensor, Scalar) does not exist + _meta_foreach_inplace(*args, _scalar_op=aten.clamp_min_) + return + + +@register_meta( + [ + aten._foreach_addcdiv.Scalar, + aten._foreach_addcmul.Scalar, + ] +) +def meta__foreach_addcop_scalar(self, tensor1, tensor2, scalar=1): + # forach_addcdiv and addcdiv have different signatures and + # cannot use _meta_foreach_out_of_place. + torch._check( + all(isinstance(l, List) for l in [self, tensor1, tensor2]), + lambda: ( + "All arguments must be List[Tensor], " + f"but got {type(self)}, {type(tensor1)}, and {type(tensor2)}" + ), + ) + torch._check(len(self) > 0, lambda: "input tensor list must not be empty.") + torch._check( + len(self) == len(tensor1) and len(self) == len(tensor2), + lambda: "All input tensor lists must have the same length", + ) + + return [torch.empty_like(s) for s in self] + + +@register_meta([aten._foreach_addcdiv_.Tensor, aten._foreach_addcmul_.Tensor]) +def meta__foreach_addcop_tensor(self, tensor1, tensor2, scalars): + torch._check( + all(isinstance(l, List) for l in [self, tensor1, tensor2]) + and isinstance(scalars, torch.Tensor), + lambda: ( + "_foreach_addc*_ op expects arguments of type: List[Tensor], List[Tensor], List[Tensor], tensor, " + f"but got: {type(self)}, {type(tensor1)}, {type(tensor2)}, and {type(scalars)}" + ), + ) + torch._check(len(self) > 0, lambda: "input tensor list must not be empty.") + torch._check( + len(self) == len(tensor1) and len(self) == len(tensor2), + lambda: "All input tensor lists must have the same length", + ) + + +@register_meta( + [ + aten._foreach_addcdiv_.Scalar, + aten._foreach_addcmul_.Scalar, + ] +) +def meta__foreach_addcop__scalar(self, tensor1, tensor2, scalar=1): + torch._check( + all(isinstance(l, List) for l in [self, tensor1, tensor2]), + lambda: ( + "All arguments of _foreach_addc*_ must be List[Tensor], " + f"but got {type(self)}, {type(tensor1)}, and {type(tensor2)}" + ), + ) + torch._check(len(self) > 0, lambda: "input tensor list must not be empty.") + torch._check( + len(self) == len(tensor1) and len(self) == len(tensor2), + lambda: "All input tensor lists must have the same length", + ) + + +@register_meta([aten._fused_adam_.default]) +def meta__fused_adam_( + self, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + *, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale=None, + found_inf=None, +): + for l in [self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]: + torch._check( + isinstance(l, List), + lambda: f"exponent must be a tensor list but got {type(l)}", + ) + + +@register_meta([aten._fused_adam.default]) +def meta__fused_adam( + self, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + *, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale=None, + found_inf=None, +): + for l in [self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]: + torch._check( + isinstance(l, List), + lambda: f"exponent must be a tensor list but got {type(l)}", + ) + + def empty_like_list(tensor_list): + return [torch.empty_like(t) for t in tensor_list] + + return ( + empty_like_list(self), + empty_like_list(grads), + empty_like_list(exp_avgs), + empty_like_list(exp_avg_sqs), + empty_like_list(max_exp_avg_sqs), + ) + + +@register_meta([aten._int_mm]) +@out_wrapper() +def meta__int_mm(a, b): + torch._check(a.dim() == 2, lambda: "a must be a 2D tensor") + torch._check(b.dim() == 2, lambda: "b must be a 2D tensor") + torch._check( + a.dtype is torch.int8, + lambda: f"expected self to be int8, got {a.dtype}", + ) + torch._check( + b.dtype is torch.int8, + lambda: f"expected mat2 to be int8, got {b.dtype}", + ) + torch._check( + a.size(1) == b.size(0), + lambda: ( + f"Incompatible matrix sizes for _int_mm ({a.size(0)}x{a.size(1)} " + f"and {b.size(0)}x{b.size(1)})" + ), + ) + return a.new_empty((a.size(0), b.size(1)), dtype=torch.int32) + + +@register_meta([aten._convert_weight_to_int4pack]) +def meta__convert_weight_to_int4pack(w, inner_k_tiles): + torch._check(w.dim() == 2, lambda: "w must be a 2D tensor") + torch._check( + w.dtype is torch.int32, + lambda: f"expected w to be int32, got {w.dtype}", + ) + n = w.size(0) + k = w.size(1) + return w.new_empty( + ( + n // 8, + k // (inner_k_tiles * 16), + 32, + inner_k_tiles // 2, + ), + dtype=torch.int32, + ) + + +@register_meta([aten._weight_int4pack_mm]) +def meta__weight_int4pack_mm(x, w, q_group_size, q_scale_and_zeros): + torch._check(x.dim() == 2, lambda: "x must be a 2D tensor") + torch._check(w.dim() == 4, lambda: "w must be a 4D tensor") + torch._check( + x.dtype is torch.bfloat16, + lambda: f"expected x to be bf16, got {x.dtype}", + ) + torch._check( + w.dtype is torch.int32, + lambda: f"expected w to be int32, got {w.dtype}", + ) + return x.new_empty(x.size(0), w.size(0) * 8, dtype=x.dtype) + + +@register_meta([aten._weight_int8pack_mm]) +def meta__weight_int8pack_mm(x, w, q_scales): + torch._check(x.dim() == 2, lambda: "x must be a 2D tensor") + torch._check( + x.dtype is torch.bfloat16, + lambda: f"expected x to be bf16, got {x.dtype}", + ) + torch._check(w.dim() == 2, lambda: "w must be a 2D tensor") + torch._check( + w.dtype is torch.int8, + lambda: f"expected w to be int8, got {w.dtype}", + ) + return x.new_empty(x.size(0), w.size(0), dtype=x.dtype) + + +@register_meta(aten._cdist_forward.default) +def meta_cdist_forward(x1, x2, p, compute_mode): + torch._check( + x1.dim() >= 2, + lambda: f"cdist only supports at least 2D tensors, X1 got: {x1.dim()}D", + ) + torch._check( + x2.dim() >= 2, + lambda: f"cdist only supports at least 2D tensors, X2 got: {x2.dim()}D", + ) + torch._check( + x1.size(-1) == x2.size(-1), + lambda: f"X1 and X2 must have the same number of columns. X1: {x1.size(-1)} X2: {x2.size(-1)}", + ) + torch._check( + utils.is_float_dtype(x1.dtype), + lambda: "cdist only supports floating-point dtypes, X1 got: {x1.dtype}", + ) + torch._check( + utils.is_float_dtype(x2.dtype), + lambda: "cdist only supports floating-point dtypes, X2 got: {x2.dtype}", + ) + torch._check(p >= 0, lambda: "cdist only supports non-negative p values") + torch._check( + compute_mode in (None, 1, 2), + lambda: f"possible modes: None, 1, 2, but was: {compute_mode}", + ) + r1 = x1.size(-2) + r2 = x2.size(-2) + batch_tensor1 = x1.shape[:-2] + batch_tensor2 = x2.shape[:-2] + output_shape = list(torch.broadcast_shapes(batch_tensor1, batch_tensor2)) + output_shape.extend([r1, r2]) + return x1.new_empty(output_shape) + + +@register_meta(aten._cdist_backward) +@out_wrapper() +def meta_cdist_backward(grad, x1, x2, p, cdist): + c1 = x1.shape[-1] + r1 = x1.shape[-2] + r2 = x2.shape[-2] + batch_tensor1 = x1.shape[:-2] + batch_tensor2 = x2.shape[:-2] + expand_batch_portion = list(torch.broadcast_shapes(batch_tensor1, batch_tensor2)) + tensor1_expand_size = expand_batch_portion.copy() + tensor1_expand_size.extend([r1, c1]) + batch_product = math.prod(expand_batch_portion) + if r1 == 0 or r2 == 0 or c1 == 0 or batch_product == 0: + return torch.zeros_like(x1) + if tensor1_expand_size != list(x1.shape): + x1 = x1.expand(tensor1_expand_size) + return torch.empty_like(x1, memory_format=torch.contiguous_format) + + +# NB: This meta function accepts non-meta arguments! When this behavior +# was originally introduced this was accidental, but it is now load bearing +# as people are using this so that they can conveniently test code involving +# embeddings (feeding CPU tensor inputs with meta device EmbeddingBag module) +@register_meta(aten._embedding_bag.default) +def meta_embedding_bag( + weight, + indices, + offsets, + scale_grad_by_freq=False, + mode=0, + sparse=False, + per_sample_weights=None, + include_last_offset=False, + padding_idx=-1, +): + torch._check( + indices.dtype in (torch.long, torch.int), + lambda: f"expected indices to be long or int, got {indices.dtype}", + ) + torch._check( + offsets.dtype in (torch.long, torch.int), + lambda: f"expected offsets to be long or int, got {offsets.dtype}", + ) + torch._check( + utils.is_float_dtype(weight.dtype), + lambda: f"expected weight to be floating point type, got {weight.dtype}", + ) + + num_bags = offsets.size(0) + if include_last_offset: + torch._check( + num_bags >= 1, + lambda: "include_last_offset: numBags should be at least 1", + ) + num_bags -= 1 + + output = weight.new_empty(num_bags, weight.size(1)) + MODE_SUM, MODE_MEAN, MODE_MAX = range(3) + + if per_sample_weights is not None: + torch._check( + mode == MODE_SUM, + lambda: "embedding_bag: per_sample_weights only supported with mode='sum'", + ) + torch._check( + per_sample_weights.dtype == weight.dtype, + lambda: f"expected weight ({weight.dtype}) and per_sample_weights ({per_sample_weights.dtype}) to have same dtype", + ) + torch._check( + per_sample_weights.ndim == 1, + lambda: f"expected per_sample_weights to be 1D tensor, got {per_sample_weights.ndim}D", + ) + torch._check( + per_sample_weights.numel() == indices.numel(), + lambda: ( + f"expected per_sample_weights.numel() ({per_sample_weights.numel()} " + f"to be the same as indices.numel() ({indices.numel()})" + ), + ) + + def is_fast_path_index_select_scale(src, scale, output, padding_idx): + return ( + is_fast_path_index_select(src, output, padding_idx) and scale.stride(0) == 1 + ) + + def is_fast_path_index_select(src, output, padding_idx): + return ( + (src.dtype == torch.float or src.dtype == torch.half) + and src.stride(1) == 1 + and output.stride(1) == 1 + and padding_idx < 0 + ) + + def is_fast_path(src, scale, output, padding_idx): + if scale is not None: + return is_fast_path_index_select_scale(src, scale, output, padding_idx) + else: + return is_fast_path_index_select(src, output, padding_idx) + + if device_hint(offsets) != "cpu": + offset2bag = indices.new_empty(indices.size(0)) + bag_size = indices.new_empty(offsets.size()) + if mode == MODE_MAX: + max_indices = indices.new_empty(num_bags, weight.size(1)) + else: + max_indices = indices.new_empty(0) + else: + fast_path_sum = is_fast_path(weight, per_sample_weights, output, padding_idx) + if mode in (MODE_MEAN, MODE_MAX) or not fast_path_sum: + offset2bag = offsets.new_empty(indices.size(0)) + else: + offset2bag = offsets.new_empty(0) + bag_size = offsets.new_empty(num_bags) + # This part of the logic comes from make_max_indices_out in EmbeddingBag.cpp + numBags = offsets.shape[0] + if mode == MODE_MAX: + if include_last_offset: + torch._check( + numBags >= 1, + lambda: "include_last_offset: numBags should be at least 1", + ) + numBags -= 1 + max_indices = offsets.new_empty(numBags, weight.shape[1]) + else: + max_indices = offsets.new_empty(bag_size.size()) + return output, offset2bag, bag_size, max_indices + + +@register_meta(aten._embedding_bag_forward_only.default) +def meta_embedding_bag_forward_only(weight, indices, offsets, *args): + output, offset2bag, bag_size, max_indices = meta_embedding_bag( + weight, indices, offsets, *args + ) + if device_hint(offsets) == "cpu": + bag_size = offsets.new_empty(offsets.size()) + return output, offset2bag, bag_size, max_indices + + +def _get_reduction_dtype(input, dtype, promote_int_to_long=True): + # if specified, dtype takes precedence + if dtype: + return dtype + + if input.dtype.is_floating_point or input.dtype.is_complex: + return input.dtype + elif promote_int_to_long: + return torch.long + + return input.dtype + + +@register_meta([aten.nansum.default, aten.nansum.out]) +@out_wrapper() +def meta_nansum(input, dims=None, keepdim=False, *, dtype=None): + output_dtype = _get_reduction_dtype(input, dtype, promote_int_to_long=True) + dims = utils.reduction_dims(input.shape, dims) + output_shape = _compute_reduction_shape(input, dims, keepdim) + return input.new_empty(output_shape, dtype=output_dtype) + + +@register_meta([aten.median.default, aten.nanmedian.default]) +def meta_median(input): + output_shape = utils.compute_reduction_output_shape( + input.shape, tuple(range(input.dim())) + ) + return input.new_empty(output_shape) + + +@register_meta( + [ + aten.median.dim, + aten.median.dim_values, + aten.nanmedian.dim, + aten.nanmedian.dim_values, + aten.mode.default, + aten.mode.values, + ] +) +@out_wrapper("values", "indices") +def meta_median_mode_dim(input, dim=-1, keepdim=False): + if device_hint(input) == "cuda": + utils.alert_not_deterministic("median CUDA with indices output") + dim = utils.reduction_dims(input.shape, (dim,)) + output_shape = _compute_reduction_shape(input, dim, keepdim) + return ( + input.new_empty(output_shape), + input.new_empty(output_shape, dtype=torch.long), + ) + + +@register_meta(aten.logical_not_.default) +def meta_logical_not_(self): + return self + + +@register_meta(aten.repeat.default) +def meta_repeat(self, repeats): + torch._check( + len(repeats) >= self.dim(), + lambda: "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor", + ) + # Add new leading dimensions to the tensor if the + # number of target dimensions is larger than the + # number of source dimensions. + num_new_dimensions = len(repeats) - self.dim() + padded_size = (1,) * num_new_dimensions + tuple(self.shape) + target_size = [padded_size[i] * repeats[i] for i in range(len(repeats))] + return self.new_empty(target_size) + + +@register_meta(aten.zero_.default) +def meta_zero_(self): + return self + + +@register_meta( + [ + aten.mul_.Scalar, + aten.div_.Scalar, + aten.mul_.Tensor, + aten.div_.Tensor, + aten.logical_and_.default, + aten.logical_or_.default, + aten.logical_xor_.default, + ], +) +def meta_binop_inplace(self, other): + if isinstance(other, torch.Tensor): + check_inplace_broadcast(self.shape, other.shape) + return self + + +@register_meta( + [ + aten.add_.Scalar, + aten.sub_.Scalar, + aten.add_.Tensor, + aten.sub_.Tensor, + ], +) +def meta_binop_inplace_alpha(self, other, alpha=1): + if isinstance(other, torch.Tensor): + check_inplace_broadcast(self.shape, other.shape) + return self + + +@register_meta([aten.round.default, aten.round.decimals]) +def meta_round(self, **kwargs): + return elementwise_meta( + self, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +def shift_dtype_check(fn_name, self, val): + torch._check( + utils.is_integer_dtype(self.dtype), + lambda: f"{fn_name}: Expected input tensor to have an integral dtype. Got {self.dtype}", + ) + if isinstance(val, torch.Tensor): + torch._check( + utils.is_integer_dtype(val.dtype), + lambda: f"{fn_name}: Expected shift value to have an integral dtype. Got {val.dtype}", + ) + else: + torch._check( + isinstance(val, IntLike), + lambda: f"{fn_name}: Expected shift value to be an int. Got {val}", + ) + + +@register_meta([aten.__rshift__.Tensor, aten.__rshift__.Scalar]) +def meta_rshifts(self, other): + shift_dtype_check("rshift", self, other) + return elementwise_meta( + self, other, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +@register_meta([aten.__lshift__.Tensor, aten.__lshift__.Scalar]) +def meta_lshifts(self, other): + shift_dtype_check("lshift", self, other) + return elementwise_meta( + self, other, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +@register_meta(aten.zero.default) +def meta_zero(self): + return self.new_empty(self.shape) + + +@register_meta([aten.fill_.Tensor, aten.fill_.Scalar]) +def meta_fill_(self, val): + return self + + +@register_meta([aten.fill.Tensor, aten.fill.Scalar]) +def meta_fill(self, val): + return torch.empty_like(self) + + +@register_meta(aten.relu_.default) +def meta_relu_(self): + return self + + +@register_meta([aten.index_put.default, aten._unsafe_index_put.default]) +def meta_index_put(self, indices, values, accumulate=False): + return torch.empty_like(self) + + +@register_meta(aten.masked_fill_.Scalar) +def meta_masked_fill_(self, mask, value): + check_inplace_broadcast(self.shape, mask.shape) + return self + + +@register_meta(aten.masked_scatter_) +def meta_masked_scatter_(self, mask, source): + torch._check( + mask.dtype in (torch.bool, torch.uint8), lambda: "Mask must be bool or uint8" + ) + torch._check( + self.dtype == source.dtype, + lambda: "masked_scatter: expected self and source to have same " + "dtypes but got {self.dtype} and {source.dtype}", + ) + return self + + +@register_meta(aten.masked_scatter) +@out_wrapper() +def meta_masked_scatter(self, mask, source): + self, mask = _maybe_broadcast(self, mask) + output = torch.empty_like(self, memory_format=torch.contiguous_format) + return meta_masked_scatter_(output, mask, source) + + +@register_meta(aten.masked_scatter_backward) +def meta_masked_scatter_backward(self, mask, sizes): + return self.new_empty(sizes) + + +@register_meta(aten.index_put_.default) +def meta_index_put_(self, indices, values, accumulate=False): + return self + + +@register_meta(aten.alias.default) +def meta_alias(self): + return self.view(self.shape) + + +def common_meta_baddbmm_bmm(batch1, batch2, is_bmm, self_baddbmm=None): + torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor") + torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor") + + batch1_sizes = batch1.size() + batch2_sizes = batch2.size() + + bs = batch1_sizes[0] + contraction_size = batch1_sizes[2] + res_rows = batch1_sizes[1] + res_cols = batch2_sizes[2] + output_size = (bs, res_rows, res_cols) + + torch._check( + batch2_sizes[0] == bs and batch2_sizes[1] == contraction_size, + lambda: f"Expected size for first two dimensions of batch2 tensor to be: [{bs}" + f", {contraction_size}] but got: [{batch2_sizes[0]}, {batch2_sizes[1]}].", + ) + + # TODO: handle out + + output = batch2.new_empty(output_size) + + if not is_bmm and self_baddbmm is not None: + torch._check(self_baddbmm.dim() == 3, lambda: "self must be a 3D tensor") + torch._check( + self_baddbmm.size() == output_size, + lambda: f"Expected an input tensor shape with shape {output_size} but got shape: {self_baddbmm.size()}", + ) + + return output + + +@register_meta(aten.bmm.default) +def meta_bmm(self, mat2): + return common_meta_baddbmm_bmm(self, mat2, True) + + +def div_rtn(x, y): + q = x // y + r = x % y + # WARNING: explicit bool conversion here is necessary; + # would be fixed by SymBool + if r != 0 and (bool(r < 0) != bool(y < 0)): + q -= 1 + return q + + +def pooling_output_shape_pad_lr( + inputSize, kernelSize, pad_l, pad_r, stride, dilation, ceil_mode +): + outputSize = ( + div_rtn( + inputSize + + pad_l + + pad_r + - dilation * (kernelSize - 1) + - 1 + + (stride - 1 if ceil_mode else 0), + stride, + ) + + 1 + ) + if ceil_mode: + if (outputSize - 1) * stride >= inputSize + pad_l: + outputSize -= 1 + return outputSize + + +def pooling_output_shape(inputSize, kernelSize, pad, stride, dilation, ceil_mode): + torch._check(stride != 0, lambda: "stride should not be zero") + torch._check(pad >= 0, lambda: f"pad must be non-negative, but got pad: {pad}") + torch._check( + pad <= ((kernelSize - 1) * dilation + 1) // 2, + lambda: ( + f"pad should be at most half of effective kernel size, but got pad={pad}, " + f"kernel_size={kernelSize} and dilation={dilation}" + ), + ) + return pooling_output_shape_pad_lr( + inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode + ) + + +def pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format, +): + ndim = input.dim() + nOutputPlane = nInputPlane + + torch._check( + kW > 0 and kH > 0, + lambda: "kernel size should be greater than zero, but got kH: {kH}, kW: {kW}", + ) + torch._check( + dW > 0 and dH > 0, + lambda: "stride should be greater than zero, but got dH: {dH}, dW: {dW}", + ) + torch._check( + dilationH > 0 and dilationW > 0, + lambda: "dilation should be greater than zero, but got dilationH: {dilationH}, dilationW: {dilationW}", + ) + + valid_dims = input.size(1) != 0 and input.size(2) != 0 + + if memory_format == torch.channels_last: + torch._check( + ndim == 4 and valid_dims and input.size(3) != 0, + lambda: "Expected 4D (batch mode) tensor expected for input with channels_last layout" + " with optional 0 dim batch size for input, but got: {input.size()}", + ) + else: + torch._check( + (ndim == 3 and input.size(0) != 0 and valid_dims) + or (ndim == 4 and valid_dims and input.size(3) != 0), + lambda: f"Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got: {input.size()}", + ) + + torch._check( + kW // 2 >= padW and kH // 2 >= padH, + lambda: "pad should be smaller than or equal to half of kernel size, but got " + f"padW = {padW}, padH = {padH}, kW = {kW}, kH = {kH}", + ) + + torch._check( + outputWidth >= 1 and outputHeight >= 1, + lambda: f"Given input size: ({nInputPlane}x{inputHeight}x{inputWidth}). " + f"Calculated output size: ({nOutputPlane}x{outputHeight}x{outputWidth}). " + "Output size is too small", + ) + + +def pool3d_shape_check( + input: Tensor, + nslices: int, + kT: int, + kH: int, + kW: int, + dT: int, + dH: int, + dW: int, + pT: int, + pH: int, + pW: int, + dilationT: int, + dilationH: int, + dilationW: int, + itime: int, + iheight: int, + iwidth: int, + otime: int, + oheight: int, + owidth: int, + fn_name: str, + check_input_size: bool = False, +): + ndim = input.ndim + + torch._check( + kT > 0 and kW > 0 and kH > 0, + lambda: ( + f"kernel size should be greater than zero, but got " + f"kT: {kT}, kH: {kH}, kW: {kW}" + ), + ) + torch._check( + dT > 0 and dW > 0 and dH > 0, + lambda: ( + f"stride should be greater than zero, but got " + f"dT: {dT}, dH: {dH}, dW: {dW}" + ), + ) + torch._check( + dilationT > 0 and dilationW > 0 and dilationH > 0, + lambda: ( + f"dilation should be greater than zero, but got " + f"dilationT: {dilationT}, dilationH: {dilationH}, dilationW: {dilationW}" + ), + ) + + torch._check( + ndim in (4, 5), + lambda: f"{fn_name}: Expected 4D or 5D tensor for input, but got: {input.shape}", + ) + + for i in range(ndim): + if ndim == 5 and i == 0: + # size of batch-dim can be 0. + continue + torch._check( + input.size(i) > 0, + lambda: ( + f"{fn_name}: Expected input's non-batch dimensions to have positive length," + f" but input has a shape of {input.shape}" + f" and non-batch dimension {input.size(i)} has length zero!" + ), + ) + + if check_input_size: # AveragePool3d + torch._check( + itime >= kT and iheight >= kH and iwidth >= kW, + lambda: ( + f"input image (T: {itime} H: {iheight} W: {iwidth}) smaller than " + f"kernel size (kT: {kT} kH: {kH} kW: {kW})" + ), + ) + + torch._check( + kT / 2 >= pT and kW / 2 >= pW and kH / 2 >= pH, + lambda: ( + f"pad should be smaller than or equal to half of kernel size, but got " + f"kT: {kT} kW: {kW} kH: {kH} padT: {pT} padW: {pW} padH: {pH}" + ), + ) + + torch._check( + otime >= 1 and owidth >= 1 and oheight >= 1, + lambda: ( + f"Given input size: ({nslices}x{itime}x{iheight}x{iwidth}). " + f"Calculated output size: ({nslices}x{otime}x{oheight}x{owidth}). " + f"Output size is too small" + ), + ) + + +def max_pool3d_backward_shape_check( + input, + grad_output, + indices, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + fn_name, +): + ndim = input.ndim + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + fn_name, + ) + + check_dim_size(grad_output, ndim, ndim - 4, nslices) + check_dim_size(grad_output, ndim, ndim - 3, otime) + check_dim_size(grad_output, ndim, ndim - 2, oheight) + check_dim_size(grad_output, ndim, ndim - 1, owidth) + + check_dim_size(indices, ndim, ndim - 4, nslices) + check_dim_size(indices, ndim, ndim - 3, otime) + check_dim_size(indices, ndim, ndim - 2, oheight) + check_dim_size(indices, ndim, ndim - 1, owidth) + + +def avg_pool3d_backward_shape_check( + input: Tensor, + grad_output: Tensor, + nslices: int, + kT: int, + kH: int, + kW: int, + dT: int, + dH: int, + dW: int, + pT: int, + pH: int, + pW: int, + itime: int, + iheight: int, + iwidth: int, + otime: int, + oheight: int, + owidth: int, + fn_name: str, +): + ndim = input.ndim + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + 1, + 1, + 1, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + fn_name, + True, + ) + + check_dim_size(grad_output, ndim, ndim - 4, nslices) + check_dim_size(grad_output, ndim, ndim - 3, otime) + check_dim_size(grad_output, ndim, ndim - 2, oheight) + check_dim_size(grad_output, ndim, ndim - 1, owidth) + + +def max_pool2d_checks_and_compute_shape( + input, kernel_size, stride, padding, dilation, ceil_mode +): + # Reference: aten/src/ATen/native/DilatedMaxPool2d.cpp + def unpack(name, val): + torch._check( + len(val) in [1, 2], + lambda: f"max_pool2d: {name} must either be a single int, or a tuple of two ints", + ) + H = val[0] + W = H if len(val) == 1 else val[1] + return H, W + + kH, kW = unpack("kernel_size", kernel_size) + + torch._check( + len(stride) in [0, 1, 2], + lambda: "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints", + ) + if len(stride) == 0: + dH, dW = kH, kW + else: + dH, dW = unpack("stride", stride) + + padH, padW = unpack("padding", padding) + dilationH, dilationW = unpack("dilation", dilation) + nInputPlane = input.size(-3) + inputHeight = input.size(-2) + inputWidth = input.size(-1) + + memory_format = utils.suggest_memory_format(input) + if memory_format == torch.channels_last: + torch._check( + input.dim() == 4, + lambda: "non-empty 4D (batch mode) tensor expected for input with channels_last layout", + ) + elif memory_format == torch.contiguous_format: + torch._check( + input.dim() in [3, 4], + lambda: "non-empty 3D or 4D (batch mode) tensor expected for input", + ) + else: + torch._check( + False, + lambda: "Unsupport memory format. Supports only ChannelsLast, Contiguous", + ) + + outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode) + outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode) + + pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format, + ) + + return nInputPlane, outputHeight, outputWidth + + +@register_meta(aten.max_pool2d_with_indices_backward.default) +def meta_max_pool2d_with_indices_backward( + grad_output, + self, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, +): + ( + nInputPlane, + outputHeight, + outputWidth, + ) = max_pool2d_checks_and_compute_shape( + self, kernel_size, stride, padding, dilation, ceil_mode + ) + + torch._check( + self.dtype == grad_output.dtype, + lambda: f"Expected dtype {self.dtype} for `gradOutput` but got dtype {grad_output.dtype}", + ) + + nOutputPlane = nInputPlane + ndim = self.ndim + + def _check_dim_size(t): + check_dim_size(t, ndim, ndim - 3, nOutputPlane) + check_dim_size(t, ndim, ndim - 2, outputHeight) + check_dim_size(t, ndim, ndim - 1, outputWidth) + + _check_dim_size(grad_output) + _check_dim_size(indices) + + memory_format = utils.suggest_memory_format(self) + return torch.empty( + self.shape, + dtype=self.dtype, + device=self.device, + memory_format=memory_format, + ) + + +@register_meta(aten.max_pool2d_with_indices.default) +def meta_max_pool2d_with_indices( + input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False +): + ( + nInputPlane, + outputHeight, + outputWidth, + ) = max_pool2d_checks_and_compute_shape( + input, kernel_size, stride, padding, dilation, ceil_mode + ) + + nbatch = input.size(-4) if input.dim() == 4 else 1 + memory_format = utils.suggest_memory_format(input) + if input.dim() == 3: + size = [nInputPlane, outputHeight, outputWidth] + else: + size = [nbatch, nInputPlane, outputHeight, outputWidth] + return ( + torch.empty( + size, + dtype=input.dtype, + device=input.device, + memory_format=memory_format, + ), + torch.empty( + size, + dtype=torch.int64, + device=input.device, + memory_format=memory_format, + ), + ) + + +@register_meta(aten.fractional_max_pool2d.default) +def meta_fractional_max_pool2d(self_, kernel_size, output_size, random_samples): + torch._check( + self_.ndim in (3, 4), + lambda: f"fractional_max_pool2d: Expected 3D or 4D tensor, but got: {self_.ndim}", + ) + ndim = self_.ndim + + for d in range(ndim - 3, ndim): + torch._check( + self_.size(d) > 0, + f"fractional_max_pool2d: Expected input to have non-zero " + f" size for non-batch dimenions, but got {self_.size()} with dimension {d} empty", + ) + + # the check and message are out of sync, but this matches the structured meta + torch._check( + len(kernel_size) == 2, + lambda: "fractional_max_pool2d: kernel_size must" + "either be a single int or tuple of Ints", + ) + torch._check( + len(output_size) == 2, + lambda: "fractional_max_pool2d: output_size must " + "either be a single int or tuple of Ints", + ) + + input_channels = self_.size(-3) + input_height = self_.size(-2) + input_width = self_.size(-1) + if ndim == 4: + input_batch = self_.size(0) + else: + input_batch = 1 + + torch._check( + self_.dtype == random_samples.dtype, + lambda: "Expect _random_samples to have the same dtype as input", + ) + torch._check( + random_samples.ndim == 3, + lambda: f"Expect _random samples to have 3 dimensions got, {random_samples.ndim}", + ) + + n = random_samples.size(0) + c = random_samples.size(1) + d = random_samples.size(2) + torch._check( + n >= input_batch, + "Expect _random_samples.size(0) no less then input batch size.", + ) + torch._check( + c == input_channels, + lambda: "Expect _random_samples.size(1) equals to input channel size.", + ) + torch._check(d == 2, lambda: f"Expect _random_samples.size(2) equals to 2 got {d}.") + + torch._check( + output_size[0] + kernel_size[0] - 1 <= input_height, + lambda: f"fractional_max_pool2d: kernel height {kernel_size[0]} is too large relative to input height {input_height}", + ) + torch._check( + output_size[1] + kernel_size[1] - 1 <= input_width, + lambda: f"fractional_max_pool2d: kernel width {kernel_size[1]} is too large relative to input width {input_width}", + ) + + if self_.dim() == 4: + size = [input_batch, input_channels, output_size[0], output_size[1]] + else: + size = [input_channels, output_size[0], output_size[1]] + + return ( + torch.empty( + size, + dtype=self_.dtype, + device=self_.device, + ), + torch.empty( + size, + dtype=torch.int64, + device=self_.device, + ), + ) + + +@register_meta(aten.max_unpool2d) +@out_wrapper() +def meta_max_unpool2d(self_, indices, output_size): + utils.alert_not_deterministic("max_unpooling2d_forward_out") + + torch._check( + indices.dtype == torch.int64, + lambda: f"elements in indices should be type int64 but got: {indices.dtype}", + ) + torch._check( + len(output_size) == 2, + lambda: ( + f"There should be exactly two elements (height, width) in output_size, " + f"but got {len(output_size)} elements." + ), + ) + + oheight, owidth = output_size + + torch._check( + self_.ndim in (3, 4), + lambda: ( + f"Input to max_unpooling2d should be a 3d or 4d Tensor, " + f"but got a tensor with {self_.ndim} dimensions." + ), + ) + torch._check( + self_.shape == indices.shape, + lambda: ( + f"Expected shape of indices to be same as that of the input tensor ({self_.shape}) " + f"but got indices tensor with shape: {indices.shape}" + ), + ) + + for i in range(1, self_.ndim): + torch._check( + self_.size(i) > 0, + lambda: ( + f"max_unpooling2d(): " + f"Expected input to have non-zero size for non-batch dimensions, " + f"but got {self_.shape} with dimension {i} being empty." + ), + ) + + self = self_.contiguous() + + if self_.ndim == 3: + nchannels = self.size(0) + result = self.new_empty((nchannels, oheight, owidth)) + else: + nbatch = self.size(0) + nchannels = self.size(1) + result = self.new_empty((nbatch, nchannels, oheight, owidth)) + + return result + + +def _max_unpooling3d_shape_check(input, indices, output_size, stride, padding, fn_name): + torch._check( + indices.dtype == torch.int64, lambda: "elements in indices should be type int64" + ) + torch._check( + input.ndim in (4, 5), + lambda: f"Input to max_unpooling3d should be a 4d or 5d Tensor, but got a tensor with {input.ndim} dimensions.", + ) + torch._check( + len(output_size) == 3, + lambda: ( + f"There should be exactly three elements (depth, height, width) in output_size, " + f"but got {len(output_size)} elements." + ), + ) + torch._check( + len(stride) == 3, + lambda: f"There should be exactly three elements (depth, height, width) in stride, but got: {len(stride)} elements.", + ) + torch._check( + len(padding) == 3, + lambda: f"There should be exactly three elements (depth, height, width) in padding, but got: {len(padding)} elements.", + ) + torch._check( + input.shape == indices.shape, + lambda: ( + f"Expected shape of indices to be same as that of the input tensor ({input.shape}) " + f"but got indices tensor with shape: {indices.shape}" + ), + ) + + for i in range(1, input.ndim): + torch._check( + input.size(i) > 0, + lambda: ( + f"{fn_name}: " + f"Expected input to have non-zero size for non-batch dimensions, " + f"but got {input.shape} with dimension {i} being empty." + ), + ) + + torch._check( + stride[0] > 0 and stride[1] > 0 and stride[2] > 0, + lambda: f"strides should be greater than zero, but got stride: {stride}", + ) + + +@register_meta(aten.max_unpool3d) +@out_wrapper() +def meta_max_unpool3d(self_, indices, output_size, stride, padding): + utils.alert_not_deterministic("max_unpooling3d_forward_out") + + _max_unpooling3d_shape_check( + self_, indices, output_size, stride, padding, "max_unpooling3d()" + ) + + self = self_.contiguous() + + odepth, oheight, owidth = output_size + + if self_.ndim == 4: + nchannels = self.size(0) + result = self.new_empty((nchannels, odepth, oheight, owidth)) + else: + nbatch = self.size(0) + nchannels = self.size(1) + result = self.new_empty((nbatch, nchannels, odepth, oheight, owidth)) + + return result + + +@register_meta(aten.max_pool3d_with_indices) +@out_wrapper("out", "indices") +def meta_max_pool3d_with_indices( + input, + kernel_size, + stride=(), + padding=(0,), + dilation=(1,), + ceil_mode=False, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "max_pool3d: kernel_size must either be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "max_pool3d: padding must either be a single int, or a tuple of three ints", + ) + pT = padding[0] + pH = pT if len(padding) == 1 else padding[1] + pW = pT if len(padding) == 1 else padding[2] + + torch._check( + len(dilation) in (1, 3), + lambda: "max_pool3d: dilation must be either a single int, or a tuple of three ints", + ) + dilationT = dilation[0] + dilationH = dilationT if len(dilation) == 1 else dilation[1] + dilationW = dilationT if len(dilation) == 1 else dilation[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + nbatch = input.size(-5) if input.ndim == 5 else 1 + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime = pooling_output_shape(itime, kT, pT, dT, dilationT, ceil_mode) + oheight = pooling_output_shape(iheight, kH, pH, dH, dilationH, ceil_mode) + owidth = pooling_output_shape(iwidth, kW, pW, dW, dilationW, ceil_mode) + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + "max_pool3d_with_indices()", + ) + + channels_last = ( + input.ndim == 5 and utils.suggest_memory_format(input) == torch.channels_last_3d + ) + if input.ndim == 4: + input_channels_last_check = input.unsqueeze(0) + channels_last = ( + not input_channels_last_check.is_contiguous() + ) and input_channels_last_check.is_contiguous( + memory_format=torch.channels_last_3d + ) + out_shape = (nslices, otime, oheight, owidth) + else: + out_shape = (nbatch, nslices, otime, oheight, owidth) # type: ignore[assignment] + + out = input.new_empty(out_shape) + indices = input.new_empty(out_shape, dtype=torch.int64) + + if channels_last: + out = out.to(memory_format=torch.channels_last_3d) + indices = indices.to(memory_format=torch.channels_last_3d) + + return out, indices + + +@register_meta(aten.max_pool3d_with_indices_backward) +@out_wrapper("grad_input") +def meta_max_pool3d_with_indices_backward( + grad_output, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "max_pool3d: kernel_size must either be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "max_pool3d: padding must either be a single int, or a tuple of three ints", + ) + pT = padding[0] + pH = pT if len(padding) == 1 else padding[1] + pW = pT if len(padding) == 1 else padding[2] + + torch._check( + len(dilation) in (1, 3), + lambda: "max_pool3d: dilation must be either a single int, or a tuple of three ints", + ) + dilationT = dilation[0] + dilationH = dilationT if len(dilation) == 1 else dilation[1] + dilationW = dilationT if len(dilation) == 1 else dilation[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime = grad_output.size(-3) + oheight = grad_output.size(-2) + owidth = grad_output.size(-1) + + max_pool3d_backward_shape_check( + input, + grad_output, + indices, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + "max_pool3d_with_indices_backward()", + ) + + channels_last = ( + input.ndim == 5 and utils.suggest_memory_format(input) == torch.channels_last_3d + ) + if input.ndim == 4: + input_channels_last_check = input.unsqueeze(0) + channels_last = ( + not input_channels_last_check.is_contiguous() + ) and input_channels_last_check.is_contiguous( + memory_format=torch.channels_last_3d + ) + + grad_input = input.new_empty(input.shape) + + if channels_last: + grad_input = grad_input.to(memory_format=torch.channels_last_3d) + + return grad_input + + +def check_grid_sampler_common(input: Tensor, grid: Tensor): + torch._check( + input.device == grid.device, + lambda: ( + f"grid_sampler(): expected input and grid to be on same device, but input " + f"is on {input.device} and grid is on {grid.device}" + ), + ) + torch._check( + input.layout == torch.strided and grid.layout == torch.strided, + lambda: ( + f"grid_sampler(): expected input and grid to have torch.strided layout, but " + f"input has {input.layout} and grid has {grid.layout}" + ), + ) + torch._check( + input.shape[0] == grid.shape[0], + lambda: ( + f"grid_sampler(): expected grid and input to have same batch size, but got " + f"input with sizes {input.shape} and grid with sizes {grid.shape}" + ), + ) + torch._check( + grid.shape[-1] == input.ndim - 2, + lambda: ( + f"grid_sampler(): expected grid to have size {input.ndim - 2} in last " + f"dimension, but got grid with sizes {grid.shape}" + ), + ) + + for i in range(2, input.ndim): + torch._check( + input.shape[i] > 0, + lambda: ( + f"grid_sampler(): expected input to have non-empty spatial dimensions, " + f"but input has sizes {input.shape} with dimension {i} being empty" + ), + ) + + +class GridSamplerInterpolation(Enum): + BILINEAR = 0 + NEAREST = 1 + BICUBIC = 2 + + +def check_grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: int): + torch._check( + input.ndim == 5 and input.ndim == grid.ndim, + lambda: ( + f"grid_sampler(): expected 5D input and grid with same number of " + f"dimensions, but got input with sizes {input.shape}" + f" and grid with sizes {grid.shape}" + ), + ) + torch._check( + not ( + input.ndim == 5 + and interpolation_mode == GridSamplerInterpolation.BICUBIC.value + ), + lambda: "grid_sampler(): bicubic interpolation only supports 4D input", + ) + + +@register_meta(aten.grid_sampler_2d_backward.default) +def grid_sampler_2d_backward_meta( + grad_output, + input, + grid, + interpolation_mode, + padding_mode, + align_corners, + output_mask, +): + input_requires_grad = output_mask[0] + if input_requires_grad: + grad_input = torch.zeros_like(input, memory_format=torch.contiguous_format) + else: + grad_input = None + grad_grid = torch.empty_like(grid, memory_format=torch.contiguous_format) + return (grad_input, grad_grid) + + +@register_meta(aten.grid_sampler_3d) +@out_wrapper() +def grid_sampler_3d( + input, + grid, + interpolation_mode, + padding_mode, + align_corners, +): + check_grid_sampler_common(input, grid) + check_grid_sampler_3d(input, grid, interpolation_mode) + N = input.shape[0] + C = input.shape[1] + out_D = grid.shape[1] + out_H = grid.shape[2] + out_W = grid.shape[3] + return input.new_empty((N, C, out_D, out_H, out_W)) + + +@register_meta(aten.grid_sampler_3d_backward) +@out_wrapper("grad_input", "grad_grid") +def grid_sampler_3d_backward( + grad_output, + input, + grid, + interpolation_mode, + padding_mode, + align_corners, + output_mask, +): + check_grid_sampler_common(input, grid) + check_grid_sampler_3d(input, grid, interpolation_mode) + input_requires_grad = output_mask[0] + if input_requires_grad: + grad_input = torch.zeros_like( + input, memory_format=torch.legacy_contiguous_format + ) + else: + grad_input = None + grad_grid = torch.empty_like(grid, memory_format=torch.legacy_contiguous_format) + return grad_input, grad_grid + + +@register_meta([aten.full.default]) +def full(size, fill_value, *args, **kwargs): + dtype = kwargs.get("dtype", None) + if not dtype: + dtype = utils.get_dtype(fill_value) + kwargs["dtype"] = dtype + return torch.empty(size, *args, **kwargs) + + +# zeros_like is special cased to work for sparse +@register_meta(aten.zeros_like.default) +def zeros_like( + self, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, +): + if layout == torch.sparse_coo: + torch._check( + memory_format is None, + lambda: "memory format option is only supported by strided tensors", + ) + + res = torch.empty( + 0, + dtype=self.dtype if dtype is None else dtype, + layout=layout, + device=self.device if device is None else device, + pin_memory=pin_memory, + ) + + if self.is_sparse: + res.sparse_resize_and_clear_( + self.size(), self.sparse_dim(), self.dense_dim() + ) + else: + res.sparse_resize_and_clear_(self.size(), self.dim(), 0) + + res._coalesced_(True) + return res + res = aten.empty_like.default( + self, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + memory_format=memory_format, + ) + # device can be not "meta" + res.fill_(0) + return res + + +@register_meta(aten.select.int) +def meta_select(self, dim, index): + ndim = self.dim() + torch._check_index( + ndim != 0, + lambda: "select() cannot be applied to a 0-dim tensor.", + ) + + dim = dim if dim >= 0 else dim + ndim + size = self.size(dim) + + torch._check_index( + not (-index > size or index >= size), + lambda: f"select(): index {index} out of range for tensor of size " + f"{self.size()} at dimension {dim}", + ) + + index = index if index >= 0 else index + size + + new_size = list(self.size()) + new_stride = list(self.stride()) + + new_storage_offset = self.storage_offset() + index * new_stride[dim] + del new_size[dim] + del new_stride[dim] + + return self.as_strided(new_size, new_stride, new_storage_offset) + + +@register_meta(aten.select_scatter.default) +def meta_select_scatter(self, src, dim, index): + return utils.clone_preserve_strides(self) + + +@register_meta(aten.slice_scatter.default) +def meta_slice_scatter(self, src, dim=0, start=None, end=None, step=1): + return utils.clone_preserve_strides(self) + + +# TODO: Deduplicate this with canonicalize_dim +def maybe_wrap_dim(dim: int, dim_post_expr: int, wrap_scalar: bool = True): + if dim_post_expr <= 0: + assert wrap_scalar + dim_post_expr = 1 + min = -dim_post_expr + max = dim_post_expr - 1 + assert not (dim < min or dim > max), f"dim {dim} out of bounds ({min}, {max})" + if dim < 0: + dim += dim_post_expr + return dim + + +def ensure_nonempty_size(t, dim): + return 1 if t.dim() == 0 else t.shape[dim] + + +# From aten/src/ATen/native/ScatterGatherChecks.h +def gather_shape_check(self, dim, index): + self_dims = max(self.dim(), 1) + index_dims = max(index.dim(), 1) + torch._check( + self_dims == index_dims, + lambda: "Index tensor must have the same number of dimensions as input tensor", + ) + for i in range(self_dims): + if i != dim: + torch._check( + ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i), + lambda: f"Size does not match at dimension {i} expected index {index.shape}" + + f" to be smaller than self {self.shape} apart from dimension {dim}", + ) + + +@register_meta(aten.gather.default) +def meta_gather(self, dim, index, sparse_grad=False): + wrapped_dim = maybe_wrap_dim(dim, self.dim()) + is_index_empty = index.numel() == 0 + if not is_index_empty: + torch._check( + index.dtype == torch.long, + lambda: f"gather(): Expected dtype int64 for index, but got {index.dtype}", + ) + gather_shape_check(self, wrapped_dim, index) + return self.new_empty(index.shape) + + +# From aten/src/ATen/native/TensorAdvancedIndexing.cpp +def get_operator_enum(reduce_, use_new_options=False): + if use_new_options: + if reduce_ == "sum": + return "REDUCE_ADD" + elif reduce_ == "prod": + return "REDUCE_MULTIPLY" + elif reduce_ == "mean": + return "REDUCE_MEAN" + elif reduce_ == "amax": + return "REDUCE_MAXIMUM" + elif reduce_ == "amin": + return "REDUCE_MINIMUM" + torch._check( + False, + lambda: "reduce argument must be either sum, prod, mean, amax or amin.", + ) + return + else: + if reduce_ == "add": + return "REDUCE_ADD" + elif reduce_ == "multiply": + return "REDUCE_MULTIPLY" + torch._check(False, lambda: "reduce argument must be either add or multiply.") + return + + +# From aten/src/ATen/native/ScatterGatherChecks.h +def scatter_gather_dtype_check(method_name, self, index, src_opt=None): + if index.numel() != 0: + torch._check( + index.dtype == torch.long, + lambda: f"{method_name}(): Expected dtype int64 for index", + ) + + if src_opt is not None: + torch._check( + self.dtype == src_opt.dtype, + lambda: f"{method_name}(): Expected self.dtype to be equal to src.dtype", + ) + + +def ensure_nonempty_dim(dim): + return max(dim, 1) + + +# From aten/src/ATen/native/ScatterGatherChecks.h +def scatter_shape_check(self, dim, index, src_opt=None): + if index.numel() == 0: + return + torch._check( + ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), + lambda: "Index tensor must have the same number of dimensions as self tensor", + ) + + is_wrong_shape = False + self_dims = ensure_nonempty_dim(self.dim()) + + # Check: index.size(d) <= self.size(d) for all d != dim + for d in range(self_dims): + index_d_size = ensure_nonempty_size(index, d) + if d == dim: + continue + if index_d_size > ensure_nonempty_size(self, d): + is_wrong_shape = True + break + + # Check: index.size(d) <= src.size(d) for all d if src is Tensor + if not is_wrong_shape and src_opt is not None: + for d in range(self_dims): + index_d_size = ensure_nonempty_size(index, d) + if index_d_size > ensure_nonempty_size(src_opt, d): + is_wrong_shape = True + break + + if src_opt is not None: + torch._check( + ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), + lambda: "Index tensor must have the same number of dimensions as self tensor", + ) + torch._check( + not is_wrong_shape, + lambda: f"Expected index {index.shape} to be smaller than self {self.shape}" + + f" apart from dimension {dim} and to be smaller than src {src_opt.shape}", + ) + else: + torch._check( + not is_wrong_shape, + lambda: f"Expected index {index.shape} to be smaller than self {self.shape}" + + f" apart from dimension {dim}", + ) + + +# From aten/src/ATen/native/TensorAdvancedIndexing.cpp +def scatter_meta_impl(self, dim, index, src=None, reduce_=None, use_new_options=False): + wrapped_dim = maybe_wrap_dim(dim, self.dim()) + scatter_gather_dtype_check("scatter", self, index, src) + scatter_shape_check(self, wrapped_dim, index, src) + if reduce_ is not None: + # Check if we have a valid reduce operator. + get_operator_enum(reduce_, use_new_options) + + +@register_meta(aten.scatter_add.default) +def meta_scatter_add(self, dim, index, src): + scatter_meta_impl(self, dim, index, src, "add") + return self.new_empty(self.shape) + + +@register_meta(aten.scatter_add_) +def meta_scatter_add_(self, dim, index, src): + scatter_meta_impl(self, dim, index, src, "add") + return self + + +@register_meta( + [ + aten.scatter.src, + aten.scatter.value, + aten.scatter.reduce, + aten.scatter.value_reduce, + ] +) +@out_wrapper() +def meta_scatter(self, dim, index, src_or_value, reduce=None): + src = src_or_value if isinstance(src_or_value, torch.Tensor) else None + scatter_meta_impl(self, dim, index, src, reduce) + return self.new_empty(self.shape) + + +@register_meta( + [ + aten.scatter_.src, + aten.scatter_.value, + aten.scatter_.reduce, + aten.scatter_.value_reduce, + ] +) +def meta_scatter_(self, dim, index, src_or_value, reduce=None): + src = src_or_value if isinstance(src_or_value, torch.Tensor) else None + scatter_meta_impl(self, dim, index, src, reduce) + return self + + +@register_meta( + [ + aten._scaled_dot_product_flash_attention_backward, + ] +) +def meta__scaled_dot_product_flash_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + out: Tensor, + logsumexp: Tensor, + cum_seq_q: Tensor, + cum_seq_k: Tensor, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + philox_seed: Tensor, + philox_offset: Tensor, + scale: Optional[float] = None, +): + grad_q = torch.empty_like(query.transpose(1, 2)).transpose(1, 2) + grad_k = torch.empty_like(key.transpose(1, 2)).transpose(1, 2) + grad_v = torch.empty_like(value.transpose(1, 2)).transpose(1, 2) + return grad_q, grad_k, grad_v + + +@register_meta( + [ + aten._scaled_dot_product_flash_attention_for_cpu, + ] +) +def meta__scaled_dot_product_flash_attention_for_cpu( + query: Tensor, + key: Tensor, + value: Tensor, + dropout_p: float = 0.0, + is_causal: bool = False, + attn_mask: Optional[Tensor] = None, + scale: Optional[float] = None, +): + batch_size = query.size(0) + num_heads = query.size(1) + max_seqlen_batch_q = query.size(2) + head_dim = query.size(3) + + attention = torch.empty( + (batch_size, max_seqlen_batch_q, num_heads, head_dim), + dtype=query.dtype, + device=query.device, + ).transpose(1, 2) + logsumexp = torch.empty( + ( + batch_size, + max_seqlen_batch_q, + num_heads, + ), + dtype=torch.float, + device=query.device, + ).transpose(1, 2) + return ( + attention, + logsumexp, + ) + + +@register_meta( + [ + aten._scaled_dot_product_flash_attention_for_cpu_backward, + ] +) +def meta__scaled_dot_product_flash_attention_for_cpu_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + out: Tensor, + logsumexp: Tensor, + dropout_p: float, + is_causal: bool, + attn_mask: Optional[Tensor] = None, + scale: Optional[float] = None, +): + # cpus's grad layout is different from cuda's, + # i.e. (batch_size, seq_len,num_heads, head_dim) + batch_size = query.size(0) + num_heads = query.size(1) + head_dim = query.size(3) + len_q = query.size(2) + len_k = key.size(2) + + grad_q = torch.empty_permuted( + (batch_size, num_heads, len_q, head_dim), + (0, 2, 1, 3), + dtype=query.dtype, + device=query.device, + ) + grad_k = torch.empty_permuted( + (batch_size, num_heads, len_k, head_dim), + (0, 2, 1, 3), + dtype=key.dtype, + device=key.device, + ) + grad_v = torch.empty_permuted( + (batch_size, num_heads, len_k, head_dim), + (0, 2, 1, 3), + dtype=value.dtype, + device=value.device, + ) + + return grad_q, grad_k, grad_v + + +@register_meta( + [ + aten._scaled_dot_product_efficient_attention_backward, + ] +) +def meta__scaled_dot_product_efficient_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + attn_bias: Optional[Tensor], + out: Tensor, + logsumexp: Tensor, + philox_seed: Tensor, + philox_offset: Tensor, + dropout_p: float, + grad_input_mask: List[bool], + is_causal: bool = False, + scale: Optional[float] = None, +): + batch_size = query.size(0) + num_heads = query.size(1) + max_q = query.size(2) + head_dim = query.size(3) + head_dim_v = value.size(3) + + max_k = key.size(2) + + grad_q = torch.empty_permuted( + (batch_size, num_heads, max_q, head_dim), + (0, 2, 1, 3), + dtype=query.dtype, + device=query.device, + ) + grad_k = torch.empty_permuted( + (batch_size, num_heads, max_k, head_dim), + (0, 2, 1, 3), + dtype=key.dtype, + device=key.device, + ) + grad_v = torch.empty_permuted( + (batch_size, num_heads, max_k, head_dim_v), + (0, 2, 1, 3), + dtype=value.dtype, + device=value.device, + ) + grad_bias = None + if attn_bias is not None and grad_input_mask[3]: + lastDim = attn_bias.size(-1) + lastDimAligned = lastDim if lastDim % 16 == 0 else lastDim + 16 - lastDim % 16 + new_sizes = list(attn_bias.size()) + new_sizes[-1] = lastDimAligned + grad_bias = torch.empty( + new_sizes, dtype=attn_bias.dtype, device=attn_bias.device + ) + grad_bias = grad_bias[..., :lastDim] + + return grad_q, grad_k, grad_v, grad_bias + + +@register_meta( + [ + aten._flash_attention_backward, + ] +) +def meta__flash_attention_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + out: Tensor, + logsumexp: Tensor, + cum_seq_q: Tensor, + cum_seq_k: Tensor, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + philox_seed: Tensor, + philox_offset: Tensor, + scale: Optional[float] = None, +): + grad_query = torch.empty_like(query) + grad_key = torch.empty_like(key) + grad_value = torch.empty_like(value) + + return grad_query, grad_key, grad_value + + +@register_meta( + [ + aten._efficient_attention_backward, + ] +) +def meta__efficient_attention_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + bias: Optional[Tensor], + cu_seqlens_q: Optional[Tensor], + cu_seqlens_k: Optional[Tensor], + max_seqlen_q: int, + max_seqlen_k: int, + logsumexp: Tensor, + dropout_p: float, + philox_seed: Tensor, + philox_offset: Tensor, + custom_mask_type: int, + bias_requires_grad: bool, + scale: Optional[float] = None, + num_splits_key: Optional[int] = None, +): + grad_query = torch.empty_like(query) + grad_key = torch.empty_like(key) + grad_value = torch.empty_like(value) + + if bias is not None: + lastDim = bias.size(-1) + lastDimAligned = lastDim if lastDim % 16 == 0 else lastDim + 16 - lastDim % 16 + new_sizes = list(bias.size()) + new_sizes[-1] = lastDimAligned + grad_bias = torch.empty(new_sizes, dtype=bias.dtype, device=bias.device) + grad_bias = grad_bias[..., :lastDim] + else: + grad_bias = torch.empty((), device=query.device) + + return grad_query, grad_key, grad_value, grad_bias + + +@register_meta([aten._scaled_mm.default]) +def meta_scaled_mm( + self: torch.Tensor, + mat2: torch.Tensor, + bias: Optional[torch.Tensor] = None, + out_dtype: Optional[torch.dtype] = None, + scale_a: Optional[torch.Tensor] = None, + scale_b: Optional[torch.Tensor] = None, + scale_result: Optional[torch.Tensor] = None, + use_fast_accum: bool = False, +): + def is_row_major(stride): + return stride[0] > stride[1] and stride[1] == 1 + + def is_col_major(shape, stride): + return stride[0] == 1 and stride[1] == shape[0] + + def is_fp8_type(dtype): + return dtype in ( + torch.float8_e4m3fn, + torch.float8_e5m2, + torch.float8_e4m3fnuz, + torch.float8_e5m2fnuz, + ) + + torch._check( + self.dim() == 2 and mat2.dim() == 2, + lambda: f"Inputs must be 2D but got self.dim()={self.dim()} and mat2.dim()={mat2.dim()}", + ) + torch._check( + is_row_major(self.stride()), + lambda: "self must be row_major", + ) + torch._check( + is_col_major(mat2.shape, mat2.stride()), + lambda: "mat2 must be col_major", + ) + torch._check( + self.size(1) % 16 == 0, + lambda: f"Expected self.size(0) to be divisible by 16, but got self.size(1)={self.size(1)}", + ) + torch._check( + mat2.size(0) % 16 == 0 and mat2.size(1) % 16 == 0, + lambda: f"Expected both dimensions of mat2 to be divisble by 16 but got {mat2.shape}", + ) + torch._check( + is_fp8_type(self.dtype) and is_fp8_type(mat2.dtype), + lambda: f"Expected both inputs to be fp8 types but got self.dtype={self.dtype} and mat2.dtype={mat2.dtype}", + ) + _out_dtype = out_dtype if out_dtype is not None else self.dtype + return torch.empty( + self.size(0), mat2.size(1), dtype=_out_dtype, device=self.device + ), torch.empty((), dtype=torch.float32, device=self.device) + + +@register_meta([aten.scatter_reduce.two, aten.scatter_reduce.two_out]) +@out_wrapper() +def meta_scatter_reduce_two(self, dim, index, src, reduce, include_self=True): + scatter_meta_impl(self, dim, index, src, reduce, use_new_options=True) + return self.new_empty(self.shape) + + +@register_meta(aten.scatter_reduce_.two) +def meta_scatter_reduce__two(self, dim, index, src, reduce, include_self=True): + scatter_meta_impl(self, dim, index, src, reduce, use_new_options=True) + return self + + +@register_meta([aten.multinomial.default, aten.multinomial.out]) +@out_wrapper() +def meta_multinomial(input, num_samples, replacement=False, *, generator=None): + torch._check( + 0 < input.dim() <= 2, + lambda: f"The probabilty distributions dimensions must be 1 or 2, but got {input.dim()}", + ) + if input.dim() == 1: + return torch.empty(num_samples, dtype=torch.long, device=input.device) + return torch.empty( + input.size(0), num_samples, dtype=torch.long, device=input.device + ) + + +def multiply_integers(vs): + r = 1 + for v in vs: + r *= v + return r + + +def upsample_common_check(input_size, output_size, num_spatial_dims): + torch._check( + len(output_size) == num_spatial_dims, + lambda: f"It is expected output_size equals to {num_spatial_dims}, but got size {len(output_size)}", + ) + expected_input_dims = num_spatial_dims + 2 # N, C, ... + torch._check( + len(input_size) == expected_input_dims, + lambda: f"It is expected input_size equals to {expected_input_dims}, but got size {len(input_size)}", + ) + + torch._check( + all(s > 0 for s in input_size[2:]) and all(s > 0 for s in output_size), + lambda: f"Input and output sizes should be greater than 0, but got " + f"input size {input_size} and output size {output_size}", + ) + + nbatch, channels = input_size[:2] + return (nbatch, channels, *output_size) + + +@register_meta( + [aten.upsample_nearest1d.default, aten._upsample_nearest_exact1d.default] +) +def upsample_nearest1d(input, output_size, scales=None): + torch._check( + input.numel() != 0 or multiply_integers(input.size()[1:]), + lambda: f"Non-empty 3D data tensor expected but got a tensor with sizes {input.size()}", + ) + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=1 + ) + return input.new_empty(full_output_size).to( + memory_format=utils.suggest_memory_format(input) + ) + + +@register_meta( + [aten.upsample_nearest2d.default, aten._upsample_nearest_exact2d.default] +) +def upsample_nearest2d(input, output_size, scales_h=None, scales_w=None): + torch._check( + input.numel() != 0 or multiply_integers(input.size()[1:]), + lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}", + ) + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=2 + ) + output = input.new_empty(full_output_size) + + # convert output to correct memory format, if necessary + memory_format = utils.suggest_memory_format(input) + + # following "heuristic: only use channels_last path when it's faster than the contiguous path" + _, n_channels, _, _ = input.shape + if input.device.type == "cuda" and n_channels < 4: + memory_format = torch.contiguous_format + + output = output.contiguous(memory_format=memory_format) + + return output + + +@register_meta( + [ + aten.upsample_nearest2d_backward.default, + aten._upsample_nearest_exact2d_backward.default, + ] +) +def upsample_nearest2d_backward( + grad_output: Tensor, + output_size: Sequence[Union[int, torch.SymInt]], + input_size: Sequence[Union[int, torch.SymInt]], + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +): + full_output_size = upsample_common_check( + input_size, output_size, num_spatial_dims=2 + ) + torch._check( + grad_output.ndim == 4, + lambda: f"Expected grad_output to be a tensor of dimension 4 but got: dimension {grad_output.ndim}", + ) + for i in range(4): + torch._check( + grad_output.size(i) == full_output_size[i], + lambda: ( + f"Expected grad_output to have the same shape as output;" + f" output.size({i}) = {full_output_size[i]}" + f" but got grad_output.size({i}) = {grad_output.size(i)}" + ), + ) + + return grad_output.new_empty(input_size).to( + memory_format=utils.suggest_memory_format(grad_output) + ) # type: ignore[call-overload] + + +@register_meta( + [aten.upsample_nearest3d.default, aten._upsample_nearest_exact3d.default] +) +def upsample_nearest3d(input, output_size, scales_d=None, scales_h=None, scales_w=None): + torch._check( + input.numel() != 0 or multiply_integers(input.size()[1:]), + lambda: f"Non-empty 5D data tensor expected but got a tensor with sizes {input.size()}", + ) + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=3 + ) + return input.new_empty(full_output_size).to( + memory_format=utils.suggest_memory_format(input) + ) + + +@register_meta( + [ + aten.sort.default, + aten.sort.stable, + aten.sort.values, + aten.sort.values_stable, + ] +) +def meta_sort(self, stable=None, dim=-1, descending=False, values=None, indices=None): + v, i = torch.empty_like(self), torch.empty_like(self, dtype=torch.int64) + if values is not None and indices is not None: + assert isinstance(values, TensorLike) + assert isinstance(indices, TensorLike) + # Makes sure values and indices have the same strides. For cases where + # these have different shapes, like (5, 10, 5) and (0) in msort. + out_shape = v.shape + out_stride = v.stride() + values = _maybe_resize_out(values, out_shape) + indices = _maybe_resize_out(indices, out_shape) + values.as_strided_(out_shape, out_stride) + indices.as_strided_(out_shape, out_stride) + _safe_copy_out(copy_from=v, copy_to=values) # type: ignore[arg-type] + _safe_copy_out(copy_from=i, copy_to=indices) # type: ignore[arg-type] + return values, indices + return v, i + + +@register_meta(aten.argsort.stable) +def meta_argsort(self, *, stable, dim=-1, descending=False): + return meta_sort(self, stable=stable, dim=dim, descending=descending)[1] + + +def rnn_cell_checkSizes( + input_gates, hidden_gates, input_bias, hidden_bias, factor, prev_hidden +): + torch._check(input_gates.ndim == 2, lambda: f"{input_gates.ndim} != 2") + torch._check( + input_gates.shape == hidden_gates.shape, + lambda: f"{input_gates.shape} != {hidden_gates.shape}", + ) + gates_size = input_gates.size(1) + if input_bias is not None: + torch._check(input_bias.ndim == 1, lambda: f"{input_bias.ndim} != 1") + torch._check( + input_bias.numel() == gates_size, + lambda: f"{input_bias.numel()} != {gates_size}", + ) + torch._check( + input_bias.shape == hidden_bias.shape, + lambda: f"{input_bias.shape} != {hidden_bias.shape}", + ) + torch._check(prev_hidden.ndim == 2, lambda: f"{prev_hidden.ndim} != 2") + expected_prev_hidden_numel = input_gates.size(0) * gates_size // factor + torch._check( + prev_hidden.numel() == expected_prev_hidden_numel, + lambda: f"{prev_hidden.numel()} != {input_gates.size(0)} * {gates_size} // {factor} (aka {expected_prev_hidden_numel})", + ) + torch._check( + all( + x.device == input_gates.device + for x in [hidden_gates, input_bias, hidden_bias, prev_hidden] + ), + lambda: "expected all inputs to be same device", + ) + + +@register_meta(aten._thnn_fused_lstm_cell.default) +def _thnn_fused_lstm_cell_meta( + input_gates, hidden_gates, cx, input_bias=None, hidden_bias=None +): + rnn_cell_checkSizes(input_gates, hidden_gates, input_bias, hidden_bias, 4, cx) + workspace = torch.empty_like(input_gates, memory_format=torch.contiguous_format) + hy = torch.empty_like(cx, memory_format=torch.contiguous_format) + cy = torch.empty_like(cx, memory_format=torch.contiguous_format) + return (hy, cy, workspace) + + +@register_meta(aten._cudnn_rnn.default) +def _cudnn_rnn( + input, + weight, + weight_stride0, + weight_buf, + hx, + cx, + mode, + hidden_size, + proj_size, + num_layers, + batch_first, + dropout, + train, + bidirectional, + batch_sizes, + dropout_state, +): + is_input_packed = len(batch_sizes) != 0 + if is_input_packed: + seq_length = len(batch_sizes) + mini_batch = batch_sizes[0] + batch_sizes_sum = input.shape[0] + else: + seq_length = input.shape[1] if batch_first else input.shape[0] + mini_batch = input.shape[0] if batch_first else input.shape[1] + batch_sizes_sum = -1 + + num_directions = 2 if bidirectional else 1 + out_size = proj_size if proj_size != 0 else hidden_size + if is_input_packed: + out_shape = [batch_sizes_sum, out_size * num_directions] + else: + out_shape = ( + [mini_batch, seq_length, out_size * num_directions] + if batch_first + else [seq_length, mini_batch, out_size * num_directions] + ) + output = input.new_empty(out_shape) + + cell_shape = [num_layers * num_directions, mini_batch, hidden_size] + if cx is None: + cy = torch.empty(0, device=input.device) + else: + cy = cx.new_empty(cell_shape) + + hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size]) + + # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python) + reserve_shape = 0 if train else 0 + reserve = input.new_empty(reserve_shape, dtype=torch.uint8) + + return output, hy, cy, reserve, weight_buf + + +@register_meta(aten.mkldnn_rnn_layer.default) +def mkldnn_rnn_layer( + input, + w0, + w1, + w2, + w3, + hx_, + cx_, + reverse, + batch_sizes, + mode, + hidden_size, + num_layers, + has_biases, + bidirectional, + batch_first, + train, +): + seq_length = input.shape[1] if batch_first else input.shape[0] + mini_batch = input.shape[0] if batch_first else input.shape[1] + output_chanels = hidden_size + out_shape = ( + [mini_batch, seq_length, output_chanels] + if batch_first + else [seq_length, mini_batch, output_chanels] + ) + output = input.new_empty(out_shape) + if hx_ is None: + hy = torch.empty(0, device=input.device) + else: + hy = hx_.new_empty(hx_.shape) + if cx_ is None: + cy = torch.empty(0, device=input.device) + else: + cy = cx_.new_empty(cx_.shape) + workspace = torch.empty(0, device=input.device, dtype=torch.uint8) + return output, hy, cy, workspace + + +def zero_numel_check_dims(self, dim, fn_name): + if self.ndim == 0: + torch._check_index( + dim == 0 or dim == -1, + lambda: f"{fn_name}: Expected reduction dim -1 or 0 for scalar but got {dim}", + ) + else: + torch._check_index( + self.size(dim) != 0, + lambda: f"{fn_name}: Expected reduction dim {dim} to have non-zero size.", + ) + + +# From aten/src/ATen/native/ReduceOps.cpp +def check_argmax_argmin(name, self, dim): + if dim is not None: + dim = maybe_wrap_dim(dim, self.dim()) + zero_numel_check_dims(self, dim, name) + else: + torch._check( + self.numel() != 0, + lambda: f"{name}: Expected reduction dim to be specified for input.numel() == 0.", + ) + + +@register_meta([aten.argmax.default, aten.argmin.default]) +def argmax_argmin_meta(self, dim=None, keepdim=False): + check_argmax_argmin("argmax", self, dim) + dims = utils.reduction_dims(self.shape, (dim,) if dim is not None else None) + shape = _compute_reduction_shape(self, dims, keepdim) + return self.new_empty(shape, dtype=torch.int64) + + +@register_meta(aten.scalar_tensor.default) +def scalar_tensor(s, dtype=None, layout=None, device=None, pin_memory=None): + return torch.empty( + (), dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.topk.default) +def topk_meta(self, k, dim=-1, largest=True, sorted=True): + # From aten/src/ATen/native/Sorting.cpp + dim = maybe_wrap_dim(dim, self.dim(), wrap_scalar=True) + torch._check( + k >= 0 and k <= (self.size(dim) if self.dim() > 0 else 1), + lambda: "selected index k out of range", + ) + sliceSize = 1 if self.dim() == 0 else self.size(dim) + torch._check(k >= 0 and k <= sliceSize, lambda: "k not in range for dimension") + + topKSize = list(self.shape) + if len(topKSize) > 0: + topKSize[dim] = k + return self.new_empty(topKSize), self.new_empty(topKSize, dtype=torch.int64) + + +legacy_contiguous_memory_format = torch.contiguous_format + + +# From aten/src/ATen/native/cuda/RNN.cu +def checkLSTMBackwardSizes(grad_hy, grad_cy, cx, cy, workspace): + defined_grad = grad_hy if grad_hy is not None else grad_cy + torch._check(defined_grad.dim() == 2, lambda: "") + exp_size = defined_grad.size() + if grad_hy is not None: + torch._check(grad_hy.size() == exp_size, lambda: "") + if grad_cy is not None: + torch._check(grad_cy.size() == exp_size, lambda: "") + torch._check(cx.size() == exp_size, lambda: "") + torch._check(cy.size() == exp_size, lambda: "") + torch._check(workspace.dim() == 2, lambda: "") + torch._check(workspace.numel() == exp_size[0] * exp_size[1] * 4, lambda: "") + + +# From aten/src/ATen/native/cuda/RNN.cu +@register_meta(aten._thnn_fused_lstm_cell_backward_impl.default) +def _thnn_fused_lstm_cell_backward_impl(grad_hy, grad_cy, cx, cy, workspace, has_bias): + if grad_hy is None and grad_cy is None: + return None, None, None + checkLSTMBackwardSizes(grad_hy, grad_cy, cx, cy, workspace) + grad_gates = torch.empty_like( + workspace, memory_format=legacy_contiguous_memory_format + ) + grad_cx = torch.empty_like(cx, memory_format=legacy_contiguous_memory_format) + grad_bias = grad_gates.sum(0, keepdim=False) if has_bias else None + return grad_gates, grad_cx, grad_bias + + +# From aten/src/ATen/native/mps/operations/Linear.mm +@register_meta(aten.linear_backward.default) +def linear_backward(input_, grad_output_, weight_, output_mask): + grad_input = None + grad_weight = None + grad_bias = None + if output_mask[0]: + grad_input = grad_output_.new_empty(input_.size()) + if output_mask[1] or output_mask[2]: + grad_weight = grad_output_.new_empty((grad_output_.size(-1), input_.size(-1))) + grad_bias = grad_output_.new_empty(grad_output_.size(-1)) + return (grad_input, grad_weight, grad_bias) + + +@register_meta(aten.pixel_shuffle.default) +def meta_pixel_shuffle(self, upscale_factor): + assert ( + len(self.shape) > 2 and self.shape[-3] % (upscale_factor * upscale_factor) == 0 + ), f"Invalid input shape for pixel_shuffle: {self.shape} with upscale_factor = {upscale_factor}" + + def is_channels_last(ten): + return torch._prims_common.suggest_memory_format(ten) == torch.channels_last + + def pick_memory_format(): + if is_channels_last(self): + if device_hint(self) == "cuda": + return torch.contiguous_format + else: + return torch.channels_last + elif self.is_contiguous(memory_format=torch.contiguous_format): + return torch.contiguous_format + elif self.is_contiguous(memory_format=torch.preserve_format): + return torch.preserve_format + + C = self.shape[-3] // (upscale_factor * upscale_factor) + Hr = self.shape[-2] * upscale_factor + Wr = self.shape[-1] * upscale_factor + out_shape = (*self.shape[:-3], C, Hr, Wr) + + out = self.new_empty(out_shape) + out = out.to(memory_format=pick_memory_format()) # type: ignore[call-overload] + return out + + +@register_meta(aten.mkldnn_rnn_layer_backward.default) +def mkldnn_rnn_layer_backward( + input, + weight0, + weight1, + weight2, + weight3, + hx_, + cx_tmp, + output, + hy_, + cy_, + grad_output_r_opt, + grad_hy_r_opt, + grad_cy_r_opt, + reverse, + mode, + hidden_size, + num_layers, + has_biases, + train, + bidirectional, + batch_sizes, + batch_first, + workspace, +): + diff_x = input.new_empty(input.shape) + diff_hx = hx_.new_empty(hx_.shape) + diff_cx = cx_tmp.new_empty(cx_tmp.shape) + diff_w1 = weight0.new_empty(weight0.shape) + diff_w2 = weight1.new_empty(weight1.shape) + diff_b = weight2.new_empty(weight2.shape) + return diff_x, diff_w1, diff_w2, diff_b, diff_b, diff_hx, diff_cx + + +@register_meta([aten.bucketize.Tensor, aten.bucketize.Tensor_out]) +@out_wrapper() +def meta_bucketize(self, boundaries, *, out_int32=False, right=False): + return torch.empty_like( + self, dtype=torch.int32 if out_int32 else torch.int64 + ).contiguous() + + +@register_meta( + [aten._upsample_bilinear2d_aa.default, aten._upsample_bicubic2d_aa.default] +) +def meta_upsample_bimode2d_aa( + input, output_size, align_corners, scales_h=None, scales_w=None +): + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=2 + ) + torch._check( + input.numel() != 0 or all(size > 0 for size in input.size()[1:]), + lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}", + ) + return input.new_empty(full_output_size).to( + memory_format=utils.suggest_memory_format(input) + ) + + +# From aten/src/ATen/native/cuda/AmpKernels.cu +@register_meta(aten._amp_foreach_non_finite_check_and_unscale_.default) +def _amp_foreach_non_finite_check_and_unscale_(self, found_inf, inv_scale): + torch._check( + found_inf.numel() == 1, lambda: "found_inf must be a 1-element tensor." + ) + torch._check( + inv_scale.numel() == 1, lambda: "inv_scale must be a 1-element tensor." + ) + torch._check( + found_inf.dtype.is_floating_point, + lambda: "found_inf must be a float tensor.", + ) + torch._check( + inv_scale.dtype.is_floating_point, + lambda: "inv_scale must be a float tensor.", + ) + + +# From aten/src/ATen/native/UnaryOps.cpp +@register_meta([aten.nan_to_num.default, aten.nan_to_num.out]) +@out_wrapper() +def nan_to_num(self, nan=None, posinf=None, neginf=None): + result_size = list(self.size()) + return self.new_empty(result_size) + + +@register_meta(torch.ops.aten.transpose_) +def transpose_(self, dim0, dim1): + assert self.layout not in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }, f"torch.transpose_: in-place transposition is not supported for {self.layout} layout" + + ndims = self.ndim + + dim0 = maybe_wrap_dim(dim0, ndims) + dim1 = maybe_wrap_dim(dim1, ndims) + + if dim0 == dim1: + return self + + size = list(self.size()) + stride = list(self.stride()) + + stride[dim0], stride[dim1] = stride[dim1], stride[dim0] + size[dim0], size[dim1] = size[dim1], size[dim0] + + self.as_strided_(size, stride) + return self + + +@register_meta(torch.ops.aten.t_) +def t_(self): + ndims = self.ndim + + if self.is_sparse: + sparse_dim = self.sparse_dim() + dense_dim = self.dense_dim() + assert ( + sparse_dim <= 2 and dense_dim == 0 + ), f"t_ expects a tensor with <= 2 sparse and 0 dense dimensions, but got {sparse_dim} sparse and {dense_dim} dense dimensions" # noqa: B950 + else: + assert ( + self.dim() <= 2 + ), f"t_ expects a tensor with <= 2 dimensions, but self is {ndims}D" + + return transpose_(self, 0, 0 if ndims < 2 else 1) + + +@register_meta(aten.searchsorted) +@out_wrapper() +def meta_searchsorted( + sorted_sequence, self, *, out_int32=False, right=False, side=None, sorter=None +): + dtype = torch.int32 if out_int32 else torch.int64 + if isinstance(self, torch.Tensor): + return torch.empty_like(self, dtype=dtype).contiguous() + else: # Scalar + return torch.empty((), dtype=dtype, device=sorted_sequence.device) + + +def _check_for_unsupported_isin_dtype(dtype): + torch._check( + dtype not in [torch.bool, torch.bfloat16, torch.complex128, torch.complex64], + lambda: f"Unsupported input type encountered for isin(): {dtype}", + ) + + +@register_meta(aten.isin) +@out_wrapper() +def meta_isin(elements, test_elements, *, assume_unique=False, invert=False): + torch._check( + isinstance(elements, Tensor) or isinstance(test_elements, Tensor), + lambda: "At least one of elements and test_elements must be a Tensor.", + ) + if not isinstance(elements, Tensor): + elements = torch.tensor(elements, device=test_elements.device) + + if not isinstance(test_elements, Tensor): + test_elements = torch.tensor(test_elements, device=elements.device) + + _check_for_unsupported_isin_dtype(elements.dtype) + _check_for_unsupported_isin_dtype(test_elements.dtype) + return torch.empty_like(elements, dtype=torch.bool) + + +@register_meta(aten.polygamma) +@out_wrapper() +def meta_polygamma(n: int, self: Tensor) -> Tensor: + torch._check(n >= 0, lambda: "polygamma(n, x) does not support negative n.") + _, result_dtype = elementwise_dtypes( + self, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + return torch.empty_like(self, dtype=result_dtype) + + +def _create_unary_float_meta_func(func): + @register_meta(func) + @out_wrapper() + def _f(x): + return elementwise_meta( + x, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + + return _f + + +def _create_binary_float_meta_func(func): + @register_meta(func) + @out_wrapper() + def _f(x, y): + return elementwise_meta( + x, y, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + + return _f + + +_create_unary_float_meta_func(aten.special_airy_ai) +_create_unary_float_meta_func(aten.special_bessel_y0) +_create_unary_float_meta_func(aten.special_bessel_y1) +_create_unary_float_meta_func(aten.special_modified_bessel_i0) +_create_unary_float_meta_func(aten.special_modified_bessel_i1) +_create_unary_float_meta_func(aten.special_modified_bessel_k0) +_create_unary_float_meta_func(aten.special_modified_bessel_k1) +_create_unary_float_meta_func(aten.special_scaled_modified_bessel_k0) +_create_unary_float_meta_func(aten.special_scaled_modified_bessel_k1) + + +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_t) +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_u) +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_v) +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_w) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_t) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_u) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_v) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_w) +_create_binary_float_meta_func(aten.special_hermite_polynomial_h) +_create_binary_float_meta_func(aten.special_hermite_polynomial_he) +_create_binary_float_meta_func(aten.special_laguerre_polynomial_l) +_create_binary_float_meta_func(aten.special_legendre_polynomial_p) + + +# We must also trigger meta registrations from PrimTorch ref +# decompositions +import torch._refs +import torch._refs.nn.functional +import torch._refs.special + + +def activate_meta(): + activate_meta_table = {} + + # For a given op, we pick the most specific decomp function from + # global_decomp_table in the precedence order of meta > post_autograd > pre_autograd + for type in ["meta", "post_autograd", "pre_autograd"]: + registry = global_decomposition_table[type] + + for opo in registry: + if opo not in activate_meta_table: + activate_meta_table[opo] = registry[opo] + + for op_overload, fn in activate_meta_table.items(): + # Don't register meta for HigherOrderOp's decomp. + # We can reconsider this in the future, but in general, + # the way you do a meta for a HigherOrderOp is different from + # OpOverload. + if isinstance(op_overload, torch._ops.HigherOrderOperator): + continue + assert isinstance(op_overload, OpOverload) + + op_overload.py_impl(torch._C.DispatchKey.Meta)(fn) + + if torch._C._dispatch_has_kernel_for_dispatch_key( + op_overload.name(), "CompositeImplicitAutograd" + ): + # Internally, we shouldn't be registering meta kernels for any operators that + # have CompositeImplicitAutograd kernels. + # Instead, we should be letting those decompositions run, and writing meta kernels + # only for the base operators. + if op_overload in global_decomposition_table["meta"]: + raise RuntimeError( + f"{op_overload} is a CompositeImplicitAutograd op, we shouldn't " + "register meta function for it. Instead, we should let the decomposition run and write " + "meta kernels for the base operators." + ) + pass + elif op_overload.is_view: + # Attempting to register a python meta kernel for a view operator. + # We shouldn't do this, because the output will report as not having aliased storages. + # All view ops have meta kernels in C++ today, so we should use those instead. + pass + elif op_overload.name() in { + "aten::empty_strided", # causing infinite recursion, test_meta.py + "aten::clone", # causing infinite recursion + "aten::_to_copy", # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite # noqa: B950 + "aten::copy_", # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64 # noqa: B950 + "aten::constant_pad_nd", # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32 # noqa: B950 + "aten::rot90", # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32 # noqa: B950 + "aten::as_strided_scatter", # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_no_amp_as_strided_scatter_cuda_float32 # noqa: B950 + }: + pass + else: + if "mkldnn::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_mkldnn.impl(op_overload, fn) + elif "mkl::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_mkl.impl(op_overload, fn) + elif "onednn::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_onednn.impl(op_overload, fn) + elif "quantized::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_quantized.impl( + op_overload, fn + ) + else: + _meta_lib_dont_use_me_use_register_meta.impl(op_overload, fn) + + +activate_meta() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_namedtensor_internals.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_namedtensor_internals.py new file mode 100644 index 0000000000000000000000000000000000000000..cbc9de2de091d03295a0aca0011036e0a67a97eb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_namedtensor_internals.py @@ -0,0 +1,157 @@ +from collections import OrderedDict + +""" +This file contains helper functions that implement experimental functionality +for named tensors in python. All of these are experimental, unstable, and +subject to change or deletion. +""" + + +def check_serializing_named_tensor(tensor): + if tensor.has_names(): + raise RuntimeError( + "NYI: Named tensors don't support serialization. Please drop " + "names via `tensor = tensor.rename(None)` before serialization." + ) + + +def build_dim_map(tensor): + """Returns a map of { dim: dim_name } where dim is a name if the dim is named + and the dim index otherwise.""" + return OrderedDict( + [(idx if name is None else name, name) for idx, name in enumerate(tensor.names)] + ) + + +def unzip_namedshape(namedshape): + if isinstance(namedshape, OrderedDict): + namedshape = namedshape.items() + if not hasattr(namedshape, "__iter__") and not isinstance(namedshape, tuple): + raise RuntimeError( + f"Expected namedshape to be OrderedDict or iterable of tuples, got: {type(namedshape)}" + ) + if len(namedshape) == 0: + raise RuntimeError("Expected namedshape to non-empty.") + return zip(*namedshape) + + +def namer_api_name(inplace): + if inplace: + return "rename_" + else: + return "rename" + + +def is_ellipsis(item): + return item == Ellipsis or item == "..." + + +def single_ellipsis_index(names, fn_name): + ellipsis_indices = [i for i, name in enumerate(names) if is_ellipsis(name)] + if len(ellipsis_indices) >= 2: + raise RuntimeError( + f"{fn_name}: More than one Ellipsis ('...') found in names (" + f"{names}). This function supports up to one Ellipsis." + ) + if len(ellipsis_indices) == 1: + return ellipsis_indices[0] + return None + + +def expand_single_ellipsis(numel_pre_glob, numel_post_glob, names): + return names[numel_pre_glob : len(names) - numel_post_glob] + + +def replace_ellipsis_by_position(ellipsis_idx, names, tensor_names): + globbed_names = expand_single_ellipsis( + ellipsis_idx, len(names) - ellipsis_idx - 1, tensor_names + ) + return names[:ellipsis_idx] + globbed_names + names[ellipsis_idx + 1 :] + + +def resolve_ellipsis(names, tensor_names, fn_name): + """ + Expands ... inside `names` to be equal to a list of names from `tensor_names`. + """ + ellipsis_idx = single_ellipsis_index(names, fn_name) + if ellipsis_idx is None: + return names + return replace_ellipsis_by_position(ellipsis_idx, names, tensor_names) + + +def update_names_with_list(tensor, names, inplace): + # Special case for tensor.rename(None) + if len(names) == 1 and names[0] is None: + return tensor._update_names(None, inplace) + + return tensor._update_names( + resolve_ellipsis(names, tensor.names, namer_api_name(inplace)), inplace + ) + + +def update_names_with_mapping(tensor, rename_map, inplace): + dim_map = build_dim_map(tensor) + for old_dim in rename_map.keys(): + new_dim = rename_map[old_dim] + if old_dim in dim_map.keys(): + dim_map[old_dim] = new_dim + else: + raise RuntimeError( + f"{namer_api_name(inplace)}: Tried to rename dim '{old_dim}' to dim " + f"{new_dim} in Tensor[{tensor.names}] but dim '{old_dim}' does not exist" + ) + return tensor._update_names(tuple(dim_map.values()), inplace) + + +def update_names(tensor, names, rename_map, inplace): + """There are two usages: + + tensor.rename(*names) returns a view on tensor with named dims `names`. + `names` must be of length `tensor.dim()`; otherwise, if '...' is in `names`, + then it is expanded greedily to be equal to the corresponding names from + `tensor.names`. + + For example, + ``` + >>> # xdoctest: +SKIP + >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W')) + >>> x.rename('...', 'height', 'width').names + ('N', 'C', 'height', 'width') + + >>> # xdoctest: +SKIP + >>> x.rename('batch', '...', 'width').names + ('batch', 'C', 'H', 'width') + + ``` + + tensor.rename(**rename_map) returns a view on tensor that has rename dims + as specified in the mapping `rename_map`. + + For example, + ``` + >>> # xdoctest: +SKIP + >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W')) + >>> x.rename(W='width', H='height').names + ('N', 'C', 'height', 'width') + + ``` + + Finally, tensor.rename has an in-place version called tensor.rename_. + """ + has_names = len(names) > 0 + has_rename_pairs = bool(rename_map) + if has_names and has_rename_pairs: + raise RuntimeError( + f"{namer_api_name(inplace)}: This function takes either positional " + f"args or keyword args, but not both. Use tensor.{namer_api_name(inplace)}(*names) " + f"to name dims and tensor.{namer_api_name(inplace)}(**rename_map) to rename " + "dims." + ) + + # Special case for tensor.rename(*[]), which is valid for a 0 dim tensor. + if not has_names and not has_rename_pairs: + return update_names_with_list(tensor, names, inplace) + + if has_names: + return update_names_with_list(tensor, names, inplace) + return update_names_with_mapping(tensor, rename_map, inplace) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..c1e5fffb11e2a10f0ca957db2c5e21c78ffd3eb8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_tensor.py @@ -0,0 +1,1543 @@ +import copyreg +import enum +import functools +import warnings +from collections import OrderedDict +from copy import deepcopy +from numbers import Number +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch._C as _C +import torch.utils.hooks as hooks +from torch._namedtensor_internals import ( + check_serializing_named_tensor, + is_ellipsis, + resolve_ellipsis, + single_ellipsis_index, + unzip_namedshape, + update_names, +) +from torch.overrides import ( + get_default_nowrap_functions, + handle_torch_function, + has_torch_function, + has_torch_function_unary, + has_torch_function_variadic, +) +from torch.utils.dlpack import DLDeviceType + + +def _handle_torch_function_and_wrap_type_error_to_not_implemented(f): + assigned = functools.WRAPPER_ASSIGNMENTS + + @functools.wraps(f, assigned=assigned) + def wrapped(*args, **kwargs): + try: + # See https://github.com/pytorch/pytorch/issues/75462 + if has_torch_function(args): + return handle_torch_function(wrapped, args, *args, **kwargs) + return f(*args, **kwargs) + except TypeError: + return NotImplemented + + return wrapped + + +# Should not be used, this is kept only for BC of loading old serialized Tensor subclasses +def _rebuild_from_type(func, type, args, dict): + if type is Tensor: + return func(*args) + + ret = func(*args).as_subclass(type) + ret.__dict__ = dict + return ret + + +def _rebuild_from_type_v2(func, new_type, args, state): + ret = func(*args) + if type(ret) is not new_type: + ret = ret.as_subclass(new_type) + # Tensor does define __setstate__ even though it doesn't define + # __getstate__. So only use __setstate__ if it is NOT the one defined + # on Tensor + if ( + getattr(ret.__class__, "__setstate__", Tensor.__setstate__) + is not Tensor.__setstate__ + ): + ret.__setstate__(state) + else: + ret = torch._utils._set_obj_state(ret, state) + return ret + + +# NB: If you subclass Tensor, and want to share the subclassed class +# across processes, you must also update torch/multiprocessing/reductions.py +# to define a ForkingPickler serialization mode for the class. +# +# NB: If you add a new method to Tensor, you must update +# torch/_C/__init__.pyi.in to add a type annotation for your method; +# otherwise, it will not show up in autocomplete. +class Tensor(torch._C.TensorBase): + def __deepcopy__(self, memo): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__deepcopy__, (self,), self, memo) + if not self.is_leaf: + raise RuntimeError( + "Only Tensors created explicitly by the user " + "(graph leaves) support the deepcopy protocol at the moment. " + "If you were attempting to deepcopy a module, this may be because " + "of a torch.nn.utils.weight_norm usage, " + "see https://github.com/pytorch/pytorch/pull/103001" + ) + if id(self) in memo: + return memo[id(self)] + with torch.no_grad(): + # TODO: skipping storage copy is wrong for meta, as meta + # does accurate alias tracking; however, the code below + # doesn't work because of + # https://github.com/pytorch/pytorch/issues/47442 + # Update the test in test_serialization if you remove 'meta' from here + if ( + self.is_sparse + or self.device.type + in ["lazy", "xla", "mtia", "mps", "ort", "meta", "ipu"] + or ( + not torch._C._has_storage(self) + and self.device.type == torch._C._get_privateuse1_backend_name() + ) + or (type(self) is not Tensor and self.data_ptr() == 0) + ): + new_tensor = self.clone() + if type(new_tensor) is not type(self): + raise RuntimeError( + "The default implementation of __deepcopy__() for wrapper subclasses " + "only works for subclass types that implement clone() and for which " + "cloning returns another instance of the same subclass. You should either " + "properly implement clone() for your subclass or override __deepcopy__() " + "if it is intended behavior for clone() to return an instance of a " + "different type." + ) + else: + new_storage = self._typed_storage()._deepcopy(memo) + if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[ + Tuple[torch.qscheme, float, int], + Tuple[torch.qscheme, Tensor, Tensor, int], + ] + if self.qscheme() == torch.per_tensor_affine: + quantizer_params = ( + self.qscheme(), + self.q_scale(), + self.q_zero_point(), + ) + elif self.qscheme() in ( + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ): + quantizer_params = ( + self.qscheme(), + self.q_per_channel_scales(), + self.q_per_channel_zero_points(), + self.q_per_channel_axis(), + ) + else: + raise RuntimeError( + f"Unsupported qscheme {self.qscheme()} in deepcopy" + ) + # TODO: Once we decide to break serialization FC, no longer + # need to wrap with TypedStorage + new_tensor = torch._utils._rebuild_qtensor( + torch.storage.TypedStorage( + wrap_storage=new_storage._untyped_storage, + dtype=self.dtype, + _internal=True, + ), + self.storage_offset(), + self.size(), + self.stride(), + quantizer_params, + self.requires_grad, + self._backward_hooks, + ) + if type(new_tensor) is not type(self): + raise RuntimeError( + "The default implementation of __deepcopy__() for quantized tensors " + "expects the tensor returned by torch._utils._rebuild_qtensor() to " + "match the type of the instance being copied. If you encounter this, " + "please open an issue on PyTorch's GitHub." + ) + else: + new_tensor = self.new_empty([]) + if type(new_tensor) is not type(self): + raise RuntimeError( + "The default implementation of __deepcopy__() for non-wrapper subclasses " + "only works for subclass types that implement new_empty() and for which " + "that function returns another instance of the same subclass. You should " + "either properly implement new_empty() for your subclass or override " + "__deepcopy__() if it is intended behavior for new_empty() to return " + "an instance of a different type." + ) + new_tensor.set_( + new_storage, self.storage_offset(), self.size(), self.stride() + ) + if self.is_conj(): + new_tensor = new_tensor.conj_physical() + if self.is_neg(): + new_tensor = new_tensor.neg() + if self.requires_grad: + new_tensor.requires_grad_() + if self.grad is not None: + new_tensor.grad = self.grad.__deepcopy__(memo) + + if type(self) is not Tensor: + if type(new_tensor) is not type(self): + raise RuntimeError( + "Type of deepcopy result does not match the type of the source tensor. " + "If you encounter this, please open an issue on PyTorch's GitHub." + ) + + # Plain Tensors don't have slots + slots_to_save = copyreg._slotnames(self.__class__) # type: ignore[attr-defined] + for slot in slots_to_save: + if hasattr(self, slot): + setattr(new_tensor, slot, deepcopy(getattr(self, slot), memo)) + + new_tensor.__dict__ = deepcopy(self.__dict__, memo) + + memo[id(self)] = new_tensor + return new_tensor + + def __reduce_ex__(self, proto): + state = torch._utils._get_obj_state(self) + if type(self) is Tensor and not state: + # Fast path for regular tensor without Python state. + return self._reduce_ex_internal(proto) + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__reduce_ex__, (self,), self, proto) + func, args = self._reduce_ex_internal(proto) + return (_rebuild_from_type_v2, (func, type(self), args, state)) + + def storage(self): + r""" + storage() -> torch.TypedStorage + + Returns the underlying :class:`TypedStorage`. + + .. warning:: + + :class:`TypedStorage` is deprecated. It will be removed in the future, and + :class:`UntypedStorage` will be the only storage class. To access the + :class:`UntypedStorage` directly, use :attr:`Tensor.untyped_storage()`. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.storage, (self,), self) + + torch.storage._warn_typed_storage_removal(stacklevel=2) + return self._typed_storage() + + # For internal use only, to avoid raising deprecation warning + def _typed_storage(self): + untyped_storage = self.untyped_storage() + return torch.TypedStorage( + wrap_storage=untyped_storage, dtype=self.dtype, _internal=True + ) + + def _reduce_ex_internal(self, proto): + check_serializing_named_tensor(self) + # See Note [Don't serialize hooks] + torch.utils.hooks.warn_if_has_hooks(self) + backward_hooks: Dict[Any, Any] = OrderedDict() + # Note: Numpy array is chosen to be the rebuild component for XLA, MTIA, ORT Tensors. + # We considered a few options: + # 1. CPU tensor can't be used here. + # Otherwise in torch.load CPU storage is reconstructed with randomly + # initialized data, moved onto backend device, and then storage is updated + # to the serialized content. This works perfectly for CPU/CUDA but not these backends; + # their tensors are disconnected with storage so they don't get the update. + # 2. Python list is not a good fit due to performance reason. + # `tolist()` converts every single element in the tensor into python objects + # and serialize them one by one. + if self.device.type in ["xla", "mtia", "ort"] or ( + not torch._C._has_storage(self) + and self.device.type == torch._C._get_privateuse1_backend_name() + ): + # Convert BFloat16 tesors to Float32 before conversion to numpy, as numpy doesn't + # support BFloat16. The rebuild tensor from numpy takes in the original self.dtype, + # this would reconstruct the BFloat16 tensor from numpy. + numpy_tensor = ( + self.cpu().numpy() + if self.dtype != torch.bfloat16 + else self.cpu().to(torch.float32).numpy() + ) + return ( + torch._utils._rebuild_device_tensor_from_numpy, + (numpy_tensor, self.dtype, str(self.device), self.requires_grad), + ) + if self.device.type == "meta": + # NB: This implementation BREAKS storage sharing. Current + # hypothesis is that no one cares for meta tensors. + arg_meta = ( + self.dtype, + tuple(self.size()), + self.stride(), + self.requires_grad, + ) + return (torch._utils._rebuild_meta_tensor_no_storage, arg_meta) + if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[ + Tuple[torch.qscheme, float, int], Tuple[Any, Tensor, Tensor, int] + ] + if self.qscheme() == torch.per_tensor_affine: + quantizer_params = ( + torch.per_tensor_affine, + self.q_scale(), + self.q_zero_point(), + ) + elif self.qscheme() in ( + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ): + # convert scales and zero points to tuple to avoid recursive calls + # when/if we get multi-axis quantized tensors in the future, the shape + # is recoverable from the main tensor shape + quantizer_params = ( + torch.per_channel_affine, + self.q_per_channel_scales(), + self.q_per_channel_zero_points(), + self.q_per_channel_axis(), + ) + else: + raise RuntimeError( + f"Serialization is not supported for tensors of type {self.qscheme()}" + ) + # TODO: Once we decide to break serialization FC, no longer + # need to wrap with TypedStorage + args_qtensor = ( + torch.storage.TypedStorage( + wrap_storage=self._typed_storage()._untyped_storage, + dtype=self.dtype, + _internal=True, + ), + self.storage_offset(), + tuple(self.size()), + self.stride(), + quantizer_params, + self.requires_grad, + backward_hooks, + ) + return (torch._utils._rebuild_qtensor, args_qtensor) + elif self.is_sparse: + if self.layout == torch.sparse_coo: + args_sparse = ( + self.layout, + (self._indices(), self._values(), self.size(), self.is_coalesced()), + ) + else: + raise NotImplementedError( + f"sparse tensor __reduce_ex__ for layout `{self.layout}`" + ) + return (torch._utils._rebuild_sparse_tensor, args_sparse) + elif self.layout in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }: + if self.layout in {torch.sparse_csr, torch.sparse_bsr}: + compressed_indices, plain_indices = ( + self.crow_indices(), + self.col_indices(), + ) + else: + compressed_indices, plain_indices = ( + self.ccol_indices(), + self.row_indices(), + ) + args_sparse_compressed = ( + self.layout, + ( + compressed_indices, + plain_indices, + self.values(), + self.size(), + ), + ) + return (torch._utils._rebuild_sparse_tensor, args_sparse_compressed) + elif self.is_nested: + args_nested = ( + # NB: values() currently returns the storage as a buffer in an unsafe way. + # Ideally, we'd use a private API for this instead. TODO: Switch to this if + # we ever get around to adding it. + self.values(), + self._nested_tensor_size(), + self._nested_tensor_strides(), + self._nested_tensor_storage_offsets(), + ) + return (torch._utils._rebuild_nested_tensor, args_nested) + elif ( + self.data_ptr() == 0 + and type(self) is not torch.Tensor + and type(self).__torch_dispatch__ is not torch.Tensor.__torch_dispatch__ + ): + arg_wrapper_subclass = ( + type(self), + self.dtype, + tuple(self.size()), + self.stride(), + self.storage_offset(), + self.layout, + self.device, + self.requires_grad, + ) + return (torch._utils._rebuild_wrapper_subclass, arg_wrapper_subclass) + else: + v3_dtypes = [ + torch.float8_e5m2, + torch.float8_e4m3fn, + torch.float8_e5m2fnuz, + torch.float8_e4m3fnuz, + torch.bits8, + torch.bits16, + torch.bits1x8, + torch.bits2x4, + torch.bits4x2, + torch.complex32, + ] + if self.dtype in v3_dtypes: + rebuild_func = torch._utils._rebuild_tensor_v3 + storage = self.untyped_storage() + else: + # TODO: Once we decide to break serialization FC, no longer + # need to wrap with TypedStorage + rebuild_func = torch._utils._rebuild_tensor_v2 # type: ignore[assignment] + storage = torch.storage.TypedStorage( + wrap_storage=self._typed_storage()._untyped_storage, + dtype=self.dtype, + _internal=True, + ) # type: ignore[assignment] + args = ( + storage, + self.storage_offset(), + tuple(self.size()), + self.stride(), + self.requires_grad, + backward_hooks, + ) # previously was self._backward_hooks + + if isinstance(storage, torch.storage.UntypedStorage): + args = args + (self.dtype,) # type: ignore[assignment] + + metadata = torch._utils.get_tensor_metadata(self) + if metadata: + args = args + (metadata,) # type: ignore[assignment] + + return (rebuild_func, args) + + def __setstate__(self, state): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__setstate__, (self,), self, state) + # Warning: this method is NOT called when you torch.load() a tensor; + # that is managed by _rebuild_tensor_v2 + if not self.is_leaf: + raise RuntimeError("__setstate__ can be only called on leaf Tensors") + if len(state) == 4: + # legacy serialization of Tensor + self.set_(*state) + return + elif len(state) == 5: + # legacy serialization of Variable + self.data = state[0] + state = (state[3], state[4], state[2]) + # The setting of _backward_hooks is expected to be a no-op. + # See Note [Don't serialize hooks] + self.requires_grad, _, self._backward_hooks = state + + def __repr__(self, *, tensor_contents=None): + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.__repr__, (self,), self, tensor_contents=tensor_contents + ) + # All strings are unicode in Python 3. + return torch._tensor_str._str(self, tensor_contents=tensor_contents) + + def backward( + self, gradient=None, retain_graph=None, create_graph=False, inputs=None + ): + r"""Computes the gradient of current tensor wrt graph leaves. + + The graph is differentiated using the chain rule. If the tensor is + non-scalar (i.e. its data has more than one element) and requires + gradient, the function additionally requires specifying ``gradient``. + It should be a tensor of matching type and location, that contains + the gradient of the differentiated function w.r.t. ``self``. + + This function accumulates gradients in the leaves - you might need to zero + ``.grad`` attributes or set them to ``None`` before calling it. + See :ref:`Default gradient layouts` + for details on the memory layout of accumulated gradients. + + .. note:: + + If you run any forward ops, create ``gradient``, and/or call ``backward`` + in a user-specified CUDA stream context, see + :ref:`Stream semantics of backward passes`. + + .. note:: + + When ``inputs`` are provided and a given input is not a leaf, + the current implementation will call its grad_fn (though it is not strictly needed to get this gradients). + It is an implementation detail on which the user should not rely. + See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details. + + Args: + gradient (Tensor or None): Gradient w.r.t. the + tensor. If it is a tensor, it will be automatically converted + to a Tensor that does not require grad unless ``create_graph`` is True. + None values can be specified for scalar Tensors or ones that + don't require grad. If a None value would be acceptable then + this argument is optional. + retain_graph (bool, optional): If ``False``, the graph used to compute + the grads will be freed. Note that in nearly all cases setting + this option to True is not needed and often can be worked around + in a much more efficient way. Defaults to the value of + ``create_graph``. + create_graph (bool, optional): If ``True``, graph of the derivative will + be constructed, allowing to compute higher order derivative + products. Defaults to ``False``. + inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be + accumulated into ``.grad``. All other Tensors will be ignored. If not + provided, the gradient is accumulated into all the leaf Tensors that were + used to compute the attr::tensors. + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.backward, + (self,), + self, + gradient=gradient, + retain_graph=retain_graph, + create_graph=create_graph, + inputs=inputs, + ) + torch.autograd.backward( + self, gradient, retain_graph, create_graph, inputs=inputs + ) + + def register_hook(self, hook): + r"""Registers a backward hook. + + The hook will be called every time a gradient with respect to the + Tensor is computed. The hook should have the following signature:: + + hook(grad) -> Tensor or None + + + The hook should not modify its argument, but it can optionally return + a new gradient which will be used in place of :attr:`grad`. + + This function returns a handle with a method ``handle.remove()`` + that removes the hook from the module. + + .. note:: + See :ref:`backward-hooks-execution` for more information on how when this hook + is executed, and how its execution is ordered relative to other hooks. + + Example:: + + >>> v = torch.tensor([0., 0., 0.], requires_grad=True) + >>> h = v.register_hook(lambda grad: grad * 2) # double the gradient + >>> v.backward(torch.tensor([1., 2., 3.])) + >>> v.grad + + 2 + 4 + 6 + [torch.FloatTensor of size (3,)] + + >>> h.remove() # removes the hook + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.register_hook, (self,), self, hook) + if not self.requires_grad: + raise RuntimeError( + "cannot register a hook on a tensor that doesn't require gradient" + ) + if self._backward_hooks is None: + self._backward_hooks = OrderedDict() + if self.grad_fn is not None: + self.grad_fn._register_hook_dict(self) + handle = hooks.RemovableHandle(self._backward_hooks) + self._backward_hooks[handle.id] = hook + return handle + + def register_post_accumulate_grad_hook(self, hook): + r"""Registers a backward hook that runs after grad accumulation. + + The hook will be called after all gradients for a tensor have been accumulated, + meaning that the .grad field has been updated on that tensor. The post + accumulate grad hook is ONLY applicable for leaf tensors (tensors without a + .grad_fn field). Registering this hook on a non-leaf tensor will error! + + The hook should have the following signature:: + + hook(param: Tensor) -> None + + Note that, unlike other autograd hooks, this hook operates on the tensor + that requires grad and not the grad itself. The hook can in-place modify + and access its Tensor argument, including its .grad field. + + This function returns a handle with a method ``handle.remove()`` + that removes the hook from the module. + + .. note:: + See :ref:`backward-hooks-execution` for more information on how when this hook + is executed, and how its execution is ordered relative to other hooks. Since + this hook runs during the backward pass, it will run in no_grad mode (unless + create_graph is True). You can use torch.enable_grad() to re-enable autograd + within the hook if you need it. + + Example:: + + >>> v = torch.tensor([0., 0., 0.], requires_grad=True) + >>> lr = 0.01 + >>> # simulate a simple SGD update + >>> h = v.register_post_accumulate_grad_hook(lambda p: p.add_(p.grad, alpha=-lr)) + >>> v.backward(torch.tensor([1., 2., 3.])) + >>> v + tensor([-0.0100, -0.0200, -0.0300], requires_grad=True) + + >>> h.remove() # removes the hook + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.register_post_accumulate_grad_hook, (self,), self, hook + ) + if not self.requires_grad: + raise RuntimeError( + "cannot register a hook on a tensor that doesn't require gradient" + ) + if self.grad_fn is not None: + raise RuntimeError( + "post accumulate grad hooks cannot be registered on non-leaf tensors" + ) + if self._post_accumulate_grad_hooks is None: + self._post_accumulate_grad_hooks: Dict[Any, Any] = OrderedDict() + handle = hooks.RemovableHandle(self._post_accumulate_grad_hooks) + self._post_accumulate_grad_hooks[handle.id] = hook + return handle + + def reinforce(self, reward): + def trim(str): + return "\n".join([line.strip() for line in str.split("\n")]) + + raise RuntimeError( + trim( + r"""reinforce() was removed. + Use torch.distributions instead. + See https://pytorch.org/docs/master/distributions.html + + Instead of: + + probs = policy_network(state) + action = probs.multinomial() + next_state, reward = env.step(action) + action.reinforce(reward) + action.backward() + + Use: + + probs = policy_network(state) + # NOTE: categorical is equivalent to what used to be called multinomial + m = torch.distributions.Categorical(probs) + action = m.sample() + next_state, reward = env.step(action) + loss = -m.log_prob(action) * reward + loss.backward() + """ + ) + ) + + detach = _C._add_docstr( + _C.TensorBase.detach, + r""" + Returns a new Tensor, detached from the current graph. + + The result will never require gradient. + + This method also affects forward mode AD gradients and the result will never + have forward mode AD gradients. + + .. note:: + + Returned Tensor shares the same storage with the original one. + In-place modifications on either of them will be seen, and may trigger + errors in correctness checks. + """, + ) + + detach_ = _C._add_docstr( + _C.TensorBase.detach_, + r""" + Detaches the Tensor from the graph that created it, making it a leaf. + Views cannot be detached in-place. + + This method also affects forward mode AD gradients and the result will never + have forward mode AD gradients. + """, + ) + + def is_shared(self): + r"""Checks if tensor is in shared memory. + + This is always ``True`` for CUDA tensors. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.is_shared, (self,), self) + return self._typed_storage()._is_shared() + + def share_memory_(self): + r"""Moves the underlying storage to shared memory. + + This is a no-op if the underlying storage is already in shared memory + and for CUDA tensors. Tensors in shared memory cannot be resized. + + See :meth:`torch.UntypedStorage.share_memory_` for more details. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.share_memory_, (self,), self) + self._typed_storage()._share_memory_() + return self + + def module_load(self, other, assign=False): + r"""Defines how to transform ``other`` when loading it into ``self`` in :meth:`~nn.Module.load_state_dict`. + + Used when :func:`~torch.__future__.get_swap_module_params_on_conversion` is ``True``. + + It is expected that ``self`` is a parameter or buffer in an ``nn.Module`` and ``other`` is the + value in the state dictionary with the corresponding key, this method defines + how ``other`` is remapped before being swapped with ``self`` via + :func:`~torch.utils.swap_tensors`` in ``module.load_state_dict()``. + + .. note:: + This method should always return a new object that is not ``self`` or ``other``. + For example, the default implementation returns ``self.copy_(other).detach()`` + if ``assign`` is ``False`` or ``other.detach()`` if ``assign`` is ``True``. + + Args: + other (Tensor): value in state dict with key corresponding to ``self`` + assign (bool): the assign argument passed to :meth:`nn.Module.load_state_dict` + + """ + if has_torch_function_variadic(self, other): + return handle_torch_function( + Tensor.module_load, (self, other), self, other, assign=assign + ) + + if assign: + return other.detach() + else: + return self.copy_(other).detach() + + def __reversed__(self): + r"""Reverses the tensor along dimension 0.""" + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__reversed__, (self,), self) + if self.dim() == 0: + return self + else: + return self.flip(0) + + def norm( + self, + p: Optional[Union[float, str]] = "fro", + dim=None, + keepdim=False, + dtype=None, + ): + r"""See :func:`torch.norm`""" + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.norm, (self,), self, p=p, dim=dim, keepdim=keepdim, dtype=dtype + ) + return torch.norm(self, p, dim, keepdim, dtype=dtype) + + def solve(self, other): + from ._linalg_utils import solve + + return solve(self, other) + + def lstsq(self, other): + from ._linalg_utils import lstsq + + return lstsq(self, other) + + def eig(self, eigenvectors=False): + from ._linalg_utils import eig + + return eig(self, eigenvectors=eigenvectors) + + def symeig(self, eigenvectors=False): + from ._linalg_utils import _symeig + + return _symeig(self, eigenvectors=eigenvectors) + + def lu(self, pivot=True, get_infos=False): + r"""See :func:`torch.lu`""" + # If get_infos is True, then we don't need to check for errors and vice versa + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.lu, (self,), self, pivot=pivot, get_infos=get_infos + ) + + LU, pivots, infos = torch._lu_with_info( + self, pivot=pivot, check_errors=(not get_infos) + ) + if get_infos: + return LU, pivots, infos + else: + return LU, pivots + + def stft( + self, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: "Optional[Tensor]" = None, + center: bool = True, + pad_mode: str = "reflect", + normalized: bool = False, + onesided: Optional[bool] = None, + return_complex: Optional[bool] = None, + ): + r"""See :func:`torch.stft` + + .. warning:: + This function changed signature at version 0.4.1. Calling with + the previous signature may cause error or return incorrect result. + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.stft, + (self,), + self, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=normalized, + onesided=onesided, + return_complex=return_complex, + ) + return torch.stft( + self, + n_fft, + hop_length, + win_length, + window, + center, + pad_mode, + normalized, + onesided, + return_complex=return_complex, + ) + + def istft( + self, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: "Optional[Tensor]" = None, + center: bool = True, + normalized: bool = False, + onesided: Optional[bool] = None, + length: Optional[int] = None, + return_complex: bool = False, + ): + r"""See :func:`torch.istft`""" + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.istft, + (self,), + self, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + normalized=normalized, + onesided=onesided, + length=length, + return_complex=return_complex, + ) + return torch.istft( + self, + n_fft, + hop_length, + win_length, + window, + center, + normalized, + onesided, + length, + return_complex=return_complex, + ) + + def resize(self, *sizes): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.resize, (self,), self, *sizes) + warnings.warn("non-inplace resize is deprecated") + from torch.autograd._functions import Resize + + return Resize.apply(self, sizes) + + def resize_as(self, tensor): + if has_torch_function_variadic(self, tensor): + return handle_torch_function(Tensor.resize_as, (self, tensor), self, tensor) + warnings.warn("non-inplace resize_as is deprecated") + from torch.autograd._functions import Resize + + return Resize.apply(self, tensor.size()) + + def split(self, split_size, dim=0): + r"""See :func:`torch.split`""" + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.split, (self,), self, split_size, dim=dim + ) + if isinstance(split_size, Tensor): + try: + split_size = int(split_size) + except ValueError: + pass + + if isinstance(split_size, (int, torch.SymInt)): + return torch._VF.split(self, split_size, dim) # type: ignore[attr-defined] + else: + return torch._VF.split_with_sizes(self, split_size, dim) + + def unique(self, sorted=True, return_inverse=False, return_counts=False, dim=None): + r"""Returns the unique elements of the input tensor. + + See :func:`torch.unique` + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.unique, + (self,), + self, + sorted=sorted, + return_inverse=return_inverse, + return_counts=return_counts, + dim=dim, + ) + return torch.unique( + self, + sorted=sorted, + return_inverse=return_inverse, + return_counts=return_counts, + dim=dim, + ) + + def unique_consecutive(self, return_inverse=False, return_counts=False, dim=None): + r"""Eliminates all but the first element from every consecutive group of equivalent elements. + + See :func:`torch.unique_consecutive` + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.unique_consecutive, + (self,), + self, + return_inverse=return_inverse, + return_counts=return_counts, + dim=dim, + ) + return torch.unique_consecutive( + self, return_inverse=return_inverse, return_counts=return_counts, dim=dim + ) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rsub__(self, other): + return _C._VariableFunctions.rsub(self, other) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rdiv__(self, other): + return self.reciprocal() * other + + __rtruediv__ = __rdiv__ + __itruediv__ = _C.TensorBase.__idiv__ + + __pow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented( + _C.TensorBase.pow + ) + __ipow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented( + _C.TensorBase.pow_ + ) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rmod__(self, other): + return torch.remainder(other, self) + + def __format__(self, format_spec): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__format__, (self,), self, format_spec) + if self.dim() == 0 and not self.is_meta and type(self) is Tensor: + return self.item().__format__(format_spec) + return object.__format__(self, format_spec) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rpow__(self, other): + return torch.pow(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __floordiv__(self, other): + return torch.floor_divide(self, other) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rfloordiv__(self, other): + return torch.floor_divide(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rlshift__(self, other): + return torch.bitwise_left_shift(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rrshift__(self, other): + return torch.bitwise_right_shift(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rmatmul__(self, other): + return torch.matmul(other, self) + + __pos__ = _C.TensorBase.positive + __neg__ = _C.TensorBase.neg + __abs__ = _C.TensorBase.abs + + def __len__(self): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__len__, (self,), self) + if self.dim() == 0: + raise TypeError("len() of a 0-d tensor") + if torch._C._get_tracing_state(): + warnings.warn( + "Using len to get tensor shape might cause the trace to be incorrect. " + "Recommended usage would be tensor.shape[0]. " + "Passing a tensor of different shape might lead to errors or silently give " + "incorrect results.", + category=torch.jit.TracerWarning, + stacklevel=2, + ) + return self.shape[0] + + def __iter__(self): + # NB: we use 'imap' and not 'map' here, so that in Python 2 we get a + # generator and don't eagerly perform all the indexes. This could + # save us work, and also helps keep trace ordering deterministic + # (e.g., if you zip(*hiddens), the eager map will force all the + # indexes of hiddens[0] before hiddens[1], while the generator + # map will interleave them.) + # NB: We have intentionally skipped __torch_function__ dispatch here. + # See gh-54457 + if self.dim() == 0: + raise TypeError("iteration over a 0-d tensor") + if torch._C._get_tracing_state(): + warnings.warn( + "Iterating over a tensor might cause the trace to be incorrect. " + "Passing a tensor of different shape won't change the number of " + "iterations executed (and might lead to errors or silently give " + "incorrect results).", + category=torch.jit.TracerWarning, + stacklevel=2, + ) + return iter(self.unbind(0)) + + def __hash__(self): + # Do NOT handle __torch_function__ here as user's default + # implementation that handle most functions will most likely do it wrong. + # It can be easily overridden by defining this method on the user + # subclass if needed. + return id(self) + + def __dir__(self): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__dir__, (self,), self) + tensor_methods = dir(self.__class__) + tensor_methods.remove("volatile") # deprecated + attrs = list(self.__dict__.keys()) + keys = tensor_methods + attrs + + # property only available dense, cuda tensors + if (not self.is_cuda) or self.is_sparse: + keys.remove("__cuda_array_interface__") + + return sorted(keys) + + # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` + __array_priority__ = 1000 # prefer Tensor ops over numpy ones + + def __array__(self, dtype=None): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__array__, (self,), self, dtype=dtype) + if dtype is None: + return self.numpy() + else: + return self.numpy().astype(dtype, copy=False) + + # Wrap Numpy array again in a suitable tensor when done, to support e.g. + # `numpy.sin(tensor) -> tensor` or `numpy.greater(tensor, 0) -> ByteTensor` + def __array_wrap__(self, array): + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.__array_wrap__, (self,), self, array=array + ) + if array.dtype == bool: + # Workaround, torch has no built-in bool tensor + array = array.astype("uint8") + return torch.from_numpy(array) + + def __contains__(self, element): + r"""Check if `element` is present in tensor + + Args: + element (Tensor or scalar): element to be checked + for presence in current tensor" + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__contains__, (self,), self, element) + if isinstance( + element, (torch.Tensor, Number, torch.SymInt, torch.SymFloat, torch.SymBool) + ): + # type hint doesn't understand the __contains__ result array + return (element == self).any().item() # type: ignore[union-attr] + + raise RuntimeError( + f"Tensor.__contains__ only supports Tensor or scalar, but you passed in a {type(element)}." + ) + + @property + def __cuda_array_interface__(self): + """Array view description for cuda tensors. + + See: + https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + """ + if has_torch_function_unary(self): + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.__cuda_array_interface__.__get__, (self,), self) # type: ignore[attr-defined] + + # raise AttributeError for unsupported tensors, so that + # hasattr(cpu_tensor, "__cuda_array_interface__") is False. + if not self.is_cuda: + raise AttributeError( + "Can't get __cuda_array_interface__ on non-CUDA tensor type: %s " + "If CUDA data is required use tensor.cuda() to copy tensor to device memory." + % self.type() + ) + + if self.is_sparse: + raise AttributeError( + "Can't get __cuda_array_interface__ on sparse type: %s " + "Use Tensor.to_dense() to convert to a dense tensor first." + % self.type() + ) + + # RuntimeError, matching tensor.__array__() behavior. + if self.requires_grad: + raise RuntimeError( + "Can't get __cuda_array_interface__ on Variable that requires grad. " + "If gradients aren't required, use var.detach() to get Variable that doesn't require grad." + ) + + # CUDA devices are little-endian and tensors are stored in native byte + # order. 1-byte entries are endian-agnostic. + typestr = { + torch.complex64: " 0 else 0 + data = (data_ptr, False) # read-only is false + + return dict(typestr=typestr, shape=shape, strides=strides, data=data, version=2) + + def storage_type(self): + r"""storage_type() -> type + + Returns the type of the underlying storage. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.storage_type, (self,), self) + + torch.storage._warn_typed_storage_removal() + + return self._typed_storage()._get_legacy_storage_class() + + def refine_names(self, *names): + r"""Refines the dimension names of :attr:`self` according to :attr:`names`. + + Refining is a special case of renaming that "lifts" unnamed dimensions. + A ``None`` dim can be refined to have any name; a named dim can only be + refined to have the same name. + + Because named tensors can coexist with unnamed tensors, refining names + gives a nice way to write named-tensor-aware code that works with both + named and unnamed tensors. + + :attr:`names` may contain up to one Ellipsis (``...``). + The Ellipsis is expanded greedily; it is expanded in-place to fill + :attr:`names` to the same length as ``self.dim()`` using names from the + corresponding indices of ``self.names``. + + Python 2 does not support Ellipsis but one may use a string literal + instead (``'...'``). + + Args: + names (iterable of str): The desired names of the output tensor. May + contain up to one Ellipsis. + + Examples:: + + >>> imgs = torch.randn(32, 3, 128, 128) + >>> named_imgs = imgs.refine_names('N', 'C', 'H', 'W') + >>> named_imgs.names + ('N', 'C', 'H', 'W') + + >>> tensor = torch.randn(2, 3, 5, 7, 11) + >>> tensor = tensor.refine_names('A', ..., 'B', 'C') + >>> tensor.names + ('A', None, None, 'B', 'C') + + .. warning:: + The named tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.refine_names, (self,), self, *names) + names = resolve_ellipsis(names, self.names, "refine_names") + return super().refine_names(names) + + def align_to(self, *names): + r"""Permutes the dimensions of the :attr:`self` tensor to match the order + specified in :attr:`names`, adding size-one dims for any new names. + + All of the dims of :attr:`self` must be named in order to use this method. + The resulting tensor is a view on the original tensor. + + All dimension names of :attr:`self` must be present in :attr:`names`. + :attr:`names` may contain additional names that are not in ``self.names``; + the output tensor has a size-one dimension for each of those new names. + + :attr:`names` may contain up to one Ellipsis (``...``). + The Ellipsis is expanded to be equal to all dimension names of :attr:`self` + that are not mentioned in :attr:`names`, in the order that they appear + in :attr:`self`. + + Python 2 does not support Ellipsis but one may use a string literal + instead (``'...'``). + + Args: + names (iterable of str): The desired dimension ordering of the + output tensor. May contain up to one Ellipsis that is expanded + to all unmentioned dim names of :attr:`self`. + + Examples:: + + >>> tensor = torch.randn(2, 2, 2, 2, 2, 2) + >>> named_tensor = tensor.refine_names('A', 'B', 'C', 'D', 'E', 'F') + + # Move the F and E dims to the front while keeping the rest in order + >>> named_tensor.align_to('F', 'E', ...) + + .. warning:: + The named tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.align_to, (self,), self, *names) + ellipsis_idx = single_ellipsis_index(names, "align_to") + if ellipsis_idx is None: + return super().align_to(names) + return super().align_to( + [name for name in names if not is_ellipsis(name)], ellipsis_idx + ) + + def unflatten(self, dim, sizes): + r""" + unflatten(dim, sizes) -> Tensor + + See :func:`torch.unflatten`. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.unflatten, (self,), self, dim, sizes) + + if not sizes: + raise RuntimeError("unflatten: sizes must be non-empty") + + names = None + if isinstance(sizes, OrderedDict) or ( + isinstance(sizes, (tuple, list)) and isinstance(sizes[0], (tuple, list)) + ): + names, sizes = unzip_namedshape(sizes) + return super().unflatten(dim, sizes, names) + else: + return super().unflatten(dim, sizes) + + def rename_(self, *names, **rename_map): + """In-place version of :meth:`~Tensor.rename`.""" + + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.rename_, (self,), self, *names, **rename_map + ) + + # Note [rename_ / rename API] + # The Python API for these is different from the C++ API. In Python: + # 1) tensor.rename(*names) takes a vararglist of names + # 2) tensor.rename(**rename_map) takes a map of names to rename. + # C++ is static, making it difficult to implement similar behavior. + return update_names(self, names, rename_map, inplace=True) + + def rename(self, *names, **rename_map): + """Renames dimension names of :attr:`self`. + + There are two main usages: + + ``self.rename(**rename_map)`` returns a view on tensor that has dims + renamed as specified in the mapping :attr:`rename_map`. + + ``self.rename(*names)`` returns a view on tensor, renaming all + dimensions positionally using :attr:`names`. + Use ``self.rename(None)`` to drop names on a tensor. + + One cannot specify both positional args :attr:`names` and keyword args + :attr:`rename_map`. + + Examples:: + + >>> imgs = torch.rand(2, 3, 5, 7, names=('N', 'C', 'H', 'W')) + >>> renamed_imgs = imgs.rename(N='batch', C='channels') + >>> renamed_imgs.names + ('batch', 'channels', 'H', 'W') + + >>> renamed_imgs = imgs.rename(None) + >>> renamed_imgs.names + (None, None, None, None) + + >>> renamed_imgs = imgs.rename('batch', 'channel', 'height', 'width') + >>> renamed_imgs.names + ('batch', 'channel', 'height', 'width') + + .. warning:: + The named tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.rename, (self,), self, *names, **rename_map + ) + + # See Note [rename_ / rename API] + return update_names(self, names, rename_map, inplace=False) + + def to_sparse_coo(self): + """Convert a tensor to :ref:`coordinate format `. + + Examples:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_coo() + >>> sparse._nnz() + 25 + + """ + return self.to_sparse() + + def dim_order(self): + """ + + dim_order() -> tuple + + Returns a tuple of int describing the dim order or physical layout of :attr:`self`. + + Args: + None + + Dim order represents how dimensions are laid out in memory, + starting from the outermost to the innermost dimension. + + Example:: + >>> torch.empty((2, 3, 5, 7)).dim_order() + (0, 1, 2, 3) + >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).dim_order() + (0, 2, 3, 1) + + .. warning:: + The dim_order tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.dim_order, (self,), self) + + import torch._prims_common as utils + + return tuple(utils.compute_elementwise_output_logical_to_physical_perm(self)) + + def _update_names(self, names, inplace): + if has_torch_function_unary(self): + return handle_torch_function( + Tensor._update_names, (self,), self, names, inplace + ) + + # See Note [rename_ / rename API] + if inplace: + return super().rename_(names) + else: + return super().rename(names) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + """ + This __torch_function__ implementation wraps subclasses such that + methods called on subclasses return a subclass instance instead of + a ``torch.Tensor`` instance. + + One corollary to this is that you need coverage for torch.Tensor + methods if implementing __torch_function__ for subclasses. + + We recommend always calling ``super().__torch_function__`` as the base + case when doing the above. + + While not mandatory, we recommend making `__torch_function__` a classmethod. + """ + if kwargs is None: + kwargs = {} + + if not all(issubclass(cls, t) for t in types): + return NotImplemented + + with _C.DisableTorchFunctionSubclass(): + ret = func(*args, **kwargs) + if func in get_default_nowrap_functions(): + return ret + else: + return _convert(ret, cls) + + __torch_dispatch__ = _C._disabled_torch_dispatch_impl + + def __dlpack__(self, stream=None): + """ + Creates a DLpack `capsule https://data-apis.org/array-api/latest/design_topics/data_interchange.html#data-interchange`_ + of the current tensor to be exported to other libraries. + + This function will be called from the `from_dlpack` method + of the library that will consume the capsule. `from_dlpack` passes the current + stream to this method as part of the specification. + + Args: + stream (integer or None): An optional Python integer representing a + pointer to a CUDA stream. The current stream is synchronized with + this stream before the capsule is created, and since the capsule + shares its storage with the tensor this make it safe to access from + both streams. If None or -1 is passed then no synchronization is performed. + If 1 (on CUDA) or 0 (on ROCM) then the default stream is used for + synchronization. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__dlpack__, (self,), self, stream) + + # DLPack capsules can't capture all of PyTorch's semantics, + # so we prohibit exporting tensors that would lose their properties like + # requires_grad and having the conjugate bit set. + if self.requires_grad: + raise RuntimeError( + "Can't export tensors that require gradient, use tensor.detach()" + ) + if self.is_conj(): + raise RuntimeError("Can't export tensors with the conjugate bit set") + if self.layout != torch.strided: + raise RuntimeError( + "Can't export tensors with layout other than torch.strided" + ) + + if stream is not None and type(stream) is not int: + # Stream pointers in CUDA/ROCm are uniquely numbered and can + # be retrieved from their integer value. + raise TypeError("stream must be ``int`` or ``none``") + elif stream is not None and stream != -1: + if self.device.type == "cuda": + # NB: This logic handles the special case values for default + # streams and must be kept in sync with from_dlpack in + # torch/utils/dlpack.py + if stream == 1 and torch.version.hip is None: + stream = torch.cuda.default_stream() + elif stream == 0 and torch.version.hip is not None: + stream = torch.cuda.default_stream() + else: + stream = torch.cuda.ExternalStream(stream) + # Only synchronize on different streams + sync_stream = torch.cuda.current_stream() + if stream != sync_stream: + event = torch.cuda.Event() + event.record(sync_stream) + stream.wait_event(event) + return torch.to_dlpack(self) + + def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__dlpack_device__, (self,), self) + device = self.device + idx = device.index if device.index is not None else 0 + torch_device_type = device.type + if torch_device_type == "cuda" and torch.version.hip is not None: + device_type = DLDeviceType.kDLROCM + elif torch_device_type == "cpu" and self.is_pinned(): + device_type = DLDeviceType.kDLCPUPinned + elif torch_device_type == "cuda": + device_type = DLDeviceType.kDLGPU + elif torch_device_type == "cpu": + device_type = DLDeviceType.kDLCPU + elif self.device.type == "xpu": + device_type = DLDeviceType.kDLOneAPI + else: + raise ValueError(f"Unknown device type {torch_device_type} for Dlpack") + return (device_type, idx) + + __module__ = "torch" + + +def _convert(ret, cls): + if cls is Tensor: + return ret + + if isinstance(ret, Tensor) and not isinstance(ret, cls): + ret = ret.as_subclass(cls) + + if isinstance(ret, (tuple, list)): + # Also handles things like namedtuples + ret = type(ret)(_convert(r, cls) for r in ret) + + return ret diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_vmap_internals.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_vmap_internals.py new file mode 100644 index 0000000000000000000000000000000000000000..8440abccb23904e935878e245b390465e04b5db0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_vmap_internals.py @@ -0,0 +1,237 @@ +import functools +import warnings +from typing import Any, Callable, List, Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.utils._pytree import _broadcast_to_and_flatten, tree_flatten, tree_unflatten + +in_dims_t = Union[int, Tuple] +out_dims_t = Union[int, Tuple[int, ...]] + + +# Checks that all args-to-be-batched have the same batch dim size +def _validate_and_get_batch_size( + flat_in_dims: List[Optional[int]], flat_args: List +) -> int: + batch_sizes = [ + arg.size(in_dim) + for in_dim, arg in zip(flat_in_dims, flat_args) + if in_dim is not None + ] + if batch_sizes and any(size != batch_sizes[0] for size in batch_sizes): + raise ValueError( + f"vmap: Expected all tensors to have the same size in the mapped " + f"dimension, got sizes {batch_sizes} for the mapped dimension" + ) + return batch_sizes[0] + + +def _num_outputs(batched_outputs: Union[Tensor, Tuple[Tensor, ...]]) -> int: + if isinstance(batched_outputs, tuple): + return len(batched_outputs) + return 1 + + +# If value is a tuple, check it has length `num_elements`. +# If value is not a tuple, make a tuple with `value` repeated `num_elements` times +def _as_tuple( + value: Any, num_elements: int, error_message_lambda: Callable[[], str] +) -> Tuple: + if not isinstance(value, tuple): + return (value,) * num_elements + if len(value) != num_elements: + raise ValueError(error_message_lambda()) + return value + + +# Creates BatchedTensors for every Tensor in arg that should be batched. +# Returns the (potentially) batched arguments and the batch_size. +def _create_batched_inputs( + in_dims: in_dims_t, args: Tuple, vmap_level: int, func: Callable +) -> Tuple[Tuple, int]: + if not isinstance(in_dims, int) and not isinstance(in_dims, tuple): + raise ValueError( + f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): " + f"expected `in_dims` to be int or a (potentially nested) tuple " + f"matching the structure of inputs, got: {type(in_dims)}." + ) + if len(args) == 0: + raise ValueError( + f"vmap({_get_name(func)})(): got no inputs. Maybe you forgot to add " + f"inputs, or you are trying to vmap over a function with no inputs. " + f"The latter is unsupported." + ) + + flat_args, args_spec = tree_flatten(args) + flat_in_dims = _broadcast_to_and_flatten(in_dims, args_spec) + if flat_in_dims is None: + raise ValueError( + f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): " + f"in_dims is not compatible with the structure of `inputs`. " + f"in_dims has structure {tree_flatten(in_dims)[1]} but inputs " + f"has structure {args_spec}." + ) + + for arg, in_dim in zip(flat_args, flat_in_dims): + if not isinstance(in_dim, int) and in_dim is not None: + raise ValueError( + f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): " + f"Got in_dim={in_dim} for an input but in_dim must be either " + f"an integer dimension or None." + ) + if isinstance(in_dim, int) and not isinstance(arg, Tensor): + raise ValueError( + f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): " + f"Got in_dim={in_dim} for an input but the input is of type " + f"{type(arg)}. We cannot vmap over non-Tensor arguments, " + f"please use None as the respective in_dim" + ) + if in_dim is not None and (in_dim < 0 or in_dim >= arg.dim()): + raise ValueError( + f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): " + f"Got in_dim={in_dim} for some input, but that input is a Tensor " + f"of dimensionality {arg.dim()} so expected in_dim to satisfy " + f"0 <= in_dim < {arg.dim()}." + ) + + batch_size = _validate_and_get_batch_size(flat_in_dims, flat_args) + # See NOTE [Ignored _remove_batch_dim, _add_batch_dim] + batched_inputs = [ + arg if in_dim is None else torch._add_batch_dim(arg, in_dim, vmap_level) + for in_dim, arg in zip(flat_in_dims, flat_args) + ] + return tree_unflatten(batched_inputs, args_spec), batch_size + + +# Undos the batching (and any batch dimensions) associated with the `vmap_level`. +def _unwrap_batched( + batched_outputs: Union[Tensor, Tuple[Tensor, ...]], + out_dims: out_dims_t, + vmap_level: int, + batch_size: int, + func: Callable, + allow_none_pass_through: bool = False, +) -> Tuple: + num_outputs = _num_outputs(batched_outputs) + out_dims_as_tuple = _as_tuple( + out_dims, + num_outputs, + lambda: f"vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must " + f"have one dim per output (got {num_outputs} outputs) of {_get_name(func)}.", + ) + + # NOTE [Ignored _remove_batch_dim, _add_batch_dim] + # There is something wrong with our type bindings for functions that begin + # with '_', see #40397. + if isinstance(batched_outputs, Tensor): + out_dim = out_dims_as_tuple[0] + return torch._remove_batch_dim(batched_outputs, vmap_level, batch_size, out_dim) # type: ignore[return-value] + if allow_none_pass_through: + return tuple( + ( + torch._remove_batch_dim(out, vmap_level, batch_size, out_dim) + if out is not None + else None + ) + for out, out_dim in zip(batched_outputs, out_dims_as_tuple) + ) + else: + return tuple( + torch._remove_batch_dim(out, vmap_level, batch_size, out_dim) + for out, out_dim in zip(batched_outputs, out_dims_as_tuple) + ) + + +# Checks that `fn` returned one or more Tensors and nothing else. +# NB: A python function that return multiple arguments returns a single tuple, +# so we are effectively checking that `outputs` is a single Tensor or a tuple of +# Tensors. +def _validate_outputs(outputs: Any, func: Callable) -> None: + if isinstance(outputs, Tensor): + return + if not isinstance(outputs, tuple): + raise ValueError( + f"vmap({_get_name(func)}, ...): `{_get_name(func)}` must only return " + f"Tensors, got type {type(outputs)} as the return." + ) + for idx, output in enumerate(outputs): + if isinstance(output, Tensor): + continue + raise ValueError( + f"vmap({_get_name(func)}, ...): `{_get_name(func)}` must only return " + f"Tensors, got type {type(output)} for return {idx}." + ) + + +def _check_out_dims_is_int_or_int_tuple(out_dims: out_dims_t, func: Callable) -> None: + if isinstance(out_dims, int): + return + if not isinstance(out_dims, tuple) or not all( + isinstance(out_dim, int) for out_dim in out_dims + ): + raise ValueError( + f"vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must be " + f"an int or a tuple of int representing where in the outputs the " + f"vmapped dimension should appear." + ) + + +def _get_name(func: Callable): + if hasattr(func, "__name__"): + return func.__name__ + + # Not all callables have __name__, in fact, only static functions/methods do. + # A callable created via functools.partial or an nn.Module, to name some + # examples, don't have a __name__. + return repr(func) + + +# vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors, +# sends those into func, and then unwraps the output BatchedTensors. Operations +# on BatchedTensors perform the batched operations that the user is asking for. +def vmap(func: Callable, in_dims: in_dims_t = 0, out_dims: out_dims_t = 0) -> Callable: + """ + Please use torch.vmap instead of this API. + """ + warnings.warn( + "Please use torch.vmap instead of torch._vmap_internals.vmap. ", + stacklevel=2, + ) + return _vmap(func, in_dims, out_dims) + + +# A version of vmap but without the initial "experimental prototype" warning +def _vmap( + func: Callable, + in_dims: in_dims_t = 0, + out_dims: out_dims_t = 0, + allow_none_pass_through: bool = False, +) -> Callable: + # The `allow_none_pass_through` argument is a temporary workaround may be removed. + # Currently it enables us to wrap the call in `autograd.grad` to the autograd engine, + # which may return None if any of the inputs are unused. See the issue discussing this: + # https://github.com/facebookresearch/functorch/issues/159. + @functools.wraps(func) + def wrapped(*args): + _check_out_dims_is_int_or_int_tuple(out_dims, func) + vmap_level = torch._C._vmapmode_increment_nesting() + try: + batched_inputs, batch_size = _create_batched_inputs( + in_dims, args, vmap_level, func + ) + batched_outputs = func(*batched_inputs) + if not allow_none_pass_through: + _validate_outputs(batched_outputs, func) + return _unwrap_batched( + batched_outputs, + out_dims, + vmap_level, + batch_size, + func, + allow_none_pass_through=allow_none_pass_through, + ) + finally: + torch._C._vmapmode_decrement_nesting() + + return wrapped diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_weights_only_unpickler.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_weights_only_unpickler.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f24bef1b51d2b978eb1083ad405832057c2107 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_weights_only_unpickler.py @@ -0,0 +1,306 @@ +# Unpickler restricted to loading only state dicts +# Restrict constructing types to a list defined in _get_allowed_globals() +# Restrict BUILD operation to `Tensor`, `Parameter` and `OrderedDict` types only +# Restrict APPEND/APPENDS to `list` +# In `GLOBALS` operation do not do class lookup by name, but rather rely on dictionary +# defined by `_get_allowed_globals()` method, that contains: +# - torch types (Storage, dtypes, Tensor, `torch.Size`), +# - `torch._utils._rebuild` functions. +# - `torch.nn.Parameter` +# - `collections.OrderedDict` + +# Based of https://github.com/python/cpython/blob/main/Lib/pickle.py +# Expected to be useful for loading PyTorch model weights +# For example: +# data = urllib.request.urlopen('https://download.pytorch.org/models/resnet50-0676ba61.pth').read() +# buf = io.BytesIO(data) +# weights = torch.load(buf, weights_only = True) + +import functools as _functools +from collections import OrderedDict +from pickle import ( + APPEND, + APPENDS, + BINFLOAT, + BINGET, + BININT, + BININT1, + BININT2, + BINPERSID, + BINPUT, + BINUNICODE, + BUILD, + bytes_types, + decode_long, + EMPTY_DICT, + EMPTY_LIST, + EMPTY_SET, + EMPTY_TUPLE, + GLOBAL, + LONG1, + LONG_BINGET, + LONG_BINPUT, + MARK, + NEWFALSE, + NEWOBJ, + NEWTRUE, + NONE, + PROTO, + REDUCE, + SETITEM, + SETITEMS, + SHORT_BINSTRING, + STOP, + TUPLE, + TUPLE1, + TUPLE2, + TUPLE3, + UnpicklingError, +) +from struct import unpack +from sys import maxsize +from typing import Any, Dict, List + +import torch + + +# Unpickling machinery +@_functools.lru_cache(maxsize=1) +def _get_allowed_globals(): + rc: Dict[str, Any] = { + "collections.OrderedDict": OrderedDict, + "torch.nn.parameter.Parameter": torch.nn.Parameter, + "torch.serialization._get_layout": torch.serialization._get_layout, + "torch.Size": torch.Size, + "torch.Tensor": torch.Tensor, + } + # dtype + for t in [ + torch.complex32, + torch.complex64, + torch.complex128, + torch.float8_e5m2, + torch.float8_e4m3fn, + torch.float8_e5m2fnuz, + torch.float8_e4m3fnuz, + torch.float16, + torch.float32, + torch.float64, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ]: + rc[str(t)] = t + # Tensor classes + for tt in torch._tensor_classes: + rc[f"{tt.__module__}.{tt.__name__}"] = tt + # Storage classes + for ts in torch._storage_classes: + if ts not in (torch.storage.TypedStorage, torch.storage.UntypedStorage): + # Wrap legacy storage types in a dummy class + rc[f"{ts.__module__}.{ts.__name__}"] = torch.serialization.StorageType( + ts.__name__ + ) + else: + rc[f"{ts.__module__}.{ts.__name__}"] = ts + # Rebuild functions + for f in [ + torch._utils._rebuild_parameter, + torch._utils._rebuild_tensor, + torch._utils._rebuild_tensor_v2, + torch._utils._rebuild_tensor_v3, + torch._utils._rebuild_sparse_tensor, + torch._utils._rebuild_meta_tensor_no_storage, + torch._utils._rebuild_nested_tensor, + ]: + rc[f"torch._utils.{f.__name__}"] = f + + # Handles Tensor Subclasses, Tensor's with attributes. + # NOTE: It calls into above rebuild functions for regular Tensor types. + rc["torch._tensor._rebuild_from_type_v2"] = torch._tensor._rebuild_from_type_v2 + return rc + + +class Unpickler: + def __init__(self, file, *, encoding: str = "bytes"): + self.encoding = encoding + self.readline = file.readline + self.read = file.read + self.memo: Dict[int, Any] = {} + + def load(self): + """Read a pickled object representation from the open file. + + Return the reconstituted object hierarchy specified in the file. + """ + self.metastack = [] + self.stack: List[Any] = [] + self.append = self.stack.append + read = self.read + readline = self.readline + while True: + key = read(1) + if not key: + raise EOFError + assert isinstance(key, bytes_types) + # Risky operators + if key[0] == GLOBAL[0]: + module = readline()[:-1].decode("utf-8") + name = readline()[:-1].decode("utf-8") + full_path = f"{module}.{name}" + if full_path in _get_allowed_globals(): + self.append(_get_allowed_globals()[full_path]) + else: + raise RuntimeError(f"Unsupported class {full_path}") + elif key[0] == NEWOBJ[0]: + args = self.stack.pop() + cls = self.stack.pop() + if cls is not torch.nn.Parameter: + raise RuntimeError(f"Trying to instantiate unsupported class {cls}") + self.append(torch.nn.Parameter(*args)) + elif key[0] == REDUCE[0]: + args = self.stack.pop() + func = self.stack[-1] + if func not in _get_allowed_globals().values(): + raise RuntimeError( + f"Trying to call reduce for unrecognized function {func}" + ) + self.stack[-1] = func(*args) + elif key[0] == BUILD[0]: + state = self.stack.pop() + inst = self.stack[-1] + if type(inst) is torch.Tensor: + # Legacy unpickling + inst.set_(*state) + elif type(inst) is torch.nn.Parameter: + inst.__setstate__(state) + elif type(inst) is OrderedDict: + inst.__dict__.update(state) + else: + raise RuntimeError( + f"Can only build Tensor, parameter or dict objects, but got {type(inst)}" + ) + # Stack manipulation + elif key[0] == APPEND[0]: + item = self.stack.pop() + list_obj = self.stack[-1] + if type(list_obj) is not list: + raise RuntimeError( + f"Can only append to lists, but got {type(list_obj)}" + ) + list_obj.append(item) + elif key[0] == APPENDS[0]: + items = self.pop_mark() + list_obj = self.stack[-1] + if type(list_obj) is not list: + raise RuntimeError( + f"Can only extend lists, but got {type(list_obj)}" + ) + list_obj.extend(items) + elif key[0] == SETITEM[0]: + (v, k) = (self.stack.pop(), self.stack.pop()) + self.stack[-1][k] = v + elif key[0] == SETITEMS[0]: + items = self.pop_mark() + for i in range(0, len(items), 2): + self.stack[-1][items[i]] = items[i + 1] + elif key[0] == MARK[0]: + self.metastack.append(self.stack) + self.stack = [] + self.append = self.stack.append + elif key[0] == TUPLE[0]: + items = self.pop_mark() + self.append(tuple(items)) + elif key[0] == TUPLE1[0]: + self.stack[-1] = (self.stack[-1],) + elif key[0] == TUPLE2[0]: + self.stack[-2:] = [(self.stack[-2], self.stack[-1])] + elif key[0] == TUPLE3[0]: + self.stack[-3:] = [(self.stack[-3], self.stack[-2], self.stack[-1])] + # Basic types construction + elif key[0] == NONE[0]: + self.append(None) + elif key[0] == NEWFALSE[0]: + self.append(False) + elif key[0] == NEWTRUE[0]: + self.append(True) + elif key[0] == EMPTY_TUPLE[0]: + self.append(()) + elif key[0] == EMPTY_LIST[0]: + self.append([]) + elif key[0] == EMPTY_DICT[0]: + self.append({}) + elif key[0] == EMPTY_SET[0]: + self.append(set()) + elif key[0] == BININT[0]: + self.append(unpack("d", self.read(8))[0]) + elif key[0] == BINUNICODE[0]: + strlen = unpack(" maxsize: + raise RuntimeError("String is too long") + strval = str(read(strlen), "utf-8", "surrogatepass") + self.append(strval) + elif key[0] == SHORT_BINSTRING[0]: + strlen = read(1)[0] + strdata = read(strlen) + if self.encoding != "bytes": + strdata = strdata.decode(self.encoding, "strict") + self.append(strdata) + elif key[0] == BINPERSID[0]: + pid = self.stack.pop() + # Only allow persistent load of storage + if type(pid) is not tuple and not type(pid) is not int: + raise RuntimeError( + f"persistent_load id must be tuple or int, but got {type(pid)}" + ) + if ( + type(pid) is tuple + and len(pid) > 0 + and torch.serialization._maybe_decode_ascii(pid[0]) != "storage" + ): + raise RuntimeError( + f"Only persistent_load of storage is allowed, but got {pid[0]}" + ) + self.append(self.persistent_load(pid)) + elif key[0] in [BINGET[0], LONG_BINGET[0]]: + idx = (read(1) if key[0] == BINGET[0] else unpack(" None: + assert row_block_size is not None and col_block_size is not None + self._packed_params = torch.ops.sparse.qlinear_prepack(weight, bias, row_block_size, col_block_size) + + @torch.jit.export + def _weight_bias(self): + (weight, bias, block_sizes) = torch.ops.sparse.qlinear_unpack(self._packed_params) + return (weight, bias, block_sizes[0], block_sizes[1]) + + def forward(self, x): + return x + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + 'dtype'] = self.dtype + destination[prefix + '_packed_params'] = self._weight_bias() + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + assert version <= self._version + + self.dtype = state_dict.pop(prefix + 'dtype') + weight, bias, row_block_size, col_block_size = state_dict.pop(prefix + '_packed_params') + self.set_weight_bias(weight, bias, row_block_size, col_block_size) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, False, + missing_keys, unexpected_keys, error_msgs) + + @torch.jit.export + def __getstate__(self): + return self._packed_params, self.training, self.dtype + + @torch.jit.export + def __setstate__(self, state): + (self._packed_params, self.training, self.dtype) = state + + def __repr__(self): + return self._weight_bias().__repr__() + +# TODO (zaf): Inherit from `quantized.Linear` (T83294430) +class Linear(torch.nn.Module): + r""" + A quantized sparse linear module with quantized tensor as inputs and outputs. + """ + _version = 1 + _FLOAT_MODULE = torch.nn.Linear + + def __init__(self, in_features, out_features, row_block_size, col_block_size, bias=True, dtype=torch.qint8): + super().__init__() + + if dtype != torch.qint8: + raise NotImplementedError("Only QINT8 is supported for Sparse Quantized Linear") + + self.in_features = in_features + self.out_features = out_features + + if bias: + bias = torch.zeros(self.out_features, dtype=torch.float) + else: + bias = None + + qweight = torch._empty_affine_quantized([out_features, in_features], + scale=1, zero_point=0, dtype=torch.qint8) + self._packed_params = LinearPackedParams(row_block_size=row_block_size, + col_block_size=col_block_size, + dtype=dtype) + self._packed_params.set_weight_bias(qweight, bias, row_block_size, col_block_size) + self.scale = 1.0 + self.zero_point = 0 + + @classmethod + def _get_name(cls): + return 'SparseQuantizedLinear' + + def extra_repr(self): + return 'in_features={}, out_features={}, scale={}, zero_point={}, qscheme={}'.format( + self.in_features, self.out_features, self.scale, self.zero_point, self.weight().qscheme() + ) + + def __repr__(self): + return _hide_packed_params_repr(self, LinearPackedParams) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.sparse.qlinear(x, self._packed_params._packed_params, self.scale, self.zero_point) + + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + 'scale'] = torch.tensor(self.scale) + destination[prefix + 'zero_point'] = torch.tensor(self.zero_point) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + self.scale = float(state_dict[prefix + 'scale']) + state_dict.pop(prefix + 'scale') + + self.zero_point = int(state_dict[prefix + 'zero_point']) + state_dict.pop(prefix + 'zero_point') + + op_type = int(state_dict[prefix + 'op_type']) + state_dict.pop(prefix + 'op_type') + + version = local_metadata.get('version', None) + assert version <= self._version + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, False, + missing_keys, unexpected_keys, error_msgs) + + def _weight_bias(self): + return self._packed_params._weight_bias() + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor], + row_block_size: Optional[int], col_block_size: Optional[int]) -> None: + assert row_block_size is not None and col_block_size is not None + self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size) + + @classmethod + def from_float(cls, mod): + r"""Create a quantized sparse module from a float module. + + We only care about the convert at this stage, no need for observers just yet. + + TODO(zaf): Need to add the sparse params to the qconfig + """ + assert type(mod) == cls._FLOAT_MODULE, cls._get_name() + \ + '.from_float only works for ' + cls._FLOAT_MODULE.__name__ + assert hasattr(mod, 'sparse_params'), \ + ('Expecting the Linear to have `sparse_params`. Make sure you have provided arguments ' + 'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.') + sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None) # type: ignore[operator, union-attr] + assert isinstance(sparse_block_shape, (tuple, list)) + assert len(sparse_block_shape) == 2 + # TODO: Need to add options to qconfig to avoid the calibration. + # TODO: Add calibration for the sparsity + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + activation_post_process = mod.activation_post_process + weight_post_process = mod.qconfig.weight() # type: ignore[operator, union-attr] + + # Assumption is that the weight is already sparsified by the + # `sparsifier.convert` + weight = mod.weight + + weight_post_process(weight) + dtype = weight_post_process.dtype + act_scale, act_zp = activation_post_process.calculate_qparams() # type: ignore[operator, union-attr] + assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8' + w_sc, w_zp = weight_post_process.calculate_qparams() + if isinstance(w_zp, torch.Tensor): + assert not torch.any(w_zp.bool()), "All weight zero points must map to 0" + else: + assert w_zp == 0, 'Weight zero point must map to 0' + qweight = _quantize_weight(weight.float(), weight_post_process) + + row_block_size = mod.sparse_params['sparse_block_shape'][0] # type: ignore[index] + col_block_size = mod.sparse_params['sparse_block_shape'][1] # type: ignore[index] + qlinear = cls(mod.in_features, + mod.out_features, + row_block_size, + col_block_size, + dtype=dtype) + qlinear.set_weight_bias(qweight, mod.bias, + row_block_size, col_block_size) # type: ignore[arg-type] + qlinear.scale = float(act_scale) + qlinear.zero_point = int(act_zp) + return qlinear diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3d934f57857436dd0ea7945e327cf3d0532c4c10 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/sparse/quantized/utils.py @@ -0,0 +1,42 @@ +import threading + +__all__ = [ + "LinearBlockSparsePattern" +] + +def _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size): + return (row_block_size == 1 and col_block_size == 4) or \ + (row_block_size == 8 and col_block_size == 1) + +# This is a stop-gap measure as current flow does not allow module +# specific block sparse pattern. +# Infact there is no way to convey sparse pattern via module config +# of quantization flow. Thus using the global context to convey +# sparsity pattern. +# Once the flow supports it, this should be removed. +class LinearBlockSparsePattern: + rlock = threading.RLock() + row_block_size = 1 + col_block_size = 4 + prev_row_block_size = 1 + prev_col_block_size = 4 + + def __init__(self, row_block_size=1, col_block_size=4): + assert _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size) + LinearBlockSparsePattern.rlock.acquire() + LinearBlockSparsePattern.prev_row_block_size = LinearBlockSparsePattern.row_block_size + LinearBlockSparsePattern.prev_col_block_size = LinearBlockSparsePattern.col_block_size + LinearBlockSparsePattern.row_block_size = row_block_size + LinearBlockSparsePattern.col_block_size = col_block_size + + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_value, backtrace): + LinearBlockSparsePattern.row_block_size = LinearBlockSparsePattern.prev_row_block_size + LinearBlockSparsePattern.col_block_size = LinearBlockSparsePattern.prev_col_block_size + LinearBlockSparsePattern.rlock.release() + + @staticmethod + def block_size(): + return LinearBlockSparsePattern.row_block_size, LinearBlockSparsePattern.col_block_size diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..206bfdae4cf52df70bb11401b6e54559a20e896f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py new file mode 100644 index 0000000000000000000000000000000000000000..7f4fcb461e22ac7c55d6f2c6b1e6298bd4827bb3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py @@ -0,0 +1,309 @@ +import abc +import torch +from typing import Optional, Tuple, List, Any, Dict +from ...sparsifier import base_sparsifier +from collections import defaultdict +from torch import nn +import copy +from ...sparsifier import utils +from torch.nn.utils import parametrize +import sys +import warnings + +if not sys.warnoptions: + # to suppress repeated warnings when being used in a training loop. + warnings.simplefilter("once") + +__all__ = ['BaseDataSparsifier'] + +EMBEDDING_TYPES = { + nn.Embedding, + nn.EmbeddingBag, +} + +SUPPORTED_TYPES = { + torch.Tensor, + nn.Parameter, + *EMBEDDING_TYPES, +} + + +class _Container(nn.Module): + pass + + +class BaseDataSparsifier(base_sparsifier.BaseSparsifier): + r""" + Base Data Sparsifier class for all Data sparsifiers. + The abstract class accepts raw torch tensors / embedding / embedding bags (refer to SUPPORTED_TYPES above) + to prepare for sparsification. + In this case, mask (and parametrizations) is owned by the class and not by the user. + Specifically, the container object inside the class maintains the mask and parametrizations of the input data + + Args: + data_list (list of tuples) + list of (name, data) tuples to sparsify. Lookup SUPPORTED_TYPES + for type of data. Internally, a container module handles the data sparsification. + + defaults (dict) + default configurations will be attached to the + configuration. Only the keys that don't exist in the `config` will + be updated. + Example:: + >>> # xdoctest: +SKIP + >>> data_list = [('tensor_1', torch.randn(3,3)), ('tensor_2', torch.randn(4,4))] + >>> defaults = {'sparsity_level': 0.7} + >>> sparsifier = DerivedDataSparsifier(data_list = data_list, **defaults) # Some sparsifier that inherits BaseDataSparsifier + >>> new_tensor_to_add = {'name': 'tensor_3', 'data': torch.randn(5,5), 'sparsity_level': 0.3} + >>> sparsifier.add_data(**new_tensor_to_add) + >>> # tensor_1 and tensor_2 will have sparsity_level of 0.7 but tensor_3 will have sparsity_level=0.3 + """ + def __init__(self, data_list: Optional[List[Tuple[str, Any]]] = None, **defaults): + super().__init__(defaults=defaults) + + self._container = _Container() + + self.data_groups: Dict[str, Dict] = defaultdict(dict) # name -> {**config} + if data_list is not None: + # add data with default config here + [self.add_data(name, data, **self.defaults) for name, data in data_list] + + def prepare(self): + raise NotImplementedError("this function is undefined for this class") + + def _extract_weight(self, data): + # extract the weight parameter instead of underlying data + if type(data) in [torch.Tensor, nn.Parameter]: + return data + elif type(data) in EMBEDDING_TYPES: + return data.weight + + def add_data(self, name: str, data, reuse_mask=True, **config): + r""" Configures and parametrizes the internal container model with name and data. + + **Note**: + 1. If the data with name already exists, it replaces the data. + 2. While replacing, the old mask is reused when `reuse_mask=True` + 3. If `reuse_mask=True`, then the replacing data needs to have the same shape as that of old data. + 4. By default, the config of the replaced data is used as config for the replacing data, unless something + is specified in the config dictionary. + """ + assert type(data) in SUPPORTED_TYPES, \ + "specified data type not supported at the moment" + local_args = copy.deepcopy(self.defaults) + local_args.update(config) + weight = self._extract_weight(data) + + # Bookkeeping in the container class + mask = local_args.get('mask', torch.ones_like(weight)) + param_class = local_args.get('parametrization', utils.FakeSparsity) + + if name in self.state: + # If the named data already exists - replace + warnings.warn("Replacing existing data of the same name. - Did you mean a different name?") + + # reuse old config + old_args = self.data_groups[name] + local_args = copy.deepcopy(old_args) + local_args.update(config) + + if reuse_mask: + current_data = self.get_data(name=name) + assert weight.shape == current_data.shape, \ + "to retain the old mask, the shape of the new data must be the same as the previous one" + mask = self.get_mask(name=name) # reuse mask instead of creating a new one + + self._delete_data(name=name) + + # parameter creates a deepcopy of the weight inside, so create a buffer + self._container.register_buffer(name=name, tensor=weight) + parametrize.register_parametrization(self._container, name, param_class(mask)) + self.state[name]['mask'] = mask + self.data_groups[name] = local_args + return getattr(self._container, name) + + def get_data(self, name: str, return_original: bool = True): + r"""Returns weight tensor (or data) + Args: + - name: name of the data to be returned + - return_original returns weight tensor without applying parametrization if True + else - returns the sparsified version (parametrized) + """ + if name not in self.data_groups: + raise ValueError("data with specified name does not exist") + + if return_original: + if not parametrize.is_parametrized(self._container, name): + raise ValueError("mask squashed - original mask value does not exist") + data = getattr(self._container.parametrizations, name).original + return data + else: + return getattr(self._container, name) + + def _convert_mask(self, states, sparse_coo=True): + r"""Converts the mask to sparse coo or dense tensors depending on the `sparse_coo` argument. + """ + states = copy.deepcopy(states) + for state in states.values(): + if sparse_coo: + state['mask'] = state['mask'].to_sparse_coo() + else: + state['mask'] = state['mask'].to_dense() + + return states + + def state_dict(self): + r"""Returns the state of the optimizer as a :class:`dict`. + + It contains: + * state - contains name -> mask mapping. + * data_groups - a list containing all sparsity configuration groups + with the key name specifying the name of the data + * container_state_dict - the state dictionary of the internal + container model used for sparsification + """ + state = self._convert_mask(self.state) + return { + 'state': state, + 'data_groups': self.data_groups, + '_container': self._container.state_dict() + } + + def _load_container_from_state(self, states, data_groups, container_state_dict): + r"""This restores the state of the container specifically based on the data present in state and data_groups + If the data was parametrized, then the data would be added to the container and then parametrized, + else it would just add the attribute the container. + """ + for name, state in states.items(): + config_name = data_groups.get(name, None) + if config_name is None: + raise RuntimeError(f"Error loading {name}") + + # check if the data with such a name was parametrized, if so parametrize + # otherwise just set the attribute and continue + parametrized_name = f'parametrizations.{name}.original' + parametrized = False + data = container_state_dict.get(name, None) + if name in container_state_dict: + # the parametrization was probably removed for this + data = container_state_dict.get(name) + + elif parametrized_name in container_state_dict: + # so the weight was parametrized + data = container_state_dict.get(parametrized_name) + parametrized = True + + else: + raise RuntimeError(f"Error loading {name}") + + self._container.register_buffer(name=name, tensor=data) + + if parametrized: + # register parameter if parametrized + mask = state.get('mask', torch.ones_like(data)) + param_class = data_groups.get('parametrization', utils.FakeSparsity) # change once public_api for utils is fixed! + parametrize.register_parametrization(self._container, name, param_class(mask)) + + def load_state_dict(self, state_dict, strict=True): + r"""The load_state_dict() restores the state of the sparsifier based on the state_dict + + Args: + * state_dict - the dictionary that to which the current sparsifier needs to be restored to + * strict - If True - the sparsifier is reset and is restored exactly to the state in state_dict. + If False - the current sparsifier is not reset before loading the state_dict i.e. data added + before loading the state_dict is not erased. + """ + states = copy.deepcopy(state_dict['state']) + data_groups = copy.deepcopy(state_dict['data_groups']) + container_state_dict = copy.deepcopy(state_dict['_container']) + + states = self._convert_mask(states, sparse_coo=False) # convert sparse coo mask to dense + if strict: + # if strict load -> then reset container + self._container = _Container() + + self._load_container_from_state(states, data_groups, container_state_dict) + + if not strict: + states.update(self.state) + data_groups.update(self.data_groups) + + self.__setstate__({'state': states, 'data_groups': data_groups}) + + def __setstate__(self, state): + if '_container' in state: # If container object is in state then load model + container_dict = state.pop('_container') + self._container = _Container() + state['state'] = self._convert_mask(state['state'], sparse_coo=False) # convert sparse coo mask to dense + self._load_container_from_state(state['state'], state['data_groups'], container_dict) + + self.__dict__.update(state) + + def __getstate__(self): + state = self._convert_mask(self.state) + return { + 'defaults': self.defaults, + 'state': state, + 'data_groups': self.data_groups, + '_container': self._container.state_dict() + } + + def __repr__(self): + format_string = self.__class__.__name__ + ' (' + for name, sparse_args in self.data_groups.items(): + format_string += '\n' + format_string += '\tData Group\n' + format_string += f'\t name: {name}\n' + for key in sorted(sparse_args.keys()): + if key == 'data': + continue + format_string += f'\t {key}: {sparse_args[key]}\n' + format_string += ')' + return format_string + + def get_mask(self, name: str): + if name not in self.state: + raise ValueError("data with specified name does not exist") + return self.state[name]['mask'] + + def squash_mask(self, *args, leave_parametrized=True, names=None, **kwargs): + r"""Squashes the sparse masks into the appropriate tensors. Also, accepts list of strings + to squash mask for. If none, squashes mask for all the keys + kwargs: + * names: list of strings to squash mask for + * sparsified: if true - applies the mask before squashing + if false - does not apply the mask before squashing + """ + if names is None: + names = list(self.data_groups.keys()) + for name in names: + parametrize.remove_parametrizations(self._container, name, leave_parametrized=leave_parametrized) + + def step(self): + if not self.enable_mask_update: + return + with torch.no_grad(): + for name, config in self.data_groups.items(): + # get non-sparsified data + data = self.get_data(name) + # need name for the mask otherwise can directly pass mask? + self.update_mask(name, data, **config) + + @abc.abstractmethod + def update_mask(self, name, data, **kwargs): + pass + + def _delete_data(self, name): + """Detaches some data from the sparsifier. + + Args: + name (str) + Name of the data to be removed from the sparsifier + + Note: + Currently private. Kind of used as a helper function when replacing data of the same name + """ + self.squash_mask(names=[name], leave_parametrized=False) # do not apply the mask while deleting + delattr(self._container, name) + self.state.pop(name) + self.data_groups.pop(name) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8110bb33822753837de9948b3ef4c0ac9aa9fc96 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c09f25c6111711be7ebf890d80fa60e895e43469 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..814df1bfc6ecb9e200279b1ee307548762f3442c Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75a3802516ca39262a0f1b40e7c646cede4885c9 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py new file mode 100644 index 0000000000000000000000000000000000000000..4a0d74d6dc933552fefd47e0e950749079a627fb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py @@ -0,0 +1,48 @@ +from typing import cast + +import torch +from .base_structured_sparsifier import BaseStructuredSparsifier, FakeStructuredSparsity + +class LSTMSaliencyPruner(BaseStructuredSparsifier): + """ + Prune packed LSTM weights based on saliency. + For each layer {k} inside a LSTM, we have two packed weight matrices + - weight_ih_l{k} + - weight_hh_l{k} + + These tensors pack the weights for the 4 linear layers together for efficiency. + + [W_ii | W_if | W_ig | W_io] + + Pruning this tensor directly will lead to weights being misassigned when unpacked. + To ensure that each packed linear layer is pruned the same amount: + 1. We split the packed weight into the 4 constituent linear parts + 2. Update the mask for each individual piece using saliency individually + + This applies to both weight_ih_l{k} and weight_hh_l{k}. + """ + + def update_mask(self, module, tensor_name, **kwargs): + weights = getattr(module, tensor_name) + + for p in getattr(module.parametrizations, tensor_name): + if isinstance(p, FakeStructuredSparsity): + mask = cast(torch.Tensor, p.mask) + + # select weights based on magnitude + if weights.dim() <= 1: + raise Exception("Structured pruning can only be applied to a 2+dim weight tensor!") + # take norm over all but first dim + dims = tuple(range(1, weights.dim())) + saliency = weights.norm(dim=dims, p=1) + + # handle weights in 4 groups + split_size = len(mask) // 4 + masks = torch.split(mask, split_size) + saliencies = torch.split(saliency, split_size) + + for keep_mask, sal in zip(masks, saliencies): + # mask smallest k values to be removed + k = int(len(keep_mask) * kwargs["sparsity_level"]) + prune = sal.topk(k, largest=False, sorted=False).indices + keep_mask.data[prune] = False # modifies underlying p.mask directly diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py new file mode 100644 index 0000000000000000000000000000000000000000..df94f7093b53db9dba9106a53b4bab0a2b9bb961 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py @@ -0,0 +1,59 @@ +import torch +from torch import nn +from torch.nn.utils.parametrize import is_parametrized + + +def module_contains_param(module, parametrization): + if is_parametrized(module): + # see if any of the module tensors have a parametriztion attached that matches the one passed in + return any( + any(isinstance(param, parametrization) for param in param_list) + for key, param_list in module.parametrizations.items() + ) + return False + + +# Structured Pruning Parameterizations +class FakeStructuredSparsity(nn.Module): + r""" + Parametrization for Structured Pruning. Like FakeSparsity, this should be attached to + the 'weight' or any other parameter that requires a mask. + + Instead of an element-wise bool mask, this parameterization uses a row-wise bool mask. + """ + + def __init__(self, mask): + super().__init__() + self.register_buffer("mask", mask) + + def forward(self, x): + assert isinstance(self.mask, torch.Tensor) + assert self.mask.shape[0] == x.shape[0] + shape = [1] * len(x.shape) + shape[0] = -1 + return self.mask.reshape(shape) * x + + def state_dict(self, *args, **kwargs): + # avoid double saving masks + return {} + + +class BiasHook: + def __init__(self, parametrization, prune_bias): + self.param = parametrization + self.prune_bias = prune_bias + + def __call__(self, module, input, output): + + if getattr(module, "_bias", None) is not None: + bias = module._bias.data + if self.prune_bias: + bias[~self.param.mask] = 0 + + # reshape bias to broadcast over output dimensions + idx = [1] * len(output.shape) + idx[1] = -1 + bias = bias.reshape(idx) + + output += bias + return output diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_mappings.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_mappings.py new file mode 100644 index 0000000000000000000000000000000000000000..726cbc6b0fc8af91f1651d3b0f0a56dbb7f21fe2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_mappings.py @@ -0,0 +1,18 @@ +__all__ = [ + "get_static_sparse_quantized_mapping", + "get_dynamic_sparse_quantized_mapping", +] + +def get_static_sparse_quantized_mapping(): + import torch.ao.nn.sparse + _static_sparse_quantized_mapping = { + torch.nn.Linear: torch.ao.nn.sparse.quantized.Linear, + } + return _static_sparse_quantized_mapping + +def get_dynamic_sparse_quantized_mapping(): + import torch.ao.nn.sparse + _dynamic_sparse_quantized_mapping = { + torch.nn.Linear: torch.ao.nn.sparse.quantized.dynamic.Linear, + } + return _dynamic_sparse_quantized_mapping diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/functional.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..7c07ae348631b50612823c5d913d075d3aa23fe7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/functional.py @@ -0,0 +1,1983 @@ +from typing import ( + List, Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING +) +import operator +import itertools + +import torch +from torch._C import _add_docstr +import torch.nn.functional as F +from ._lowrank import svd_lowrank, pca_lowrank +from .overrides import ( + has_torch_function, has_torch_function_unary, has_torch_function_variadic, + handle_torch_function) +from ._jit_internal import boolean_dispatch +from ._jit_internal import _overload as overload + +Tensor = torch.Tensor +from torch import _VF + +__all__ = [ + 'atleast_1d', + 'atleast_2d', + 'atleast_3d', + 'align_tensors', + 'broadcast_shapes', + 'broadcast_tensors', + 'cartesian_prod', + 'block_diag', + 'cdist', + 'chain_matmul', + 'einsum', + 'istft', + 'lu', + 'norm', + 'meshgrid', + 'pca_lowrank', + 'split', + 'stft', + 'svd_lowrank', + 'tensordot', + 'unique', + 'unique_consecutive', + 'unravel_index', +] + + +def broadcast_tensors(*tensors): + r"""broadcast_tensors(*tensors) -> List of Tensors + + Broadcasts the given tensors according to :ref:`broadcasting-semantics`. + + Args: + *tensors: any number of tensors of the same type + + .. warning:: + + More than one element of a broadcasted tensor may refer to a single + memory location. As a result, in-place operations (especially ones that + are vectorized) may result in incorrect behavior. If you need to write + to the tensors, please clone them first. + + Example:: + + >>> x = torch.arange(3).view(1, 3) + >>> y = torch.arange(2).view(2, 1) + >>> a, b = torch.broadcast_tensors(x, y) + >>> a.size() + torch.Size([2, 3]) + >>> a + tensor([[0, 1, 2], + [0, 1, 2]]) + """ + # This wrapper exists to support variadic args. + if has_torch_function(tensors): + return handle_torch_function(broadcast_tensors, tensors, *tensors) + return _VF.broadcast_tensors(tensors) # type: ignore[attr-defined] + + +def broadcast_shapes(*shapes): + r"""broadcast_shapes(*shapes) -> Size + + Similar to :func:`broadcast_tensors` but for shapes. + + This is equivalent to + ``torch.broadcast_tensors(*map(torch.empty, shapes))[0].shape`` + but avoids the need create to intermediate tensors. This is useful for + broadcasting tensors of common batch shape but different rightmost shape, + e.g. to broadcast mean vectors with covariance matrices. + + Example:: + + >>> torch.broadcast_shapes((2,), (3, 1), (1, 1, 1)) + torch.Size([1, 3, 2]) + + Args: + \*shapes (torch.Size): Shapes of tensors. + + Returns: + shape (torch.Size): A shape compatible with all input shapes. + + Raises: + RuntimeError: If shapes are incompatible. + """ + # This wrapper exists to support variadic args. + # TODO Move this to C++ once the jit has better support for torch.Size. + if not torch.jit.is_tracing(): + max_len = 0 + for shape in shapes: + if isinstance(shape, (int, torch.SymInt)): + if max_len < 1: + max_len = 1 + elif isinstance(shape, (tuple, list)): + s = len(shape) + if max_len < s: + max_len = s + result = [1] * max_len + + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + for shape in shapes: + if isinstance(shape, (int, torch.SymInt)): + shape = (shape,) + if isinstance(shape, (tuple, list)): + for i in range(-1, -1 - len(shape), -1): + if shape[i] < 0: + raise RuntimeError(f"Trying to create tensor with negative dimension ({shape[i]}): ({shape[i]})") + # NB: result is initialized to 1 so this is effectively an + # equals one test + if guard_size_oblivious(shape[i] == 1) or guard_size_oblivious(shape[i] == result[i]): + continue + if result[i] != 1: + raise RuntimeError("Shape mismatch: objects cannot be broadcast to a single shape") + result[i] = shape[i] + else: + raise RuntimeError("Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", shape) + return torch.Size(result) + else: + # with implementation above, torch.jit.trace hardcodes the sizes which makes subsequent replays fail + with torch.no_grad(): + scalar = torch.zeros((), device="cpu") + tensors = [scalar.expand(shape) for shape in shapes] + tensors = broadcast_tensors(*tensors) + return tensors[0].shape + + +def split( + tensor: Tensor, split_size_or_sections: Union[int, List[int]], dim: int = 0 +) -> Tuple[Tensor, ...]: + r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. + + If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will + be split into equally sized chunks (if possible). Last chunk will be smaller if + the tensor size along the given dimension :attr:`dim` is not divisible by + :attr:`split_size`. + + If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split + into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according + to :attr:`split_size_or_sections`. + + Args: + tensor (Tensor): tensor to split. + split_size_or_sections (int) or (list(int)): size of a single chunk or + list of sizes for each chunk + dim (int): dimension along which to split the tensor. + + Example:: + + >>> a = torch.arange(10).reshape(5, 2) + >>> a + tensor([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> torch.split(a, 2) + (tensor([[0, 1], + [2, 3]]), + tensor([[4, 5], + [6, 7]]), + tensor([[8, 9]])) + >>> torch.split(a, [1, 4]) + (tensor([[0, 1]]), + tensor([[2, 3], + [4, 5], + [6, 7], + [8, 9]])) + """ + if has_torch_function_unary(tensor): + return handle_torch_function( + split, (tensor,), tensor, split_size_or_sections, dim=dim) + # Overwriting reason: + # This dispatches to two ATen functions depending on the type of + # split_size_or_sections. The branching code is in _tensor.py, which we + # call here. + return tensor.split(split_size_or_sections, dim) + + +def einsum(*args: Any) -> Tensor: + r"""einsum(equation, *operands) -> Tensor + + Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation + based on the Einstein summation convention. + + Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them + in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of + this format are described below, but the general idea is to label every dimension of the input :attr:`operands` + with some subscript and define which subscripts are part of the output. The output is then computed by summing + the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the + output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. + Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). + + Equation: + + The :attr:`equation` string specifies the subscripts (letters in `[a-zA-Z]`) for each dimension of + the input :attr:`operands` in the same order as the dimensions, separating subscripts for each operand by a + comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript + must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is + repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand + must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that + appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. + The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based + on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. + + Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation + followed by the subscripts for the output. For instance, the following equation computes the transpose of a + matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and + at most once for the output. + + Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis. + Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts, + e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth + dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the + 'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not + explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions), + before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements + batch matrix multiplication `'...ij,...jk'`. + + A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis, + arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands. + + .. note:: + + ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions + covered by the ellipsis to be summed over, that is, ellipsis are not required to be part of the output. + + .. note:: + + This function uses opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) to speed up computation or to + consume less memory by optimizing contraction order. This optimization occurs when there are at least three + inputs, since the order does not matter otherwise. Note that finding _the_ optimal path is an NP-hard problem, + thus, opt_einsum relies on different heuristics to achieve near-optimal results. If opt_einsum is not available, + the default order is to contract from left to right. + + To bypass this default behavior, add the following line to disable the usage of opt_einsum and skip path + calculation: `torch.backends.opt_einsum.enabled = False` + + To specify which strategy you'd like for opt_einsum to compute the contraction path, add the following line: + `torch.backends.opt_einsum.strategy = 'auto'`. The default strategy is 'auto', and we also support 'greedy' and + 'optimal'. Disclaimer that the runtime of 'optimal' is factorial in the number of inputs! See more details in + the opt_einsum documentation (https://optimized-einsum.readthedocs.io/en/stable/path_finding.html). + + .. note:: + + As of PyTorch 1.10 :func:`torch.einsum` also supports the sublist format (see examples below). In this format, + subscripts for each operand are specified by sublists, list of integers in the range [0, 52). These sublists + follow their operands, and an extra sublist can appear at the end of the input to specify the output's + subscripts., e.g. `torch.einsum(op1, sublist1, op2, sublist2, ..., [subslist_out])`. Python's `Ellipsis` object + may be provided in a sublist to enable broadcasting as described in the Equation section above. + + Args: + equation (str): The subscripts for the Einstein summation. + operands (List[Tensor]): The tensors to compute the Einstein summation of. + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> # trace + >>> torch.einsum('ii', torch.randn(4, 4)) + tensor(-1.2104) + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> # diagonal + >>> torch.einsum('ii->i', torch.randn(4, 4)) + tensor([-0.1034, 0.7952, -0.2433, 0.4545]) + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> # outer product + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) + tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], + [-0.3744, 0.9381, 1.2685, -1.6070], + [ 0.7208, -1.8058, -2.4419, 3.0936], + [ 0.1713, -0.4291, -0.5802, 0.7350], + [ 0.5704, -1.4290, -1.9323, 2.4480]]) + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> # batch matrix multiplication + >>> As = torch.randn(3, 2, 5) + >>> Bs = torch.randn(3, 5, 4) + >>> torch.einsum('bij,bjk->bik', As, Bs) + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> # with sublist format and ellipsis + >>> torch.einsum(As, [..., 0, 1], Bs, [..., 1, 2], [..., 0, 2]) + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + >>> # batch permute + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape + torch.Size([2, 3, 5, 4]) + + >>> # equivalent to torch.nn.functional.bilinear + >>> A = torch.randn(3, 5, 4) + >>> l = torch.randn(2, 5) + >>> r = torch.randn(2, 4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + """ + import torch.backends.opt_einsum as opt_einsum + # This wrapper exists to support variadic args. + if len(args) < 2: + raise ValueError('einsum(): must specify the equation string and at least one operand, ' + 'or at least one operand and its subscripts list') + + equation = None + operands = None + + if isinstance(args[0], torch.Tensor): + # Convert the subscript list format which is an interleaving of operand and its subscripts + # list with an optional output subscripts list at the end (see documentation for more details on this) + # to the equation string format by creating the equation string from the subscripts list and grouping the + # input operands into a tensorlist (List[Tensor]). + def parse_subscript(n: int) -> str: + if n == Ellipsis: + return '...' + if n >= 0 and n < 26: + return chr(ord('A') + n) + if n >= 26 and n < 52: + return chr(ord('a') + n - 26) + raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52)') + + # Parse subscripts for input operands + equation = ','.join(''.join(parse_subscript(s) for s in l) for l in args[1::2]) + + # Parse optional output subscripts (provided when the number of arguments is odd) + if len(args) % 2 == 1: + equation += '->' + ''.join(parse_subscript(s) for s in args[-1]) + operands = args[:-1:2] + else: + operands = args[::2] + else: + equation = args[0] + operands = args[1:] + + if has_torch_function(operands): + return handle_torch_function(einsum, operands, equation, *operands) + + if len(operands) == 1 and isinstance(operands[0], (list, tuple)): + # the old interface of passing the operands as one list argument + _operands = operands[0] + # recurse incase operands contains value that has torch function + # in the original implementation this line is omitted + return einsum(equation, *_operands) + + if len(operands) <= 2 or not opt_einsum.enabled: + # the path for contracting 0 or 1 time(s) is already optimized + # or the user has disabled using opt_einsum + return _VF.einsum(equation, operands) # type: ignore[attr-defined] + + path = None + if opt_einsum.is_available(): + _opt_einsum = opt_einsum.get_opt_einsum() + tupled_path = _opt_einsum.contract_path(equation, *operands, optimize=opt_einsum.strategy)[0] + # flatten path for dispatching to C++ + path = [item for pair in tupled_path for item in pair] + return _VF.einsum(equation, operands, path=path) # type: ignore[attr-defined] + + +# This wrapper exists to support variadic args. +if TYPE_CHECKING: + # The JIT doesn't understand Union, so only add type annotation for mypy + def meshgrid(*tensors: Union[Tensor, List[Tensor]], + indexing: Optional[str] = None) -> Tuple[Tensor, ...]: + return _meshgrid(*tensors, indexing=indexing) +else: + def meshgrid(*tensors, indexing: Optional[str] = None) -> Tuple[Tensor, ...]: + r"""Creates grids of coordinates specified by the 1D inputs in `attr`:tensors. + + This is helpful when you want to visualize data over some + range of inputs. See below for a plotting example. + + Given :math:`N` 1D tensors :math:`T_0 \ldots T_{N-1}` as + inputs with corresponding sizes :math:`S_0 \ldots S_{N-1}`, + this creates :math:`N` N-dimensional tensors :math:`G_0 \ldots + G_{N-1}`, each with shape :math:`(S_0, ..., S_{N-1})` where + the output :math:`G_i` is constructed by expanding :math:`T_i` + to the result shape. + + .. note:: + 0D inputs are treated equivalently to 1D inputs of a + single element. + + .. warning:: + `torch.meshgrid(*tensors)` currently has the same behavior + as calling `numpy.meshgrid(*arrays, indexing='ij')`. + + In the future `torch.meshgrid` will transition to + `indexing='xy'` as the default. + + https://github.com/pytorch/pytorch/issues/50276 tracks + this issue with the goal of migrating to NumPy's behavior. + + .. seealso:: + + :func:`torch.cartesian_prod` has the same effect but it + collects the data in a tensor of vectors. + + Args: + tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be + treated as tensors of size :math:`(1,)` automatically + + indexing: (str, optional): the indexing mode, either "xy" + or "ij", defaults to "ij". See warning for future changes. + + If "xy" is selected, the first dimension corresponds + to the cardinality of the second input and the second + dimension corresponds to the cardinality of the first + input. + + If "ij" is selected, the dimensions are in the same + order as the cardinality of the inputs. + + Returns: + seq (sequence of Tensors): If the input has :math:`N` + tensors of size :math:`S_0 \ldots S_{N-1}``, then the + output will also have :math:`N` tensors, where each tensor + is of shape :math:`(S_0, ..., S_{N-1})`. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> y = torch.tensor([4, 5, 6]) + + Observe the element-wise pairings across the grid, (1, 4), + (1, 5), ..., (3, 6). This is the same thing as the + cartesian product. + >>> grid_x, grid_y = torch.meshgrid(x, y, indexing='ij') + >>> grid_x + tensor([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]]) + >>> grid_y + tensor([[4, 5, 6], + [4, 5, 6], + [4, 5, 6]]) + + This correspondence can be seen when these grids are + stacked properly. + >>> torch.equal(torch.cat(tuple(torch.dstack([grid_x, grid_y]))), + ... torch.cartesian_prod(x, y)) + True + + `torch.meshgrid` is commonly used to produce a grid for + plotting. + >>> # xdoctest: +REQUIRES(module:matplotlib) + >>> # xdoctest: +REQUIRES(env:DOCTEST_SHOW) + >>> import matplotlib.pyplot as plt + >>> xs = torch.linspace(-5, 5, steps=100) + >>> ys = torch.linspace(-5, 5, steps=100) + >>> x, y = torch.meshgrid(xs, ys, indexing='xy') + >>> z = torch.sin(torch.sqrt(x * x + y * y)) + >>> ax = plt.axes(projection='3d') + >>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy()) + >>> plt.show() + + .. image:: ../_static/img/meshgrid.png + :width: 512 + + """ + return _meshgrid(*tensors, indexing=indexing) + + +def _meshgrid(*tensors, indexing: Optional[str]): + if has_torch_function(tensors): + return handle_torch_function(meshgrid, tensors, *tensors, indexing=indexing) + if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): + # the old interface of passing the operands as one list argument + tensors = tensors[0] # type: ignore[assignment] + + # Continue allowing call of old method that takes no indexing + # kwarg for forward compatibility reasons. + # + # Remove this two weeks after landing. + kwargs = {} if indexing is None else {'indexing': indexing} + return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined] + + +def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, + win_length: Optional[int] = None, window: Optional[Tensor] = None, + center: bool = True, pad_mode: str = 'reflect', normalized: bool = False, + onesided: Optional[bool] = None, + return_complex: Optional[bool] = None) -> Tensor: + r"""Short-time Fourier transform (STFT). + + .. warning:: + From version 1.8.0, :attr:`return_complex` must always be given + explicitly for real inputs and `return_complex=False` has been + deprecated. Strongly prefer `return_complex=True` as in a future + pytorch release, this function will only return complex tensors. + + Note that :func:`torch.view_as_real` can be used to recover a real + tensor with an extra last dimension for real and imaginary components. + + .. warning:: + From version 2.1, a warning will be provided if a :attr:`window` is + not specified. In a future release, this attribute will be required. + Not providing a window currently defaults to using a rectangular window, + which may result in undesirable artifacts. Consider using tapered windows, + such as :func:`torch.hann_window`. + + The STFT computes the Fourier transform of short overlapping windows of the + input. This giving frequency components of the signal as they change over + time. The interface of this function is modeled after (but *not* a drop-in + replacement for) librosa_ stft function. + + .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html + + Ignoring the optional batch dimension, this method computes the following + expression: + + .. math:: + X[\omega, m] = \sum_{k = 0}^{\text{win\_length-1}}% + \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % + \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{n\_fft}}\right), + + where :math:`m` is the index of the sliding window, and :math:`\omega` is + the frequency :math:`0 \leq \omega < \text{n\_fft}` for ``onesided=False``, + or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for ``onesided=True``. + + * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time + sequences. + + * If :attr:`hop_length` is ``None`` (default), it is treated as equal to + ``floor(n_fft / 4)``. + + * If :attr:`win_length` is ``None`` (default), it is treated as equal to + :attr:`n_fft`. + + * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from + :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is + treated as if having :math:`1` everywhere in the window. If + :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on + both sides to length :attr:`n_fft` before being applied. + + * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on + both sides so that the :math:`t`-th frame is centered at time + :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame + begins at time :math:`t \times \text{hop\_length}`. + + * :attr:`pad_mode` determines the padding method used on :attr:`input` when + :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for + all available options. Default is ``"reflect"``. + + * If :attr:`onesided` is ``True`` (default for real input), only values for + :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor + \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because + the real-to-complex Fourier transform satisfies the conjugate symmetry, + i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. + Note if the input or window tensors are complex, then :attr:`onesided` + output is not possible. + + * If :attr:`normalized` is ``True`` (default is ``False``), the function + returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. + + * If :attr:`return_complex` is ``True`` (default if input is complex), the + return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``, + the output is a ``input.dim() + 2`` dimensional real tensor where the last + dimension represents the real and imaginary components. + + Returns either a complex tensor of size :math:`(* \times N \times T)` if + :attr:`return_complex` is true, or a real tensor of size :math:`(* \times N + \times T \times 2)`. Where :math:`*` is the optional batch size of + :attr:`input`, :math:`N` is the number of frequencies where STFT is applied + and :math:`T` is the total number of frames used. + + .. warning:: + This function changed signature at version 0.4.1. Calling with the + previous signature may cause error or return incorrect result. + + Args: + input (Tensor): the input tensor of shape `(B?, L)` where `B?` is an optional + batch dimension + n_fft (int): size of Fourier transform + hop_length (int, optional): the distance between neighboring sliding window + frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) + win_length (int, optional): the size of window frame and STFT filter. + Default: ``None`` (treated as equal to :attr:`n_fft`) + window (Tensor, optional): the optional window function. + Shape must be 1d and `<= n_fft` + Default: ``None`` (treated as window of all :math:`1` s) + center (bool, optional): whether to pad :attr:`input` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (str, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + normalized (bool, optional): controls whether to return the normalized STFT results + Default: ``False`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy for real inputs. + Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise. + return_complex (bool, optional): whether to return a complex tensor, or + a real tensor with an extra last dimension for the real and + imaginary components. + + .. versionchanged:: 2.0 + ``return_complex`` is now a required argument for real inputs, + as the default is being transitioned to ``True``. + + .. deprecated:: 2.0 + ``return_complex=False`` is deprecated, instead use ``return_complex=True`` + Note that calling :func:`torch.view_as_real` on the output will + recover the deprecated output format. + + Returns: + Tensor: A tensor containing the STFT result with shape `(B?, N, T, C?)` where + - `B?` is an optional batch dimension from the input. + - `N` is the number of frequency samples, `(n_fft // 2) + 1` for + `onesided=True`, or otherwise `n_fft`. + - `T` is the number of frames, `1 + L // hop_length` + for `center=True`, or `1 + (L - n_fft) // hop_length` otherwise. + - `C?` is an optional length-2 dimension of real and imaginary + components, present when `return_complex=False`. + + """ + if has_torch_function_unary(input): + return handle_torch_function( + stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, + window=window, center=center, pad_mode=pad_mode, normalized=normalized, + onesided=onesided, return_complex=return_complex) + # NOTE: Do not edit. This code will be removed once the forward-compatibility + # period is over for PR #73432 + if center: + signal_dim = input.dim() + extended_shape = [1] * (3 - signal_dim) + list(input.size()) + pad = int(n_fft // 2) + input = F.pad(input.view(extended_shape), [pad, pad], pad_mode) + input = input.view(input.shape[-signal_dim:]) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] + normalized, onesided, return_complex) + + +istft = _add_docstr( + torch.istft, + "istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, " + "normalized=False, onesided=None, length=None, return_complex=False) -> Tensor:\n" + r""" +Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. + +.. warning:: + From version 2.1, a warning will be provided if a :attr:`window` is + not specified. In a future release, this attribute will be required. + Please provide the same window used in the stft call. + +It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the +least squares estimation of the original signal. The algorithm will check using the NOLA condition ( +nonzero overlap). + +Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelope +created by the summation of all the windows is never zero at certain point in time. Specifically, +:math:`\sum_{t=-\infty}^{\infty} |w|^2[n-t\times hop\_length] \cancel{=} 0`. + +Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, +``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False +since the signal isn't padded). If `length` is given in the arguments and is longer than expected, +``istft`` will pad zeros to the end of the returned signal. + +If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. +Left padding can be trimmed off exactly because they can be calculated but right padding cannot be +calculated without additional information. + +Example: Suppose the last window is: +``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` + +The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation +of right padding. These additional values could be zeros or a reflection of the signal so providing +:attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed +(some loss of signal). + +[1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," +IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. + +Args: + input (Tensor): The input tensor. Expected to be in the format of :func:`~torch.stft`, + output. That is a complex tensor of shape `(B?, N, T)` where + + - `B?` is an optional batch dimension + - `N` is the number of frequency samples, `(n_fft // 2) + 1` + for onesided input, or otherwise `n_fft`. + - `T` is the number of frames, `1 + length // hop_length` for centered stft, + or `1 + (length - n_fft) // hop_length` otherwise. + + .. versionchanged:: 2.0 + Real datatype inputs are no longer supported. Input must now have a + complex datatype, as returned by ``stft(..., return_complex=True)``. + n_fft (int): Size of Fourier transform + hop_length (Optional[int]): The distance between neighboring sliding window frames. + (Default: ``n_fft // 4``) + win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) + window (Optional[torch.Tensor]): The optional window function. + Shape must be 1d and `<= n_fft` + (Default: ``torch.ones(win_length)``) + center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is + centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + normalized (bool): Whether the STFT was normalized. (Default: ``False``) + onesided (Optional[bool]): Whether the STFT was onesided. + (Default: ``True`` if `n_fft != fft_size` in the input size) + length (Optional[int]): The amount to trim the signal by (i.e. the + original signal length). Defaults to `(T - 1) * hop_length` for + centered stft, or `n_fft + (T - 1) * hop_length` otherwise, where `T` + is the number of input frames. + return_complex (Optional[bool]): + Whether the output should be complex, or if the input should be + assumed to derive from a real signal and window. + Note that this is incompatible with ``onesided=True``. + (Default: ``False``) + +Returns: + Tensor: Least squares estimation of the original signal of shape `(B?, length)` where + `B?` is an optional batch dimension from the input tensor. +""") + + +if TYPE_CHECKING: + # These _impl functions return a variable number of tensors as output with + # __torch_function__; tuple unpacking is done already rather than being + # done by the caller of the _impl function + _unique_impl_out = Any +else: + _unique_impl_out = Tuple[Tensor, Tensor, Tensor] + + +def _unique_impl(input: Tensor, sorted: bool = True, + return_inverse: bool = False, return_counts: bool = False, + dim: Optional[int] = None) -> _unique_impl_out: + r"""unique(input, sorted=True, return_inverse=False, return_counts=False, dim=None) -> Tuple[Tensor, Tensor, Tensor] + + Returns the unique elements of the input tensor. + + .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that + this function also eliminates non-consecutive duplicate values. + + .. note:: Currently in the CUDA implementation and the CPU implementation, + `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. + Sorting could be slow, so if your input tensor is already sorted, it is recommended to use + :func:`torch.unique_consecutive` which avoids the sorting. + + Args: + input (Tensor): the input tensor + sorted (bool): Whether to sort the unique elements in ascending order + before returning as output. + return_inverse (bool): Whether to also return the indices for where + elements in the original input ended up in the returned unique list. + return_counts (bool): Whether to also return the counts for each unique + element. + dim (int, optional): the dimension to operate upon. If ``None``, the + unique of the flattened input is returned. Otherwise, each of the + tensors indexed by the given dimension is treated as one of the + elements to apply the unique operation upon. See examples for more + details. Default: ``None`` + + Returns: + (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing + + - **output** (*Tensor*): the output list of unique scalar elements. + - **inverse_indices** (*Tensor*): (optional) if + :attr:`return_inverse` is True, there will be an additional + returned tensor (same shape as input) representing the indices + for where elements in the original input map to in the output; + otherwise, this function will only return a single tensor. + - **counts** (*Tensor*): (optional) if + :attr:`return_counts` is True, there will be an additional + returned tensor (same shape as output or output.size(dim), + if dim was specified) representing the number of occurrences + for each unique value or tensor. + + Example:: + + >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) + >>> output + tensor([1, 2, 3]) + + >>> output, inverse_indices = torch.unique( + ... torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) + >>> output + tensor([1, 2, 3]) + >>> inverse_indices + tensor([0, 2, 1, 2]) + + >>> output, inverse_indices = torch.unique( + ... torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) + >>> output + tensor([1, 2, 3]) + >>> inverse_indices + tensor([[0, 2], + [1, 2]]) + + >>> a = torch.tensor([ + ... [ + ... [1, 1, 0, 0], + ... [1, 1, 0, 0], + ... [0, 0, 1, 1], + ... ], + ... [ + ... [0, 0, 1, 1], + ... [0, 0, 1, 1], + ... [1, 1, 1, 1], + ... ], + ... [ + ... [1, 1, 0, 0], + ... [1, 1, 0, 0], + ... [0, 0, 1, 1], + ... ], + ... ]) + + >>> # If we call `torch.unique(a, dim=0)`, each of the tensors `a[idx, :, :]` + >>> # will be compared. We can see that `a[0, :, :]` and `a[2, :, :]` match + >>> # each other, so one of them will be removed. + >>> (a[0, :, :] == a[2, :, :]).all() + tensor(True) + >>> a_unique_dim0 = torch.unique(a, dim=0) + >>> a_unique_dim0 + tensor([[[0, 0, 1, 1], + [0, 0, 1, 1], + [1, 1, 1, 1]], + [[1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 1, 1]]]) + + >>> # Notice which sub-tensors from `a` match with the sub-tensors from + >>> # `a_unique_dim0`: + >>> (a_unique_dim0[0, :, :] == a[1, :, :]).all() + tensor(True) + >>> (a_unique_dim0[1, :, :] == a[0, :, :]).all() + tensor(True) + + >>> # For `torch.unique(a, dim=1)`, each of the tensors `a[:, idx, :]` are + >>> # compared. `a[:, 0, :]` and `a[:, 1, :]` match each other, so one of + >>> # them will be removed. + >>> (a[:, 0, :] == a[:, 1, :]).all() + tensor(True) + >>> torch.unique(a, dim=1) + tensor([[[0, 0, 1, 1], + [1, 1, 0, 0]], + [[1, 1, 1, 1], + [0, 0, 1, 1]], + [[0, 0, 1, 1], + [1, 1, 0, 0]]]) + + >>> # For `torch.unique(a, dim=2)`, the tensors `a[:, :, idx]` are compared. + >>> # `a[:, :, 0]` and `a[:, :, 1]` match each other. Also, `a[:, :, 2]` and + >>> # `a[:, :, 3]` match each other as well. So in this case, two of the + >>> # sub-tensors will be removed. + >>> (a[:, :, 0] == a[:, :, 1]).all() + tensor(True) + >>> (a[:, :, 2] == a[:, :, 3]).all() + tensor(True) + >>> torch.unique(a, dim=2) + tensor([[[0, 1], + [0, 1], + [1, 0]], + [[1, 0], + [1, 0], + [1, 1]], + [[0, 1], + [0, 1], + [1, 0]]]) + """ + if has_torch_function_unary(input): + return handle_torch_function( + unique, (input,), input, sorted=sorted, return_inverse=return_inverse, + return_counts=return_counts, dim=dim) + + if dim is not None: + output, inverse_indices, counts = _VF.unique_dim( + input, + dim, + sorted=sorted, + return_inverse=return_inverse, + return_counts=return_counts, + ) + else: + output, inverse_indices, counts = torch._unique2( + input, + sorted=sorted, + return_inverse=return_inverse, + return_counts=return_counts, + ) + return output, inverse_indices, counts + + +def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False, + return_counts: bool = False, + dim: Optional[int] = None) -> _unique_impl_out: + r"""Eliminates all but the first element from every consecutive group of equivalent elements. + + .. note:: This function is different from :func:`torch.unique` in the sense that this function + only eliminates consecutive duplicate values. This semantics is similar to `std::unique` + in C++. + + Args: + input (Tensor): the input tensor + return_inverse (bool): Whether to also return the indices for where + elements in the original input ended up in the returned unique list. + return_counts (bool): Whether to also return the counts for each unique + element. + dim (int): the dimension to apply unique. If ``None``, the unique of the + flattened input is returned. default: ``None`` + + Returns: + (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing + + - **output** (*Tensor*): the output list of unique scalar elements. + - **inverse_indices** (*Tensor*): (optional) if + :attr:`return_inverse` is True, there will be an additional + returned tensor (same shape as input) representing the indices + for where elements in the original input map to in the output; + otherwise, this function will only return a single tensor. + - **counts** (*Tensor*): (optional) if + :attr:`return_counts` is True, there will be an additional + returned tensor (same shape as output or output.size(dim), + if dim was specified) representing the number of occurrences + for each unique value or tensor. + + Example:: + + >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) + >>> output = torch.unique_consecutive(x) + >>> output + tensor([1, 2, 3, 1, 2]) + + >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) + >>> output + tensor([1, 2, 3, 1, 2]) + >>> inverse_indices + tensor([0, 0, 1, 1, 2, 3, 3, 4]) + + >>> output, counts = torch.unique_consecutive(x, return_counts=True) + >>> output + tensor([1, 2, 3, 1, 2]) + >>> counts + tensor([2, 2, 1, 2, 1]) + """ + if has_torch_function_unary(input): + return handle_torch_function( + unique_consecutive, (input,), input, return_inverse=return_inverse, + return_counts=return_counts, dim=dim) + output, inverse_indices, counts = _VF.unique_consecutive( # type: ignore[attr-defined] + input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) + return output, inverse_indices, counts + + +def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): + # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] + + if has_torch_function_unary(input): + return _unique_impl(input, sorted, return_inverse, return_counts, dim) + + output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) + return output, counts + + +def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): + # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor + + if has_torch_function_unary(input): + return _unique_impl(input, sorted, return_inverse, return_counts, dim) + + output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) + return output + + +def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): + # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] + + if has_torch_function_unary(input): + return _unique_impl(input, sorted, return_inverse, return_counts, dim) + + output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) + return output, inverse_indices + + +_return_inverse_false = boolean_dispatch( + arg_name='return_counts', + arg_index=3, + default=False, + if_true=_return_counts, + if_false=_return_output, + module_name=__name__, + func_name='unique') + +_return_inverse_true = boolean_dispatch( + arg_name='return_counts', + arg_index=3, + default=False, + if_true=_unique_impl, + if_false=_return_inverse, + module_name=__name__, + func_name='unique') + +# The return type of unique depends on `return_inverse`, and `return_counts` so in order to +# resolve the output type in TorchScript we need to statically know the value of both parameters + +unique = boolean_dispatch( + arg_name='return_inverse', + arg_index=2, + default=False, + if_true=_return_inverse_true, + if_false=_return_inverse_false, + module_name=__name__, + func_name='unique') +unique.__doc__ = _unique_impl.__doc__ + + +def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): + # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] + + if has_torch_function_unary(input): + return _unique_consecutive_impl(input, return_inverse, return_counts, dim) + + output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) + return output, counts + + +def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): + # type: (Tensor, bool, bool, Optional[int]) -> Tensor + + if has_torch_function_unary(input): + return _unique_consecutive_impl(input, return_inverse, return_counts, dim) + + output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) + return output + + +def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): + # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] + + if has_torch_function_unary(input): + return _unique_consecutive_impl(input, return_inverse, return_counts, dim) + + output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) + return output, inverse_indices + + +_consecutive_return_inverse_false = boolean_dispatch( + arg_name='return_counts', + arg_index=1, + default=False, + if_true=_consecutive_return_counts, + if_false=_consecutive_return_output, + module_name=__name__, + func_name='unique_consecutive') + +_consecutive_return_inverse_true = boolean_dispatch( + arg_name='return_counts', + arg_index=1, + default=False, + if_true=_unique_consecutive_impl, + if_false=_consecutive_return_inverse, + module_name=__name__, + func_name='unique_consecutive') + +# The return type of unique depends on `return_inverse`, and `return_counts` so in order to +# resolve the output type in TorchScript we need to statically know the value of both parameters + +unique_consecutive = boolean_dispatch( + arg_name='return_inverse', + arg_index=2, + default=False, + if_true=_consecutive_return_inverse_true, + if_false=_consecutive_return_inverse_false, + module_name=__name__, + func_name='unique_consecutive') +unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ + +if TYPE_CHECKING: + pass + # There's no good way to use this type annotation without breaking JIT + # overloads. So leave untyped for mypy for now. +else: + @overload + def tensordot(a, b, dims: int = 2, out: Optional[torch.Tensor] = None): + pass + + @overload # noqa: F811 + def tensordot(a, b, dims: Tuple[List[int], List[int]], out: Optional[torch.Tensor] = None): # noqa: F811 + pass + + @overload # noqa: F811 + def tensordot(a, b, dims: List[List[int]], out: Optional[torch.Tensor] = None): # noqa: F811 + pass + + @overload # noqa: F811 + def tensordot(a, b, dims: torch.Tensor, out: Optional[torch.Tensor] = None): # noqa: F811 + pass + + +def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None): # noqa: F811 + r"""Returns a contraction of a and b over multiple dimensions. + + :attr:`tensordot` implements a generalized matrix product. + + Args: + a (Tensor): Left tensor to contract + b (Tensor): Right tensor to contract + dims (int or Tuple[List[int], List[int]] or List[List[int]] containing two lists or Tensor): number of dimensions to + contract or explicit lists of dimensions for :attr:`a` and + :attr:`b` respectively + + When called with a non-negative integer argument :attr:`dims` = :math:`d`, and + the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, + respectively, :func:`~torch.tensordot` computes + + .. math:: + r_{i_0,...,i_{m-d}, i_d,...,i_n} + = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. + + When called with :attr:`dims` of the list form, the given dimensions will be contracted + in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes + in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted + dimensions. + + Examples:: + + >>> a = torch.arange(60.).reshape(3, 4, 5) + >>> b = torch.arange(24.).reshape(4, 3, 2) + >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) + tensor([[4400., 4730.], + [4532., 4874.], + [4664., 5018.], + [4796., 5162.], + [4928., 5306.]]) + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> a = torch.randn(3, 4, 5, device='cuda') + >>> b = torch.randn(4, 5, 6, device='cuda') + >>> c = torch.tensordot(a, b, dims=2).cpu() + tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], + [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], + [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) + + >>> a = torch.randn(3, 5, 4, 6) + >>> b = torch.randn(6, 4, 5, 3) + >>> torch.tensordot(a, b, dims=([2, 1, 3], [1, 2, 0])) + tensor([[ 7.7193, -2.4867, -10.3204], + [ 1.5513, -14.4737, -6.5113], + [ -0.2850, 4.2573, -3.5997]]) + """ + if has_torch_function_variadic(a, b): + return handle_torch_function(tensordot, (a, b), a, b, dims=dims, out=out) + + if not isinstance(dims, (tuple, list, torch.Tensor, int, torch.SymInt)): + raise RuntimeError("tensordot expects dims to be int or " + + "Tuple[List[int], List[int]] or " + + "List[List[int]] containing two lists, but got " + + f"dims={dims}") + + dims_a: List[int] = [] + dims_b: List[int] = [] + + if isinstance(dims, (tuple, list)): + dims_a, dims_b = dims + + if isinstance(dims, torch.Tensor): + num_elements = dims.numel() + if num_elements > 1: + assert dims.size()[0] == 2 + dims_a = torch.jit.annotate(List[int], dims[0].tolist()) + dims_b = torch.jit.annotate(List[int], dims[1].tolist()) + else: + dims_val = int(dims.item()) + if dims_val < 0: + raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}") + dims_a = list(range(-dims_val, 0)) + dims_b = list(range(dims_val)) + + if isinstance(dims, (int, torch.SymInt)): + if dims < 0: + raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}") + if dims > min(a.dim(), b.dim()): + raise RuntimeError(f"tensordot expects dims < ndim_a or ndim_b, but got dims={dims}") + dims_a = list(range(-dims, 0)) + dims_b = list(range(dims)) + + if out is None: + return _VF.tensordot(a, b, dims_a, dims_b) # type: ignore[attr-defined] + else: + return _VF.tensordot(a, b, dims_a, dims_b, out=out) # type: ignore[attr-defined] + + +def cartesian_prod(*tensors: Tensor) -> Tensor: + """Do cartesian product of the given sequence of tensors. The behavior is similar to + python's `itertools.product`. + + Args: + *tensors: any number of 1 dimensional tensors. + + Returns: + Tensor: A tensor equivalent to converting all the input tensors into lists, + do `itertools.product` on these lists, and finally convert the resulting list + into tensor. + + Example:: + + >>> import itertools + >>> a = [1, 2, 3] + >>> b = [4, 5] + >>> list(itertools.product(a, b)) + [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] + >>> tensor_a = torch.tensor(a) + >>> tensor_b = torch.tensor(b) + >>> torch.cartesian_prod(tensor_a, tensor_b) + tensor([[1, 4], + [1, 5], + [2, 4], + [2, 5], + [3, 4], + [3, 5]]) + """ + # This wrapper exists to support variadic args. + if has_torch_function(tensors): + return handle_torch_function(cartesian_prod, tensors, *tensors) + return _VF.cartesian_prod(tensors) # type: ignore[attr-defined] + + +def block_diag(*tensors): + """Create a block diagonal matrix from provided tensors. + + Args: + *tensors: One or more tensors with 0, 1, or 2 dimensions. + + Returns: + Tensor: A 2 dimensional tensor with all the input tensors arranged in + order such that their upper left and lower right corners are + diagonally adjacent. All other elements are set to 0. + + Example:: + + >>> import torch + >>> A = torch.tensor([[0, 1], [1, 0]]) + >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) + >>> C = torch.tensor(7) + >>> D = torch.tensor([1, 2, 3]) + >>> E = torch.tensor([[4], [5], [6]]) + >>> torch.block_diag(A, B, C, D, E) + tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], + [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) + """ + # This wrapper exists to support variadic args. + if has_torch_function(tensors): + return handle_torch_function(block_diag, tensors, *tensors) + return torch._C._VariableFunctions.block_diag(tensors) # type: ignore[attr-defined] + + +def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): + # type: (Tensor, Tensor, float, str) -> (Tensor) + r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. + + Args: + x1 (Tensor): input tensor of shape :math:`B \times P \times M`. + x2 (Tensor): input tensor of shape :math:`B \times R \times M`. + p: p value for the p-norm distance to calculate between each vector pair + :math:`\in [0, \infty]`. + compute_mode: + 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate + euclidean distance (p = 2) if P > 25 or R > 25 + 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate + euclidean distance (p = 2) + 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate + euclidean distance (p = 2) + Default: use_mm_for_euclid_dist_if_necessary. + + If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the + output will have shape :math:`B \times P \times R`. + + This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` + if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to + `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest + scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. + + Example: + + >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) + >>> a + tensor([[ 0.9041, 0.0196], + [-0.3108, -2.4423], + [-0.4821, 1.0590]]) + >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) + >>> b + tensor([[-2.1763, -0.4713], + [-0.6986, 1.3702]]) + >>> torch.cdist(a, b, p=2) + tensor([[3.1193, 2.0959], + [2.7138, 3.8322], + [2.2830, 0.3791]]) + """ + if has_torch_function_variadic(x1, x2): + return handle_torch_function( + cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) + if compute_mode == 'use_mm_for_euclid_dist_if_necessary': + return _VF.cdist(x1, x2, p, None) # type: ignore[attr-defined] + elif compute_mode == 'use_mm_for_euclid_dist': + return _VF.cdist(x1, x2, p, 1) # type: ignore[attr-defined] + elif compute_mode == 'donot_use_mm_for_euclid_dist': + return _VF.cdist(x1, x2, p, 2) # type: ignore[attr-defined] + else: + raise ValueError(f"{compute_mode} is not a valid value for compute_mode") + + +def atleast_1d(*tensors): + r""" + Returns a 1-dimensional view of each input tensor with zero dimensions. + Input tensors with one or more dimensions are returned as-is. + + Args: + input (Tensor or list of Tensors) + + Returns: + output (Tensor or tuple of Tensors) + + Example:: + + >>> x = torch.arange(2) + >>> x + tensor([0, 1]) + >>> torch.atleast_1d(x) + tensor([0, 1]) + >>> x = torch.tensor(1.) + >>> x + tensor(1.) + >>> torch.atleast_1d(x) + tensor([1.]) + >>> x = torch.tensor(0.5) + >>> y = torch.tensor(1.) + >>> torch.atleast_1d((x, y)) + (tensor([0.5000]), tensor([1.])) + """ + # This wrapper exists to support variadic args. + if has_torch_function(tensors): + return handle_torch_function(atleast_1d, tensors, *tensors) + if len(tensors) == 1: + tensors = tensors[0] + return _VF.atleast_1d(tensors) # type: ignore[attr-defined] + + +def atleast_2d(*tensors): + r""" + Returns a 2-dimensional view of each input tensor with zero dimensions. + Input tensors with two or more dimensions are returned as-is. + + Args: + input (Tensor or list of Tensors) + + Returns: + output (Tensor or tuple of Tensors) + + Example:: + + >>> x = torch.tensor(1.) + >>> x + tensor(1.) + >>> torch.atleast_2d(x) + tensor([[1.]]) + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.atleast_2d(x) + tensor([[0, 1], + [2, 3]]) + >>> x = torch.tensor(0.5) + >>> y = torch.tensor(1.) + >>> torch.atleast_2d((x, y)) + (tensor([[0.5000]]), tensor([[1.]])) + """ + # This wrapper exists to support variadic args. + if has_torch_function(tensors): + return handle_torch_function(atleast_2d, tensors, *tensors) + if len(tensors) == 1: + tensors = tensors[0] + return _VF.atleast_2d(tensors) # type: ignore[attr-defined] + + +def atleast_3d(*tensors): + r""" + Returns a 3-dimensional view of each input tensor with zero dimensions. + Input tensors with three or more dimensions are returned as-is. + + Args: + input (Tensor or list of Tensors) + + Returns: + output (Tensor or tuple of Tensors) + + Example: + + >>> x = torch.tensor(0.5) + >>> x + tensor(0.5000) + >>> torch.atleast_3d(x) + tensor([[[0.5000]]]) + >>> y = torch.arange(4).view(2, 2) + >>> y + tensor([[0, 1], + [2, 3]]) + >>> torch.atleast_3d(y) + tensor([[[0], + [1]], + + [[2], + [3]]]) + >>> x = torch.tensor(1).view(1, 1, 1) + >>> x + tensor([[[1]]]) + >>> torch.atleast_3d(x) + tensor([[[1]]]) + >>> x = torch.tensor(0.5) + >>> y = torch.tensor(1.) + >>> torch.atleast_3d((x, y)) + (tensor([[[0.5000]]]), tensor([[[1.]]])) + """ + # This wrapper exists to support variadic args. + if has_torch_function(tensors): + return handle_torch_function(atleast_3d, tensors, *tensors) + if len(tensors) == 1: + tensors = tensors[0] + return _VF.atleast_3d(tensors) # type: ignore[attr-defined] + + +if TYPE_CHECKING: + pass + # There's no good way to use this type annotation; cannot rename norm() to + # _norm_impl() in a way that doesn't break JIT overloads. So leave untyped + # for mypy for now. + # def norm(input: Tensor, + # p: Optional[Union[str, Number]] = "fro", + # dim: Optional[Union[int, List[int]]] = None, + # keepdim: bool = False, + # out: Optional[Tensor] = None, + # dtype: _dtype = None) -> Tensor: + # return _norm_impl(input, p, dim, keepdim, out, dtype) +else: + # TODO: type dim as BroadcastingList when + # https://github.com/pytorch/pytorch/issues/33782 is fixed + @overload + def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): + # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor + pass + + @overload # noqa: F811 + def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: F811 + # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor + pass + + @overload # noqa: F811 + def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: F811 + # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor + pass + + @overload # noqa: F811 + def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: F811 + # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor + pass + + +def norm(input, p: Optional[Union[float, str]] = "fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: F811 + r"""Returns the matrix norm or vector norm of a given tensor. + + .. warning:: + + torch.norm is deprecated and may be removed in a future PyTorch release. + Its documentation and behavior may be incorrect, and it is no longer + actively maintained. + + Use :func:`torch.linalg.vector_norm` when computing vector norms and + :func:`torch.linalg.matrix_norm` when computing matrix norms. + For a function with a similar behavior as this one see :func:`torch.linalg.norm`. + Note, however, the signature for these functions is slightly different than the + signature for ``torch.norm``. + + Args: + input (Tensor): The input tensor. Its data type must be either a floating + point or complex type. For complex inputs, the norm is calculated using the + absolute value of each element. If the input is complex and neither + :attr:`dtype` nor :attr:`out` is specified, the result's data type will + be the corresponding floating point type (e.g. float if :attr:`input` is + complexfloat). + + p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. Default: ``'fro'`` + The following norms can be calculated: + + ====== ============== ========================== + ord matrix norm vector norm + ====== ============== ========================== + 'fro' Frobenius norm -- + 'nuc' nuclear norm -- + Number -- sum(abs(x)**ord)**(1./ord) + ====== ============== ========================== + + The vector norm can be calculated across any number of dimensions. + The corresponding dimensions of :attr:`input` are flattened into + one dimension, and the norm is calculated on the flattened + dimension. + + Frobenius norm produces the same result as ``p=2`` in all cases + except when :attr:`dim` is a list of three or more dims, in which + case Frobenius norm throws an error. + + Nuclear norm can only be calculated across exactly two dimensions. + + dim (int, tuple of ints, list of ints, optional): + Specifies which dimension or dimensions of :attr:`input` to + calculate the norm across. If :attr:`dim` is ``None``, the norm will + be calculated across all dimensions of :attr:`input`. If the norm + type indicated by :attr:`p` does not support the specified number of + dimensions, an error will occur. + keepdim (bool, optional): whether the output tensors have :attr:`dim` + retained or not. Ignored if :attr:`dim` = ``None`` and + :attr:`out` = ``None``. Default: ``False`` + out (Tensor, optional): the output tensor. Ignored if + :attr:`dim` = ``None`` and :attr:`out` = ``None``. + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. If specified, the input tensor is casted to + :attr:`dtype` while performing the operation. Default: None. + + .. note:: + Even though ``p='fro'`` supports any number of dimensions, the true + mathematical definition of Frobenius norm only applies to tensors with + exactly two dimensions. :func:`torch.linalg.matrix_norm` with ``ord='fro'`` + aligns with the mathematical definition, since it can only be applied across + exactly two dimensions. + + Example:: + + >>> import torch + >>> a = torch.arange(9, dtype= torch.float) - 4 + >>> b = a.reshape((3, 3)) + >>> torch.norm(a) + tensor(7.7460) + >>> torch.norm(b) + tensor(7.7460) + >>> torch.norm(a, float('inf')) + tensor(4.) + >>> torch.norm(b, float('inf')) + tensor(4.) + >>> c = torch.tensor([[ 1, 2, 3], [-1, 1, 4]] , dtype=torch.float) + >>> torch.norm(c, dim=0) + tensor([1.4142, 2.2361, 5.0000]) + >>> torch.norm(c, dim=1) + tensor([3.7417, 4.2426]) + >>> torch.norm(c, p=1, dim=1) + tensor([6., 6.]) + >>> d = torch.arange(8, dtype=torch.float).reshape(2, 2, 2) + >>> torch.norm(d, dim=(1, 2)) + tensor([ 3.7417, 11.2250]) + >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) + (tensor(3.7417), tensor(11.2250)) + """ + + if has_torch_function_unary(input): + return handle_torch_function( + norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) + + # NB. All the repeated code and weird python is to please TorchScript. + # For a more compact implementation see the relevant function in `_refs/__init__.py` + + # We don't do this for MPS or sparse tensors + if input.layout == torch.strided and input.device.type in \ + ("cpu", "cuda", "meta", torch.utils.backend_registration._privateuse1_backend_name): + if dim is not None: + if isinstance(dim, (int, torch.SymInt)): + _dim = [dim] + else: + _dim = dim + else: + _dim = None # type: ignore[assignment] + + if isinstance(p, str): + if p == "fro" and (dim is None or isinstance(dim, (int, torch.SymInt)) or len(dim) <= 2): + if out is None: + return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype) + else: + return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype, out=out) + + # Here we either call the nuclear norm, or we call matrix_norm with some arguments + # that will throw an error + if _dim is None: + _dim = list(range(input.ndim)) + if out is None: + return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype) + else: + return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype, out=out) + else: + # NB. p should be Union[str, number], not Optional! + _p = 2.0 if p is None else p + if out is None: + return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype) + else: + return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype, out=out) + + ndim = input.dim() + + # catch default case + if dim is None and out is None and dtype is None and p is not None: + if isinstance(p, str): + if p == "fro": + return _VF.frobenius_norm(input, dim=(), keepdim=keepdim) + if not isinstance(p, str): + _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) + return _VF.norm(input, p, dim=_dim, keepdim=keepdim) # type: ignore[attr-defined] + + # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed + # remove the overloads where dim is an int and replace with BraodcastingList1 + # and remove next four lines, replace _dim with dim + if dim is not None: + if isinstance(dim, (int, torch.SymInt)): + _dim = [dim] + else: + _dim = dim + else: + _dim = None # type: ignore[assignment] + + if isinstance(p, str): + if p == "fro": + if dtype is not None: + raise ValueError("dtype argument is not supported in frobenius norm") + + if _dim is None: + _dim = list(range(ndim)) + if out is None: + return _VF.frobenius_norm(input, _dim, keepdim=keepdim) # type: ignore[arg-type] + else: + return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore[arg-type] + elif p == "nuc": + if dtype is not None: + raise ValueError("dtype argument is not supported in nuclear norm") + if _dim is None: + if out is None: + return _VF.nuclear_norm(input, keepdim=keepdim) # type: ignore[arg-type] + else: + return _VF.nuclear_norm(input, keepdim=keepdim, out=out) # type: ignore[arg-type] + else: + if out is None: + return _VF.nuclear_norm(input, _dim, keepdim=keepdim) # type: ignore[arg-type] + else: + return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore[arg-type] + raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}") + else: + if _dim is None: + _dim = list(range(ndim)) + + if out is None: + if dtype is None: + return _VF.norm(input, p, _dim, keepdim=keepdim) # type: ignore[attr-defined] + else: + return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) # type: ignore[attr-defined] + else: + if dtype is None: + return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) # type: ignore[attr-defined] + else: + return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) # type: ignore[attr-defined] + +def unravel_index(indices: Tensor, shape: Union[int, Sequence[int], torch.Size]) -> Tuple[Tensor, ...]: + r"""Converts a tensor of flat indices into a tuple of coordinate tensors that + index into an arbitrary tensor of the specified shape. + + Args: + indices (Tensor): An integer tensor containing indices into the + flattened version of an arbitrary tensor of shape :attr:`shape`. + All elements must be in the range ``[0, prod(shape) - 1]``. + + shape (int, sequence of ints, or torch.Size): The shape of the arbitrary + tensor. All elements must be non-negative. + + Returns: + tuple of Tensors: Each ``i``-th tensor in the output corresponds with + dimension ``i`` of :attr:`shape`. Each tensor has the same shape as + ``indices`` and contains one index into dimension ``i`` for each of the + flat indices given by ``indices``. + + Example:: + + >>> import torch + >>> torch.unravel_index(torch.tensor(4), (3, 2)) + (tensor(2), + tensor(0)) + + >>> torch.unravel_index(torch.tensor([4, 1]), (3, 2)) + (tensor([2, 0]), + tensor([0, 1])) + + >>> torch.unravel_index(torch.tensor([0, 1, 2, 3, 4, 5]), (3, 2)) + (tensor([0, 0, 1, 1, 2, 2]), + tensor([0, 1, 0, 1, 0, 1])) + + >>> torch.unravel_index(torch.tensor([1234, 5678]), (10, 10, 10, 10)) + (tensor([1, 5]), + tensor([2, 6]), + tensor([3, 7]), + tensor([4, 8])) + + >>> torch.unravel_index(torch.tensor([[1234], [5678]]), (10, 10, 10, 10)) + (tensor([[1], [5]]), + tensor([[2], [6]]), + tensor([[3], [7]]), + tensor([[4], [8]])) + + >>> torch.unravel_index(torch.tensor([[1234], [5678]]), (100, 100)) + (tensor([[12], [56]]), + tensor([[34], [78]])) + """ + if has_torch_function_unary(indices): + return handle_torch_function( + unravel_index, (indices,), indices, shape=shape) + res_tensor = _unravel_index(indices, shape) + return res_tensor.unbind(-1) + +def _unravel_index(indices: Tensor, shape: Union[int, Sequence[int]]) -> Tensor: + torch._check_type( + not indices.is_complex() and not indices.is_floating_point() and not indices.dtype == torch.bool, + lambda: f"expected 'indices' to be integer dtype, but got {indices.dtype}") + + torch._check_type( + isinstance(shape, (int, torch.SymInt, Sequence)), + lambda: f"expected 'shape' to be int or sequence of ints, but got {type(shape)}") + + if isinstance(shape, (int, torch.SymInt)): + shape = torch.Size([shape]) + else: + for dim in shape: + torch._check_type( + isinstance(dim, (int, torch.SymInt)), + lambda: f"expected 'shape' sequence to only contain ints, but got {type(dim)}") + shape = torch.Size(shape) + + torch._check_value( + all(dim >= 0 for dim in shape), + lambda: f"'shape' cannot have negative values, but got {tuple(shape)}") + + coefs = list(reversed(list(itertools.accumulate(reversed(shape[1:] + torch.Size([1])), func=operator.mul)))) + return indices.unsqueeze(-1).floor_divide( + torch.tensor(coefs, device=indices.device, dtype=torch.int64) + ) % torch.tensor(shape, device=indices.device, dtype=torch.int64) + +def chain_matmul(*matrices, out=None): + r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed + using the matrix chain order algorithm which selects the order in which incurs the lowest cost in terms + of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` + needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. + If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. + + .. warning:: + + :func:`torch.chain_matmul` is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.linalg.multi_dot` instead, which accepts a list of two or more tensors + rather than multiple arguments. + + Args: + matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. + out (Tensor, optional): the output tensor. Ignored if :attr:`out` = ``None``. + + Returns: + Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product + would be of dimensions :math:`p_{1} \times p_{N + 1}`. + + Example:: + + >>> # xdoctest: +SKIP + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> a = torch.randn(3, 4) + >>> b = torch.randn(4, 5) + >>> c = torch.randn(5, 6) + >>> d = torch.randn(6, 7) + >>> # will raise a deprecation warning + >>> torch.chain_matmul(a, b, c, d) + tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], + [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], + [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) + + .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition + """ + # This wrapper exists to support variadic args. + if has_torch_function(matrices): + return handle_torch_function(chain_matmul, matrices, *matrices) + + if out is None: + return _VF.chain_matmul(matrices) # type: ignore[attr-defined] + else: + return _VF.chain_matmul(matrices, out=out) # type: ignore[attr-defined] + + +def _lu_impl(A, pivot=True, get_infos=False, out=None): + # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] + r"""Computes the LU factorization of a matrix or batches of matrices + :attr:`A`. Returns a tuple containing the LU factorization and + pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to + ``True``. + + .. warning:: + + :func:`torch.lu` is deprecated in favor of :func:`torch.linalg.lu_factor` + and :func:`torch.linalg.lu_factor_ex`. :func:`torch.lu` will be removed in a + future PyTorch release. + ``LU, pivots, info = torch.lu(A, compute_pivots)`` should be replaced with + + .. code:: python + + LU, pivots = torch.linalg.lu_factor(A, compute_pivots) + + ``LU, pivots, info = torch.lu(A, compute_pivots, get_infos=True)`` should be replaced with + + .. code:: python + + LU, pivots, info = torch.linalg.lu_factor_ex(A, compute_pivots) + + .. note:: + * The returned permutation matrix for every matrix in the batch is + represented by a 1-indexed vector of size ``min(A.shape[-2], A.shape[-1])``. + ``pivots[i] == j`` represents that in the ``i``-th step of the algorithm, + the ``i``-th row was permuted with the ``j-1``-th row. + * LU factorization with :attr:`pivot` = ``False`` is not available + for CPU, and attempting to do so will throw an error. However, + LU factorization with :attr:`pivot` = ``False`` is available for + CUDA. + * This function does not check if the factorization was successful + or not if :attr:`get_infos` is ``True`` since the status of the + factorization is present in the third element of the return tuple. + * In the case of batches of square matrices with size less or equal + to 32 on a CUDA device, the LU factorization is repeated for + singular matrices due to the bug in the MAGMA library + (see magma issue 13). + * ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. + + .. warning:: + The gradients of this function will only be finite when :attr:`A` is full rank. + This is because the LU decomposition is just differentiable at full rank matrices. + Furthermore, if :attr:`A` is close to not being full rank, + the gradient will be numerically unstable as it depends on the computation of :math:`L^{-1}` and :math:`U^{-1}`. + + Args: + A (Tensor): the tensor to factor of size :math:`(*, m, n)` + pivot (bool, optional): controls whether pivoting is done. Default: ``True`` + get_infos (bool, optional): if set to ``True``, returns an info IntTensor. + Default: ``False`` + out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, + then the elements in the tuple are Tensor, IntTensor, + and IntTensor. If :attr:`get_infos` is ``False``, then the + elements in the tuple are Tensor, IntTensor. Default: ``None`` + + Returns: + (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing + + - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` + + - **pivots** (*IntTensor*): the pivots of size :math:`(*, \text{min}(m, n))`. + ``pivots`` stores all the intermediate transpositions of rows. + The final permutation ``perm`` could be reconstructed by + applying ``swap(perm[i], perm[pivots[i] - 1])`` for ``i = 0, ..., pivots.size(-1) - 1``, + where ``perm`` is initially the identity permutation of :math:`m` elements + (essentially this is what :func:`torch.lu_unpack` is doing). + + - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of + size :math:`(*)` where non-zero values indicate whether factorization for the matrix or + each minibatch has succeeded or failed + + Example:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> A = torch.randn(2, 3, 3) + >>> A_LU, pivots = torch.lu(A) + >>> A_LU + tensor([[[ 1.3506, 2.5558, -0.0816], + [ 0.1684, 1.1551, 0.1940], + [ 0.1193, 0.6189, -0.5497]], + + [[ 0.4526, 1.2526, -0.3285], + [-0.7988, 0.7175, -0.9701], + [ 0.2634, -0.9255, -0.3459]]]) + >>> pivots + tensor([[ 3, 3, 3], + [ 3, 3, 3]], dtype=torch.int32) + >>> A_LU, pivots, info = torch.lu(A, get_infos=True) + >>> if info.nonzero().size(0) == 0: + ... print('LU factorization succeeded for all samples!') + LU factorization succeeded for all samples! + """ + # If get_infos is True, then we don't need to check for errors and vice versa + return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) + +if TYPE_CHECKING: + _ListOrSeq = Sequence[Tensor] +else: + _ListOrSeq = List[Tensor] + + +def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None: + get_infos_int = 1 if get_infos else 0 + if out_len - get_infos_int != 2: + raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}") + if not isinstance(out, (tuple, list)): + raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}") + + +def _lu_with_infos(A, pivot=True, get_infos=False, out=None): + # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] + if has_torch_function_unary(A): + return handle_torch_function( + lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) + result = _lu_impl(A, pivot, get_infos, out) + if out is not None: + _check_list_size(len(out), get_infos, out) + for i in range(len(out)): + out[i].resize_as_(result[i]).copy_(result[i]) + return out + else: + return result # A_LU, pivots, infos + + +def _lu_no_infos(A, pivot=True, get_infos=False, out=None): + # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] + # need to check for torch_function here so that we exit if + if has_torch_function_unary(A): + return handle_torch_function( + lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) + result = _lu_impl(A, pivot, get_infos, out) + if out is not None: + _check_list_size(len(out), get_infos, out) + for i in range(len(out)): + out[i].resize_as_(result[i]).copy_(result[i]) + return out + else: + return result[0], result[1] # A_LU, pivots + +# The return type of lu depends on `get_infos`, so in order to resolve the output type +# of lu in TorchScript we need to statically know the value of `get_infos` +lu = boolean_dispatch( + arg_name='get_infos', + arg_index=2, + default=False, + if_true=_lu_with_infos, + if_false=_lu_no_infos, + module_name=__name__, + func_name='lu') +lu.__doc__ = _lu_impl.__doc__ + + +def align_tensors(*tensors): + raise RuntimeError('`align_tensors` not yet implemented.') diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/library.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/library.py new file mode 100644 index 0000000000000000000000000000000000000000..350aa8995c090ee0af8933d5ddeafc3011e84559 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/library.py @@ -0,0 +1,532 @@ +from ._ops import OpOverload +from typing import Any, Optional, Set, List +import traceback +import torch +import weakref +import functools +import inspect +import re +import contextlib +import sys + +__all__ = [ + 'Library', + 'impl', + 'define', + 'fallthrough_kernel', + 'impl_abstract', + 'get_ctx', +] + +# Set containing the combination of (namespace, operator, DispatchKey) for which a new kernel has been registered +# The keys in the set are of the form `namespace + "/" + op_name + "/" + dispatch_key`. +# This set is maintained to ensure that two libraries don't try to override the exact same functionality to avoid +# libraries calling into kernels not intended to be called. +_impls: Set[str] = set() +_defs: Set[str] = set() + +# prim is reserved by TorchScript interpreter +_reserved_namespaces = ['prim'] + +def fallthrough_kernel(): + """ + A dummy function to pass to ``Library.impl`` in order to register a fallthrough. + """ + raise NotImplementedError("fallthrough_kernel() should never be called.") + +class Library: + """ + A class to create libraries that can be used to register new operators or + override operators in existing libraries from Python. + A user can optionally pass in a dispatch keyname if they only want to register + kernels corresponding to only one specific dispatch key. + + To create a library to override operators in an existing library (with name ns), set the kind to "IMPL". + To create a new library (with name ns) to register new operators, set the kind to "DEF". + To create a fragment of a possibly existing library to register operators (and bypass + the limitation that there is only one library for a given namespace), set the kind to + "FRAGMENT". + + Args: + ns: library name + kind: "DEF", "IMPL" (default: "IMPL"), "FRAGMENT" + dispatch_key: PyTorch dispatch key (default: "") + """ + def __init__(self, ns, kind, dispatch_key=""): + if kind not in ('IMPL', 'DEF', 'FRAGMENT'): + raise ValueError("Unsupported kind: ", kind) + + if ns in _reserved_namespaces and (kind == "DEF" or kind == 'FRAGMENT'): + raise ValueError(ns, " is a reserved namespace. Please try creating a library with another name.") + + frame = traceback.extract_stack(limit=3)[0] + filename, lineno = frame.filename, frame.lineno + self.m: Optional[Any] = torch._C._dispatch_library(kind, ns, dispatch_key, filename, lineno) + self.ns = ns + self._op_defs: Set[str] = set() + self._op_impls: Set[str] = set() + self._registration_handles: List["torch._library.utils.RegistrationHandle"] = [] + self.kind = kind + self.dispatch_key = dispatch_key + # Use a finalizer to setup the "destructor" instead of __del__. + # Python __del__ can lead to weird things (globals and locals may already + # be gone when __del__ actually gets called!). finalizers help the + # situation because it lets us capture references and keeps them alive + weakref.finalize(self, _del_library, _impls, self._op_impls, _defs, self._op_defs, self._registration_handles) + + def __repr__(self): + return f"Library(kind={self.kind}, ns={self.ns}, dispatch_key={self.dispatch_key})>" + + def define(self, schema, alias_analysis="", *, tags=()): + r'''Defines a new operator and its semantics in the ns namespace. + + Args: + schema: function schema to define a new operator. + alias_analysis (optional): Indicates if the aliasing properties of the operator arguments can be + inferred from the schema (default behavior) or not ("CONSERVATIVE"). + tags (Tag | Sequence[Tag]): one or more torch.Tag to apply to this + operator. Tagging an operator changes the operator's behavior + under various PyTorch subsystems; please read the docs for the + torch.Tag carefully before applying it. + + Returns: + name of the operator as inferred from the schema. + + Example:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LIBRARY) + >>> my_lib = Library("foo", "DEF") + >>> my_lib.define("sum(Tensor self) -> Tensor") + ''' + # This is added because we also want to disallow PURE_FUNCTION alias analysis which is a valid + # AliasAnalysis type in C++ + if alias_analysis not in ["", "FROM_SCHEMA", "CONSERVATIVE"]: + raise RuntimeError(f"Invalid alias_analysis type {alias_analysis}") + assert self.m is not None + if isinstance(tags, torch.Tag): + tags = (tags,) + result = self.m.define(schema, alias_analysis, tuple(tags)) + qualname = self.ns + "::" + schema.split("(")[0] + self._op_defs.add(qualname) + _defs.add(qualname) + return result + + def impl(self, op_name, fn, dispatch_key=''): + r'''Registers the function implementation for an operator defined in the library. + + Args: + op_name: operator name (along with the overload) or OpOverload object. + fn: function that's the operator implementation for the input dispatch key or :func:`~fallthrough_kernel` + to register a fallthrough. + dispatch_key: dispatch key that the input function should be registered for. By default, it uses + the dispatch key that the library was created with. + + Example:: + >>> my_lib = Library("aten", "IMPL") + >>> def div_cpu(self, other): + >>> return self * (1 / other) + >>> my_lib.impl("div.Tensor", div_cpu, "CPU") + ''' + if not callable(fn): + raise TypeError(f"Input function is required to be a callable but found type {type(fn)}") + if dispatch_key == '': + dispatch_key = self.dispatch_key + + if isinstance(op_name, str): + name = op_name + elif isinstance(op_name, OpOverload): + name = op_name._schema.name + overload_name = op_name._schema.overload_name + if overload_name != '': + name = name + '.' + overload_name + else: + raise RuntimeError("impl should be passed either a name or an OpOverload object as the first argument") + + key = self.ns + "/" + name.split("::")[-1] + "/" + dispatch_key + if key in _impls: + # TODO: in future, add more info about where the existing function is registered (this info is + # today already returned by the C++ warning when impl is called but we error out before that) + raise RuntimeError("This is not allowed since there's already a kernel registered from python overriding {}" + "'s behavior for {} dispatch key and {} namespace.". + format(name.split("::")[-1], dispatch_key, self.ns)) + + if dispatch_key == "Meta": + dispatcher_op_name = name + if '::' not in dispatcher_op_name: + dispatcher_op_name = f'{self.ns}::{dispatcher_op_name}' + + # Internally, we shouldn't be registering meta kernels for any operators that + # have CompositeImplicitAutograd kernels. + # Instead, we should be letting those decompositions run, and writing meta kernels + # only for the base operators. + if torch._C._dispatch_has_kernel_for_dispatch_key(dispatcher_op_name, "CompositeImplicitAutograd"): + raise RuntimeError( + f"We should not register a meta kernel directly to the operator '{name}'," + " because it has a CompositeImplicitAutograd kernel in core." + " Instead we should let the operator decompose, and ensure that we have meta kernels" + " for the base ops that it decomposes into.") + + assert self.m is not None + self.m.impl(name, dispatch_key if dispatch_key != "" else "CompositeImplicitAutograd", fn) + + _impls.add(key) + self._op_impls.add(key) + + def _destroy(self): + if self.m is not None: + self.m.reset() + self.m = None + for handle in self._registration_handles: + handle.destroy() + self._registration_handles.clear() + for name in self._op_defs: + # Delete the cached torch.ops.ns.foo if it was registered. + # Otherwise, accessing it leads to a segfault. + # It's possible that we only registered an overload in this Library + # and another library owns an alive overload. + # That's OK - the next time torch.ops.ns.foo gets called, it'll be + # recomputed to point at the right collection of overloads. + ns, name_with_overload = name.split("::") + name = name_with_overload.split(".")[0] + if not hasattr(torch.ops, ns): + continue + namespace = getattr(torch.ops, ns) + if not hasattr(namespace, name): + continue + delattr(namespace, name) + + +def _del_library(captured_impls, op_impls, captured_defs, op_defs, registration_handles): + captured_impls -= op_impls + captured_defs -= op_defs + for handle in registration_handles: + handle.destroy() + + +@contextlib.contextmanager +def _scoped_library(*args, **kwargs): + try: + lib = Library(*args, **kwargs) + yield lib + finally: + lib._destroy() + + +_keep_alive: List[Library] = [] + + +NAMELESS_SCHEMA = re.compile(r"\(.*\) -> .*") + + +@functools.singledispatch +def define(qualname, schema, *, lib=None, tags=()): + r"""Defines a new operator. + + In PyTorch, defining an op (short for "operator") is a two step-process: + - we need to define the op (by providing an operator name and schema) + - we need to implement behavior for how the operator interacts with + various PyTorch subsystems, like CPU/CUDA Tensors, Autograd, etc. + + This entrypoint defines the custom operator (the first step) + you must then perform the second step by calling various + ``impl_*`` APIs, like :func:`torch.library.impl` or + :func:`torch.library.impl_abstract`. + + Args: + qualname (str): The qualified name for the operator. Should be + a string that looks like "namespace::name", e.g. "aten::sin". + Operators in PyTorch need a namespace to + avoid name collisions; a given operator may only be created once. + If you are writing a Python library, we recommend the namespace to + be the name of your top-level module. + schema (str): The schema of the operator. E.g. "(Tensor x) -> Tensor" + for an op that accepts one Tensor and returns one Tensor. It does + not contain the operator name (that is passed in ``qualname``). + lib (Optional[Library]): If provided, the lifetime of this operator + will be tied to the lifetime of the Library object. + tags (Tag | Sequence[Tag]): one or more torch.Tag to apply to this + operator. Tagging an operator changes the operator's behavior + under various PyTorch subsystems; please read the docs for the + torch.Tag carefully before applying it. + + Example:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LIBRARY) + >>> import torch + >>> import numpy as np + >>> + >>> # Define the operator + >>> torch.library.define("mylib::sin", "(Tensor x) -> Tensor") + >>> + >>> # Add implementations for the operator + >>> @torch.library.impl("mylibrary::sin", "cpu") + >>> def f(x): + >>> return torch.from_numpy(np.sin(x.numpy())) + >>> + >>> # Call the new operator from torch.ops. + >>> x = torch.randn(3) + >>> y = torch.ops.mylib.sin(x) + >>> assert torch.allclose(y, x) + + """ + if not isinstance(qualname, str): + raise ValueError( + f"define(qualname, schema): expected qualname " + f"to be instance of str, got {type(qualname)}") + namespace, name = torch._library.utils.parse_namespace(qualname) + if lib is None: + lib = Library(namespace, "FRAGMENT") + _keep_alive.append(lib) + if not NAMELESS_SCHEMA.fullmatch(schema): + raise ValueError( + f"define(qualname, schema, ...): expected schema " + f"to look like e.g. \"(Tensor x) -> Tensor\" but " + f"got \"{schema}\"") + lib.define(name + schema, alias_analysis="", tags=tags) + + +@define.register +def _(lib: Library, schema, alias_analysis=""): + """The old torch.library.define. + We're keeping this around for BC reasons + """ + def wrap(f): + name = lib.define(schema, alias_analysis) + lib.impl(name, f) + return f + return wrap + + +@functools.singledispatch +def impl(qualname, types, func=None, *, lib=None): + """Register an implementation for a device type for this operator. + + You may pass "default" for ``types`` to register this implementation as the + default implementation for ALL device types. + Please only use this if the implementation truly supports all device types; + for example, this is true if it is a composition of built-in PyTorch operators. + + Some valid types are: "cpu", "cuda", "xla", "mps", "ipu", "xpu". + + Args: + qualname (str): Should be a string that looks like "namespace::operator_name". + types (str | Sequence[str]): The device types to register an impl to. + lib (Optional[Library]): If provided, the lifetime of this registration + will be tied to the lifetime of the Library object. + + Examples: + >>> import torch + >>> import numpy as np + >>> + >>> # Define the operator + >>> torch.library.define("mylibrary::sin", "(Tensor x) -> Tensor") + >>> + >>> # Add implementations for the cpu device + >>> @torch.library.impl("mylibrary::sin", "cpu") + >>> def f(x): + >>> return torch.from_numpy(np.sin(x.numpy())) + >>> + >>> x = torch.randn(3) + >>> y = torch.ops.mylibrary.sin(x) + >>> assert torch.allclose(y, x.sin()) + """ + if isinstance(types, str): + types = (types,) + keys = set({}) + for typ in types: + is_dispatch_key = torch._C._parse_dispatch_key(typ) + if is_dispatch_key: + # We also support passing a DispatchKey to impl. Please prefer using + # the higher-level torch.library APIs and only pass DispatchKey to + # torch.library.impl with caution (or even better, don't use this + # option and file an issue on GitHub for what you need). + # We don't advertise this to users because + # it is very easy to shoot yourself in the foot. + keys.add(typ) + else: + keys.add(_device_type_to_key(typ)) + + def register(func): + namespace, _ = torch._library.utils.parse_namespace(qualname) + if lib is None: + use_lib = Library(namespace, "FRAGMENT") + _keep_alive.append(use_lib) + else: + use_lib = lib + for key in keys: + use_lib.impl(qualname, func, key) + + if func is None: + return register + else: + register(func) + + +def _device_type_to_key(device_type: str) -> str: + if device_type == "default": + # This is technically not correct, because although all device_type + # DispatchKeys are included in CompositeExplicitAutograd, + # not everything in CompositeExplicitAutograd is associated with a + # device_type. I don't really care that much about the difference. + return "CompositeExplicitAutograd" + return torch._C._dispatch_key_for_device(device_type) + + +@impl.register +def _(lib: Library, name, dispatch_key=""): + """Legacy torch.library.impl API. Kept around for BC""" + def wrap(f): + lib.impl(name, f, dispatch_key) + return f + return wrap + + + +def impl_abstract(qualname, func=None, *, lib=None, _stacklevel=1): + r"""Register an abstract implementation for this operator. + + An "abstract implementation" specifies the behavior of this operator on + Tensors that carry no data. Given some input Tensors with certain properties + (sizes/strides/storage_offset/device), it specifies what the properties of + the output Tensors are. + + The abstract implementation has the same signature as the operator. + It is run for both FakeTensors and meta tensors. To write an abstract + implementation, assume that all Tensor inputs to the operator are + regular CPU/CUDA/Meta tensors, but they do not have storage, and + you are trying to return regular CPU/CUDA/Meta tensor(s) as output. + The abstract implementation must consist of only PyTorch operations + (and may not directly access the storage or data of any input or + intermediate Tensors). + + This API may be used as a decorator (see examples). + + For a detailed guide on custom ops, please see + https://docs.google.com/document/d/1W--T6wz8IY8fOI0Vm8BF44PdBgs283QvpelJZWieQWQ/edit + + Examples: + >>> import torch + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> # Example 1: an operator without data-dependent output shape + >>> torch.library.define( + >>> "mylib::custom_linear", + >>> "(Tensor x, Tensor weight, Tensor bias) -> Tensor") + >>> + >>> @torch.library.impl_abstract("mylib::custom_linear") + >>> def custom_linear_abstract(x, weight): + >>> assert x.dim() == 2 + >>> assert weight.dim() == 2 + >>> assert bias.dim() == 1 + >>> assert x.shape[1] == weight.shape[1] + >>> assert weight.shape[0] == bias.shape[0] + >>> assert x.device == weight.device + >>> + >>> return (x @ weight.t()) + bias + >>> + >>> # Example 2: an operator with data-dependent output shape + >>> torch.library.define("mylib::custom_nonzero", "(Tensor x) -> Tensor") + >>> + >>> @torch.library.impl_abstract("mylib::custom_nonzero") + >>> def custom_nonzero_abstract(x): + >>> # Number of nonzero-elements is data-dependent. + >>> # Since we cannot peek at the data in an abstract impl, + >>> # we use the ctx object to construct a new symint that + >>> # represents the data-dependent size. + >>> ctx = torch.library.get_ctx() + >>> nnz = ctx.new_dynamic_size() + >>> shape = [nnz, x.dim()] + >>> result = x.new_empty(shape, dtype=torch.int64) + >>> return result + >>> + >>> @torch.library.impl("mylib::custom_nonzero", "cpu") + >>> def custom_nonzero_cpu(x): + >>> x_np = x.numpy() + >>> res = np.stack(np.nonzero(x_np), axis=1) + >>> return torch.tensor(res, device=x.device) + + """ + source = torch._library.utils.get_source(_stacklevel + 1) + frame = sys._getframe(_stacklevel) + caller_module = inspect.getmodule(frame) + # Can be none if you call impl_abstract from somewhere there isn't a module + # (e.g. __main__) + caller_module_name = None if caller_module is None else caller_module.__name__ + + # TODO(rzou): We're gonna need to stage this change with torchvision, + # since torchvision is github first. + if caller_module_name is not None and caller_module_name.startswith("torchvision."): + caller_module_name = None + + def inner(func): + entry = torch._library.simple_registry.singleton.find(qualname) + if caller_module_name is not None: + func_to_register = _check_pystubs_once(func, qualname, caller_module_name) + else: + func_to_register = func + + handle = entry.abstract_impl.register(func_to_register, source) + if lib is not None: + lib._registration_handles.append(handle) + return func + + if func is None: + return inner + return inner(func) + + +# If the op was defined in C++, then we want to make sure there was an +# m.impl_abstract_pystub(module, ...) call and that the module is the +# same as the module that called torch.library.impl_abstract. +def _check_pystubs_once(func, qualname, actual_module_name): + checked = False + + def inner(*args, **kwargs): + nonlocal checked + if checked: + return func(*args, **kwargs) + + op = torch._library.utils.lookup_op(qualname) + if op._defined_in_python: + checked = True + return func(*args, **kwargs) + + maybe_pystub = torch._C._dispatch_pystub( + op._schema.name, + op._schema.overload_name) + if not maybe_pystub: + namespace = op.namespace + cpp_filename = op._handle().debug() + raise RuntimeError( + f"Operator '{qualname}' was defined in C++ and has a Python " + f"abstract impl. In this situation, we require there to also be a " + f"companion C++ `m.impl_abstract_pystub(\"{actual_module_name}\")` " + f"call, but we could not find one. Please add that to " + f"to the top of the C++ TORCH_LIBRARY({namespace}, ...) block the " + f"operator was registered in ({cpp_filename})") + pystub_module = maybe_pystub[0] + if actual_module_name != pystub_module: + cpp_filename = op._handle().debug() + raise RuntimeError( + f"Operator '{qualname}' specified that its python abstract impl " + f"is in the Python module '{pystub_module}' but it was actually found " + f"in '{actual_module_name}'. Please either move the abstract impl " + f"or correct the m.impl_abstract_pystub call ({cpp_filename})") + checked = True + return func(*args, **kwargs) + return inner + + +# NOTE [ctx inside the fake implementation] +# If a user has an operator with data-dependent output shape, then when writing +# a fake implementation they must query the current ctx and use methods on the +# ctx to construct a new unbacked symint. +# +# This is done via us setting the global_ctx_getter function every time a fake +# implementation is invoked. +def get_ctx() -> "torch._library.abstract_impl.AbstractImplCtx": + """get_ctx() returns the current AbstractImplCtx object. + + Calling ``get_ctx()`` is only valid inside of an abstract impl + (see :func:`torch.library.impl_abstract` for more usage details. + """ + return torch._library.abstract_impl.global_ctx_getter() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/overrides.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/overrides.py new file mode 100644 index 0000000000000000000000000000000000000000..8076802e48a7ca9a11340045ea516f243bc6ad05 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/overrides.py @@ -0,0 +1,1973 @@ +""" +Python implementation of ``__torch_function__`` + +While most of the torch API and handling for ``__torch_function__`` happens +at the C++ level, some of the torch API is written in Python so we need +python-level handling for ``__torch_function__`` overrides as well. The main +developer-facing functionality in this file are handle_torch_function and +has_torch_function. See torch/functional.py and test/test_overrides.py +for usage examples. + +Note +---- +heavily inspired by NumPy's ``__array_function__`` (see: +https://github.com/pytorch/pytorch/issues/24015 and +https://www.numpy.org/neps/nep-0018-array-function-protocol.html +) + +If changing this file in a way that can affect ``__torch_function__`` overhead, +please report the benchmarks in ``benchmarks/overrides_benchmark``. See the +instructions in the ``README.md`` in that directory. +""" + +import __future__ # noqa: F404 + +import collections +import functools +import types +import warnings +from typing import Dict, Set, List, Any, Callable, Iterable, Type, Tuple +from functools import wraps +import contextlib + +import torch +from torch._C import ( + _has_torch_function, _has_torch_function_unary, + _has_torch_function_variadic, _add_docstr, + _push_on_torch_function_stack, _pop_torch_function_stack, _get_function_stack_at, _len_torch_function_stack, + _is_torch_function_mode_enabled) + +__all__ = [ + "get_ignored_functions", + "get_overridable_functions", + "get_testing_overrides", + "handle_torch_function", + "has_torch_function", + "resolve_name", + "is_tensor_like", + "is_tensor_method_or_property", + "wrap_torch_function", + "enable_reentrant_dispatch", +] + + +def _disable_user_warnings( + func: Callable, regex: str = '.*is deprecated, please use.*', module: str = 'torch') -> Callable: + """ + Decorator that temporarily disables ``UserWarning``s for the given ``module`` if the warning message matches the + given ``regex`` pattern. + + Arguments + --------- + func : function + Function to disable the warnings for. + regex : str + A regex pattern compilable by ``re.compile``. This is used to match the ``UserWarning`` message. + module : str + The python module to which the filtering should be restricted. + + Returns + ------- + function + The wrapped function. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, message=regex, module=module) + return func(*args, **kwargs) + return wrapper + + +@functools.lru_cache(None) +@_disable_user_warnings +def get_ignored_functions() -> Set[Callable]: + """ + Return public functions that cannot be overridden by ``__torch_function__``. + + Returns + ------- + Set[Callable] + A tuple of functions that are publicly available in the torch API but cannot + be overridden with ``__torch_function__``. Mostly this is because none of the + arguments of these functions are tensors or tensor-likes. + + Examples + -------- + >>> torch.Tensor.as_subclass in torch.overrides.get_ignored_functions() + True + >>> torch.add in torch.overrides.get_ignored_functions() + False + """ + Tensor = torch.Tensor + return { + torch.typename, + torch.is_tensor, + torch.is_storage, + torch.set_default_tensor_type, + torch.set_default_device, + torch.get_default_device, + torch.set_rng_state, + torch.get_rng_state, + torch.manual_seed, + torch.initial_seed, + torch.seed, + torch.save, + torch.load, + torch.set_printoptions, + torch.fork, + torch.get_default_dtype, + torch.get_num_interop_threads, + torch.get_num_threads, + torch.init_num_threads, + torch.import_ir_module, + torch.import_ir_module_from_buffer, + torch.is_anomaly_enabled, + torch.is_anomaly_check_nan_enabled, + torch.is_grad_enabled, + torch.merge_type_from_type_comment, + torch.parse_ir, + torch.parse_schema, + torch.parse_type_comment, + torch.set_anomaly_enabled, + torch.set_flush_denormal, + torch.set_num_interop_threads, + torch.set_num_threads, + torch.wait, + torch.as_tensor, + torch.from_numpy, + torch.get_device, + torch.tensor, + torch.default_generator, + torch.has_cuda, + torch.has_cudnn, + torch.has_lapack, + torch.device, + torch.dtype, + torch.finfo, + torch.has_mkl, + torch.has_mps, + torch.has_mkldnn, + torch.has_openmp, + torch.iinfo, + torch.memory_format, + torch.qscheme, + torch.set_grad_enabled, + torch.no_grad, + torch.enable_grad, + torch.inference_mode, + torch.is_inference_mode_enabled, + torch.layout, + torch.align_tensors, + torch.arange, + torch.as_strided, + torch.bartlett_window, + torch.blackman_window, + torch.broadcast_shapes, + torch.can_cast, + torch.compile, + torch.cudnn_affine_grid_generator, + torch.cudnn_batch_norm, + torch.cudnn_convolution, + torch.cudnn_convolution_transpose, + torch.cudnn_convolution_relu, + torch.cudnn_convolution_add_relu, + torch.cudnn_grid_sampler, + torch.cudnn_is_acceptable, + torch.empty, + torch.empty_permuted, + torch.empty_strided, + torch.empty_quantized, + torch.export.dynamic_dim, + torch.export.export, + torch.export.load, + torch.export.register_dataclass, + torch.export.save, + torch.eye, + torch.fft.fftfreq, + torch.fft.rfftfreq, + torch.from_file, + torch.full, + torch.fill, + torch.hamming_window, + torch.hann_window, + torch.kaiser_window, + torch.linspace, + torch.logspace, + torch.mkldnn_adaptive_avg_pool2d, + torch.mkldnn_convolution, + torch.mkldnn_max_pool2d, + torch.mkldnn_max_pool3d, + torch.mkldnn_linear_backward_weights, + torch.mkldnn_rnn_layer, + torch.normal, + torch.ones, + torch.promote_types, + torch.rand, + torch.randn, + torch.randint, + torch.randperm, + torch.range, + torch.result_type, + torch.scalar_tensor, + torch.sparse_coo_tensor, + torch.sparse_compressed_tensor, + torch.sparse_csr_tensor, + torch.sparse_csc_tensor, + torch.sparse_bsr_tensor, + torch.sparse_bsc_tensor, + torch.sym_constrain_range, + torch.sym_constrain_range_for_size, + torch.tril_indices, + torch.triu_indices, + torch.vander, + torch.zeros, + torch._jit_internal.boolean_dispatch, + torch.nn.functional.assert_int_or_pair, + torch.nn.functional.upsample, + torch.nn.functional.upsample_bilinear, + torch.nn.functional.upsample_nearest, + torch.nn.functional.has_torch_function, + torch.nn.functional.has_torch_function_unary, + torch.nn.functional.has_torch_function_variadic, + torch.nn.functional.handle_torch_function, + torch.nn.functional.sigmoid, + torch.nn.functional.hardsigmoid, + torch.nn.functional.tanh, + torch.nn.functional._canonical_mask, + torch.nn.functional._none_or_dtype, + # Doesn't actually take or return tensor arguments + torch.nn.init.calculate_gain, + # These are deprecated; don't test them + torch.nn.init.uniform, + torch.nn.init.normal, + torch.nn.init.constant, + torch.nn.init.eye, + torch.nn.init.dirac, + torch.nn.init.xavier_uniform, + torch.nn.init.xavier_normal, + torch.nn.init.kaiming_uniform, + torch.nn.init.kaiming_normal, + torch.nn.init.orthogonal, + torch.nn.init.sparse, + torch.nested.to_padded_tensor, + has_torch_function, + handle_torch_function, + torch.set_autocast_enabled, + torch.is_autocast_enabled, + torch.clear_autocast_cache, + torch.set_autocast_cpu_enabled, + torch.is_autocast_cpu_enabled, + torch.set_autocast_xla_enabled, + torch.is_autocast_xla_enabled, + torch.set_autocast_ipu_enabled, + torch.is_autocast_ipu_enabled, + torch.set_autocast_cpu_dtype, + torch.get_autocast_cpu_dtype, + torch.set_autocast_ipu_dtype, + torch.get_autocast_ipu_dtype, + torch.get_autocast_gpu_dtype, + torch.set_autocast_gpu_dtype, + torch.get_autocast_xla_dtype, + torch.set_autocast_xla_dtype, + torch.autocast_increment_nesting, + torch.autocast_decrement_nesting, + torch.is_autocast_cache_enabled, + torch.set_autocast_cache_enabled, + torch.nn.functional.hardswish, + torch.is_vulkan_available, + torch.are_deterministic_algorithms_enabled, + torch.use_deterministic_algorithms, + torch.is_deterministic_algorithms_warn_only_enabled, + torch.set_deterministic_debug_mode, + torch.get_deterministic_debug_mode, + torch.set_float32_matmul_precision, + torch.get_float32_matmul_precision, + torch.unify_type_list, + torch.is_warn_always_enabled, + torch.set_warn_always, + torch.vitals_enabled, + torch.set_vital, + torch.read_vitals, + torch.vmap, + torch.cond, + torch.frombuffer, + torch.asarray, + torch._functional_sym_constrain_range, + torch._make_dep_token, + Tensor.__delitem__, + Tensor.__dir__, + Tensor.__getattribute__, + Tensor.__init__, + Tensor.__iter__, + Tensor.__init_subclass__, + Tensor.__delattr__, + Tensor.__setattr__, + Tensor.__torch_function__, + Tensor.__torch_dispatch__, + Tensor.__new__, + Tensor.__class__, + Tensor.__subclasshook__, + Tensor.__hash__, + Tensor.as_subclass, + Tensor.eig, + Tensor.lstsq, + Tensor.reinforce, + Tensor.new, + Tensor.new_tensor, + Tensor.new_empty, + Tensor.new_empty_strided, + Tensor.new_zeros, + Tensor.new_ones, + Tensor.new_full, + Tensor._make_subclass, + Tensor.solve, + Tensor.symeig, + Tensor.stride, + Tensor.unflatten, + Tensor.to_sparse_coo, + Tensor.to_sparse_csr, + Tensor.to_sparse_csc, + Tensor.to_sparse_bsr, + Tensor.to_sparse_bsc, + Tensor._to_sparse, + Tensor._to_sparse_csr, + Tensor._to_sparse_csc, + Tensor._to_sparse_bsr, + Tensor._to_sparse_bsc, + Tensor._typed_storage, + Tensor._reduce_ex_internal, + Tensor._fix_weakref, + Tensor._view_func, + Tensor._view_func_unsafe, + Tensor._rev_view_func_unsafe, + Tensor._make_wrapper_subclass, + Tensor._python_dispatch.__get__, + Tensor._has_symbolic_sizes_strides.__get__, + Tensor._conj, + Tensor._conj_physical, + Tensor._lazy_clone, + Tensor._neg_view, + Tensor._is_zerotensor, + Tensor._is_all_true, + Tensor._is_any_true, + Tensor._addmm_activation, + Tensor.to_padded_tensor, + } + + +@functools.lru_cache(None) +def get_default_nowrap_functions() -> Set[Callable]: + """ + Return public functions that do not wrap in a subclass when invoked by + the default ``Tensor.__torch_function__`` that preserves subclasses. Typically, + these functions represent field accesses (i.e., retrieving a Tensor that + is stored somewhere on the Tensor) as opposed to computation. Users of + these functions expect object identity to be preserved over multiple accesses + (e.g., ``a.grad is a.grad``) which cannot be upheld if we're wrapping on + the fly every time (furthermore, the tensor stored here might already be + the subclass, in which case wrapping really ought not to happen). + + Not ALL property accessors have this property; for example ``Tensor.T`` actually + just creates a new transposed tensor on the fly, and so we SHOULD interpose on + these calls (you need to check the implementation of the function to see if + this is the case or not). Additionally, if a property accessor doesn't return a Tensor, + it doesn't have to be on this list (though it is harmless if it is). + """ + Tensor = torch.Tensor + return { + Tensor._base.__get__, + Tensor.grad.__get__, + Tensor._grad.__get__, + } + + +@functools.lru_cache(None) +@_disable_user_warnings +def get_testing_overrides() -> Dict[Callable, Callable]: + """Return a dict containing dummy overrides for all overridable functions + + Returns + ------- + Dict[Callable, Callable] + A dictionary that maps overridable functions in the PyTorch API to + lambda functions that have the same signature as the real function + and unconditionally return -1. These lambda functions are useful + for testing API coverage for a type that defines ``__torch_function__``. + + Examples + -------- + >>> import inspect + >>> my_add = torch.overrides.get_testing_overrides()[torch.add] + >>> inspect.signature(my_add) + + """ + # Every function in the PyTorchAPI that can be overriden needs an entry + # in this dict. + # + # Optimally we would use inspect to get the function signature and define + # the lambda function procedurally but that is blocked by generating + # function signatures for native kernels that can be consumed by inspect. + # See Issue #28233. + Tensor = torch.Tensor + ret: Dict[Callable, Callable] = { + torch.abs: lambda input, out=None: -1, + torch.absolute: lambda input, out=None: -1, + torch.adaptive_avg_pool1d: lambda input, output_size: -1, + torch.adaptive_max_pool1d: lambda inputs, output_size: -1, + torch.acos: lambda input, out=None: -1, + torch.adjoint: lambda input: -1, + torch.arccos: lambda input, out=None: -1, + torch.acosh: lambda input, out=None: -1, + torch.arccosh: lambda input, out=None: -1, + torch.add: lambda input, other, out=None: -1, + torch.addbmm: lambda input, batch1, batch2, alpha=1, beta=1, out=None: -1, + torch.addcdiv: lambda input, tensor1, tensor2, value=1, out=None: -1, + torch.addcmul: lambda input, tensor1, tensor2, value=1, out=None: -1, + torch.addmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1, + torch.addmv: lambda input, mat, vec, beta=1, alpha=1, out=None: -1, + torch.addr: lambda input, vec1, vec2, beta=1, alpha=1, out=None: -1, + torch.affine_grid_generator: lambda theta, size, align_corners: -1, + torch.all: lambda input, dim=None: -1, + torch.allclose: lambda input, other, trol=1e-05, atol=1e-08, equal_nan=False: -1, + torch.alpha_dropout: lambda input, p, train, inplace=False: -1, + torch.amax: lambda input, dim=None: -1, + torch.amin: lambda input, dim=None: -1, + torch.aminmax: lambda input, dim=None, keepdim=False, out=None: -1, + torch.angle: lambda input, out=None: -1, + torch.any: lambda input, dim=None, keepdim=False, out=None: -1, + torch.argmax: lambda input: -1, + torch.argmin: lambda input: -1, + torch.argsort: lambda input, dim=None: -1, + torch.asin: lambda input, out=None: -1, + torch._assert_async: lambda input, msg: -1, + torch.arcsin: lambda input, out=None: -1, + torch.asinh: lambda input, out=None: -1, + torch.arcsinh: lambda input, out=None: -1, + torch.atan: lambda input, out=None: -1, + torch.arctan: lambda input, out=None: -1, + torch.atan2: lambda input, other, out=None: -1, + torch.arctan2: lambda input, other, out=None: -1, + torch.atanh: lambda input, out=None: -1, + torch.arctanh: lambda input, out=None: -1, + torch.atleast_1d: lambda *tensors: -1, + torch.atleast_2d: lambda *tensors: -1, + torch.atleast_3d: lambda *tensors: -1, + torch.avg_pool1d: lambda input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True: -1, + torch.baddbmm: lambda input, batch1, batch2, alpha=1, beta=1, out=None: -1, + torch.batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled: -1, + torch.batch_norm_backward_elemt: lambda grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count_tensor: -1, + torch.batch_norm_backward_reduce: lambda grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g: -1, + torch.batch_norm_elemt: lambda input, weight, bias, mean, invstd, eps: -1, + torch.batch_norm_gather_stats: lambda input, mean, invstd, running_mean, running_var, momentum, eps, count: -1, + torch.batch_norm_gather_stats_with_counts: lambda input, mean, invstd, running_mean, running_var, momentum, eps, count: -1, + torch.batch_norm_stats: lambda input, eps: -1, + torch.batch_norm_update_stats: lambda input, running_mean, running_var, momentum: -1, + torch.bernoulli: lambda input, generator=None, out=None: -1, + torch.bilinear: lambda input1, input2, weight, bias: -1, + torch.binary_cross_entropy_with_logits: (lambda input, target, weight=None, size_average=None, reduce=None, + reduction='mean', pos_weight=None: -1), + torch.bincount: lambda input, weights=None, minlength=0: -1, + torch.binomial: lambda count, prob, generator=None: -1, + torch.bitwise_and: lambda input, other, out=None: -1, + torch.bitwise_not: lambda input, out=None: -1, + torch.bitwise_or: lambda input, other, out=None: -1, + torch.bitwise_xor: lambda input, other, out=None: -1, + torch.bitwise_left_shift: lambda input, other, out=None: -1, + torch.bitwise_right_shift: lambda input, other, out=None: -1, + torch.block_diag: lambda *tensors: -1, + torch.bmm: lambda input, mat2, out=None: -1, + torch.broadcast_tensors: lambda *tensors: -1, + torch.broadcast_to: lambda self, size: -1, + torch.bucketize: lambda input, boundaries, out_int32=False, right=False, out=None: -1, + torch.cartesian_prod: lambda *tensors: -1, + torch.cat: lambda tensors, dim=0, out=None: -1, + torch.concat: lambda tensors, dim=0, out=None: -1, # alias for torch.cat + torch.concatenate: lambda tensors, dim=0, out=None: -1, # alias for torch.concatenate + torch.cdist: lambda x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary': -1, + torch.ceil: lambda input, out=None: -1, + torch.celu: lambda input, alpha=1., inplace=False: -1, + torch.chain_matmul: lambda *matrices, out=None: -1, + torch.channel_shuffle: lambda input, groups : -1, + torch.cholesky: lambda input, upper=False, out=None: -1, + torch.linalg.cholesky: lambda input, out=None: -1, + torch.linalg.cholesky_ex: lambda input, check_errors=False, out=None: -1, + torch.cholesky_inverse: lambda input, upper=False, out=None: -1, + torch.cholesky_solve: lambda input1, input2, upper=False, out=None: -1, + torch.choose_qparams_optimized: lambda input, numel, n_bins, ratio, bit_width: -1, + torch.chunk: lambda input, chunks, dim=0: -1, + torch.clamp: lambda input, min=None, max=None, out=None: -1, + torch.clip: lambda input, min=None, max=None, out=None: -1, + torch.clamp_min: lambda input, min, out=None: -1, + torch.clamp_max: lambda input, max, out=None: -1, + torch.column_stack: lambda tensors, out=None: -1, + torch.cov: lambda input, correction=1, fweights=None, aweights=None: -1, + torch.clone: lambda input: -1, + torch.combinations: lambda input, r=2, with_replacement=False: -1, + torch.complex: lambda real, imag: -1, + torch.copysign: lambda input, other, out=None: -1, + torch.polar: lambda abs, ang: -1, + torch.linalg.cond: lambda input, ord=None: -1, + torch.conj: lambda input, out=None: -1, + torch.conj_physical: lambda input, out=None: -1, + torch.resolve_conj: lambda input, out=None: -1, + torch.resolve_neg: lambda input, out=None: -1, + torch.constant_pad_nd: lambda input, pad, value=0: -1, + torch.conv1d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1, + torch.conv2d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1, + torch.conv3d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1, + torch.convolution: lambda input, weight, bias, stride, padding, dilation, transposed, output_adding, groups: -1, + torch.conv_tbc: lambda input, weight, bias, pad=0: -1, + torch.conv_transpose1d: lambda input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1: -1, + torch.conv_transpose2d: lambda input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1: -1, + torch.conv_transpose3d: lambda input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1: -1, + torch.corrcoef: lambda input: -1, + torch.cos: lambda input, out=None: -1, + torch.cosine_embedding_loss: lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1, + torch.cosh: lambda input, out=None: -1, + torch.cosine_similarity: lambda x1, x2, dim=1, eps=1e-8: -1, + torch.count_nonzero: lambda input: -1, + torch.cross: lambda input, other, dim=None, out=None: -1, + torch.linalg.cross: lambda input, other, dim=-1, out=None: -1, + torch.ctc_loss: (lambda log_probs, targets, input_lengths, target_lengths, blank=0, reduction='mean', + zero_infinity=False: -1), + torch.cummax: lambda input, dim, out=None: -1, + torch.cummin: lambda input, dim, out=None: -1, + torch.cumprod: lambda input, dim, out=None, dtype=None: -1, + torch.cumsum: lambda input, dim, out=None, dtype=None: -1, + torch.cumulative_trapezoid: lambda y, x=None, dim=-1: -1, + torch.logcumsumexp: lambda input, dim, out=None: -1, + torch.deg2rad: lambda input, out=None: -1, + torch.dequantize: lambda input: -1, + torch.det: lambda input: -1, + torch.linalg.det: lambda input: -1, # alias for torch.det # type: ignore[attr-defined] + torch.detach: lambda input: -1, + torch.diag: lambda input, diagonal=0, out=None: -1, + torch.diag_embed: lambda input, diagonal=0, out=None: -1, + torch.diagflat: lambda input, offset=0: -1, + torch.diff: lambda input, n=1, dim=-1, prepend=None, append=None, out=None: -1, + torch.diagonal: lambda input, offset=0, dim1=0, dim2=1: -1, + torch.linalg.diagonal: lambda input, offset=0, dim1=-2, dim2=-1: -1, + torch.diagonal_scatter: lambda input, src, offset=0, dim1=0, dim2=1: -1, + torch.as_strided_scatter: lambda self, src, size, stride, storage_offset=None: -1, + torch.digamma: lambda input, out=None: -1, + torch.dist: lambda input, other, p=2: -1, + torch.div: lambda input, other, rounding_mode=None, out=None: -1, + torch.divide: lambda input, other, rounding_mode=None, out=None: -1, + torch.dot: lambda input, other, out=None: -1, + torch.dropout: lambda input, p, train, inplace=False: -1, + torch.dsmm: lambda input, mat2: -1, + torch.hsmm: lambda mat1, mat2: -1, + torch.dsplit: lambda input, indices_or_sections: -1, + torch.dstack: lambda tensors, out=None: -1, + torch.linalg.eig: lambda input, out=None: -1, + torch.linalg.eigvals: lambda input, out=None: -1, + torch.linalg.eigh: lambda input, UPLO="L", out=None: -1, + torch.linalg.eigvalsh: lambda input, UPLO="L", out=None: -1, + torch.einsum: lambda equation, *operands: -1, + torch.embedding: (lambda input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, + sparse=False: -1), + torch.embedding_bag: (lambda input, weight, offsets, max_norm=None, norm_type=2, scale_grad_by_freq=False, + mode='mean', sparse=False, per_sample_weights=None, padding_idx=None: -1), + torch.empty_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1, + torch.eq: lambda input, other, out=None: -1, + torch.equal: lambda input, other: -1, + torch.erf: lambda input, out=None: -1, + torch.erfc: lambda input, out=None: -1, + torch.erfinv: lambda input, out=None: -1, + torch.exp: lambda input, out=None: -1, + torch.exp2: lambda input, out=None: -1, + torch.expm1: lambda input, out=None: -1, + torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1, + torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1, + torch.fused_moving_avg_obs_fake_quant: (lambda x, observer_on, fake_quant_on, averaging_const, running_min, + running_max, scale, zero_point, quant_min, quant_max, ch_axis, + per_row_fake_quant=False, symmetric_quant=False: -1), + torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1, + torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1, + torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1, + torch.fbgemm_linear_int8_weight_fp32_activation: (lambda input, weight, packed, col_offsets, weight_scale, + weight_zero_point, bias: -1), + torch.fbgemm_linear_quantize_weight: lambda input: -1, + torch.fbgemm_pack_gemm_matrix_fp16: lambda input: -1, + torch.fbgemm_pack_quantized_matrix: lambda input, a, b: -1, + torch.feature_alpha_dropout: lambda input, p, train: -1, + torch.feature_dropout: lambda input, p, train: -1, + torch.fft.ifft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.rfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.irfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.hfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.ihfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.hfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.ihfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.hfftn: lambda input, s=None, dim=-1, norm=None: -1, + torch.fft.ihfftn: lambda input, s=None, dim=-1, norm=None: -1, + torch.fft.fftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.ifftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.rfftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.irfftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.fft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.ifft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.rfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.irfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.fftshift: lambda input, dim=None: -1, + torch.fft.ifftshift: lambda input, dim=None: -1, + torch.fft.fft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fix: lambda input, out=None: -1, + torch.flatten: lambda input, start_dim=0, end_dim=-1: -1, + torch.flip: lambda input, dims: -1, + torch.fliplr: lambda input: -1, + torch.flipud: lambda input: -1, + torch.frobenius_norm: lambda input, dim=None, keepdim=False, out=None: -1, + torch.floor: lambda input, out=None: -1, + torch.floor_divide: lambda input, other: -1, + torch.float_power: lambda input, exponent, out=None: -1, + torch.fmod: lambda input, other, out=None: -1, + torch.frac: lambda input, out=None: -1, + torch.frexp: lambda input, out=None: -1, + torch.full_like: lambda input, fill_value, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False: -1, + torch._functional_assert_async: lambda input, msg, dep_token: -1, + torch.lu_unpack: lambda LU_data, LU_pivots, unpack_data=True, unpack_pivots=True: -1, + torch.gather: lambda input, dim, index, out=None, sparse_grad=False: -1, + torch.gcd: lambda input, other, out=None: -1, + torch.ge: lambda input, other, out=None: -1, + torch.greater_equal: lambda input, other, out=None: -1, + torch.geqrf: lambda input, out=None: -1, + torch.i0: lambda input, out=None: -1, + torch.inner: lambda input, other, out=None: -1, + torch.outer: lambda input, vec2, out=None: -1, + torch.ger: lambda input, vec2, out=None: -1, # alias for torch.outer + torch.gradient: lambda input, spacing=None, dim=None, edge_order=1: -1, + torch.grid_sampler: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1, + torch.grid_sampler_2d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1, + torch.grid_sampler_3d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1, + torch.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05, cudnn_enabled=True: -1, + torch.gru: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1, + torch.gru_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1, + torch.gt: lambda input, other, out=None: -1, + torch.greater: lambda input, other, out=None: -1, + torch.hardshrink: lambda input, lambd=0.5: -1, + torch.heaviside: lambda input, values, out=None: -1, + torch.hinge_embedding_loss: lambda input, target, margin=1.0, size_average=None, reduce=None, reduction='mean': -1, + torch.histc: lambda input, bins=100, min=0, max=0, out=None: -1, + torch.histogram: lambda input, bins=100, min=None, max=None, weight=None, density=False, out=None: -1, + torch.histogramdd: lambda input, bins, range=None, weight=None, density=False: -1, + torch.linalg.householder_product: lambda input, tau: -1, + torch.hspmm: lambda mat1, mat2, out=None: -1, + torch.hsplit: lambda input, indices_or_sections: -1, + torch.hstack: lambda tensors, out=None: -1, + torch.hypot: lambda input, other, out=None: -1, + torch.igamma: lambda input, other, out=None: -1, + torch.igammac: lambda input, other, out=None: -1, + torch.imag: lambda input, out=None: -1, + torch.index_add: lambda input, dim, index, source: -1, + torch.index_copy: lambda input, dim, index, source: -1, + torch.index_put: lambda input, indices, values, accumulate=False: -1, + torch.index_select: lambda input, dim, index, out=None: -1, + torch.index_fill: lambda input, dim, index, value: -1, + torch.index_reduce: lambda input, dim, index, source, reduce, include_input=True: -1, + torch.isfinite: lambda tensor: -1, + torch.isin: lambda e, te, assume_unique=False, invert=False: -1, + torch.isinf: lambda tensor: -1, + torch.isreal: lambda tensor: -1, + torch.isposinf: lambda input, out=None: -1, + torch.isneginf: lambda input, out=None: -1, + torch.instance_norm: (lambda input, running_mean, running_var, weight, bias, use_input_stats, momentum, eps, + cudnn_enabled: -1), + torch.int_repr: lambda input: -1, + torch.inverse: lambda input, out=None: -1, + torch.linalg.inv: lambda input, out=None: -1, + torch.linalg.inv_ex: lambda input, check_errors=False, out=None: -1, + torch.is_complex: lambda input: -1, + torch.is_conj: lambda input: -1, + torch.is_neg: lambda input: -1, + torch.is_distributed: lambda input: -1, + torch.is_inference: lambda input: -1, + torch.is_floating_point: lambda input: -1, + torch.is_nonzero: lambda input: -1, + torch.is_same_size: lambda input, other: -1, + torch.is_signed: lambda input: -1, + torch.isclose: lambda input, other, rtol=1e-05, atol=1e-08, equal_nan=False: -1, + torch.isnan: lambda input: -1, + torch.istft: (lambda input, n_fft, hop_length=None, win_length=None, window=None, center=True, + normalized=False, onesided=None, length=None, return_complex=False: -1), + torch.kl_div: lambda input, target, size_average=None, reduce=None, reduction='mean', log_target=False: -1, + torch.kron: lambda input, other: -1, + torch.kthvalue: lambda input, k, dim=None, keepdim=False, out=None: -1, + torch.linalg.ldl_factor_ex: lambda input, hermitian=False, check_errors=False, out=None: -1, + torch.linalg.ldl_factor: lambda input, hermitian=False, out=None: -1, + torch.linalg.ldl_solve: lambda LD, pivots, B, hermitian=False, out=None: -1, + torch.layer_norm: lambda input, normalized_shape, weight=None, bias=None, esp=1e-05, cudnn_enabled=True: -1, + torch.lcm: lambda input, other, out=None: -1, + torch.ldexp: lambda input, other, out=None: -1, + torch.le: lambda input, other, out=None: -1, + torch.less_equal: lambda input, other, out=None: -1, + torch.lerp: lambda input, end, weight, out=None: -1, + torch.lgamma: lambda input, out=None: -1, + torch.lobpcg: lambda input, k=None, B=None, X=None, n=None, iK=None, niter=None, tol=None, largest=None, method=None, + tracker=None, ortho_iparams=None, ortho_fparams=None, ortho_bparams=None: -1, + torch.log: lambda input, out=None: -1, + torch.log_softmax: lambda input, dim, dtype=None: -1, + torch.log10: lambda input, out=None: -1, + torch.log1p: lambda input, out=None: -1, + torch.log2: lambda input, out=None: -1, + torch.logaddexp: lambda input, other, out=None: -1, + torch.logaddexp2: lambda input, other, out=None: -1, + torch.logdet: lambda input: -1, + torch.xlogy: lambda x, y, out=None: -1, + torch.logical_and: lambda input, other, out=None: -1, + torch.logical_not: lambda input, out=None: -1, + torch.logical_or: lambda input, other, out=None: -1, + torch.logical_xor: lambda input, other, out=None: -1, + torch.logit: lambda input, eps=None: -1, + torch.logsumexp: lambda input, names, keepdim=False, out=None: -1, + torch.lstm: lambda data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional: -1, + torch.lstm_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1, + torch.lt: lambda input, other, out=None: -1, + torch.less: lambda input, other, out=None: -1, + torch.lu: lambda A, pivot=True, get_infos=False, out=None: -1, + torch.lu_solve: lambda b, LU_data, LU_pivots, out=None: -1, + torch.margin_ranking_loss: lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1, # type: ignore[attr-defined] # noqa: B950 + torch.masked_fill: lambda input, mask, value: -1, + torch.masked_scatter: lambda input, mask, source: -1, + torch.masked_select: lambda input, mask, out=None: -1, + torch.matmul: lambda input, other, out=None: -1, + torch.linalg.lu: lambda input, pivot=True, out=None: -1, + torch.linalg.lu_factor: lambda input, pivot=True, out=None: -1, + torch.linalg.lu_factor_ex: lambda input, pivot=True, check_errors=False, out=None: -1, + torch.linalg.lu_solve: lambda LU, pivots, B, left=True, adjoint=False, out=None: -1, + torch.linalg.matmul: lambda input, other, out=None: -1, # alias for torch.matmul + torch.matrix_power: lambda input, n: -1, + torch.linalg.matrix_power: lambda input, n, out=None: -1, + torch.linalg.matrix_rank: lambda input, tol=None, hermitian=False: -1, + torch.linalg.multi_dot: lambda tensors, out=None: -1, + torch.matrix_exp: lambda input: -1, + torch.linalg.matrix_exp: lambda input: -1, + torch.max: lambda input, out=None: -1, + torch.maximum: lambda input, other, out=None: -1, + torch.fmax: lambda input, other, out=None: -1, + torch.max_pool1d: lambda input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False: -1, + torch.max_pool2d: lambda input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False: -1, + torch.max_pool3d: lambda input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False: -1, + torch.max_pool1d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + return_indices=False, ceil_mode=False: -1), + torch.mean: lambda input, dim=None: -1, + torch.nanmean: lambda input, dim=None, keepdim=False, dtype=None, out=None: -1, + torch.median: lambda input, dim=None: -1, + torch.nanmedian: lambda input, dim=None: -1, + torch.meshgrid: lambda *tensors, **kwargs: -1, + torch.min: lambda input, out=None: -1, + torch.minimum: lambda input, other, out=None: -1, + torch.fmin: lambda input, other, out=None: -1, + torch.miopen_batch_norm: (lambda input, weight, bias, running_mean, running_var, training, + exponential_average_factor, epsilon: -1), + torch.miopen_convolution: lambda input, weight, bias, padding, stride, dilation, groups, benchmark, deterministic: -1, + torch.miopen_convolution_add_relu: lambda input, weight, z, alpha, bias, stride, padding, dilation, groups: -1, + torch.miopen_convolution_relu: lambda input, weight, bias, stride, padding, dilation, groups: -1, + torch.miopen_convolution_transpose: (lambda input, weight, bias, padding, output_padding, stride, dilation, + groups, benchmark, deterministic: -1), + torch.miopen_depthwise_convolution: (lambda input, weight, bias, padding, stride, dilation, groups, benchmark, + deterministic: -1), + torch.miopen_rnn: (lambda input, weight, weight_stride0, hx, cx, mode, hidden_size, num_layers, batch_first, + dropout, train, bidirectional, batch_sizes, dropout_state: -1), + torch.mm: lambda input, mat2, out=None: -1, + torch.mode: lambda input, dim=-1, keepdim=False, out=None: -1, + torch.movedim: lambda input, source, destination: -1, + torch.moveaxis: lambda input, source, destination: -1, + torch.msort: lambda input, descending=False, out=None: -1, + torch.mul: lambda input, other, out=None: -1, + torch.multiply: lambda input, other, out=None: -1, + torch.multinomial: lambda input, num_samples, replacement=False, out=None: -1, + torch.mv: lambda input, vec, out=None: -1, + torch.mvlgamma: lambda input, p: -1, + torch.narrow: lambda input, dim, start, length: -1, + torch.nan_to_num: lambda input, nan=0.0, posinf=None, neginf=None, out=None: -1, + torch.native_batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps: -1, + torch._native_batch_norm_legit: lambda input, weight, bias, training, momentum, eps: -1, + torch.native_dropout: lambda input, p, train: -1, + torch.native_layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1, + torch.native_group_norm: lambda input, weight, bias, N, C, HxW, group, eps: -1, + torch.native_norm: lambda input, p=2, dim=None, keepdim=False, dtype=None: -1, + torch.native_channel_shuffle: lambda input, groups : -1, + torch.ne: lambda input, other, out=None: -1, + torch.not_equal: lambda input, other, out=None: -1, + torch.neg: lambda input, out=None: -1, + torch.negative: lambda input, out=None: -1, + torch.nextafter: lambda input, other, out=None: -1, + torch.nn.functional.adaptive_avg_pool2d: lambda input, output_size: -1, + torch.nn.functional.adaptive_avg_pool3d: lambda input, output_size: -1, + torch.nn.functional.adaptive_max_pool1d: lambda input, output_size, return_indices=False: -1, + torch.nn.functional.adaptive_max_pool1d_with_indices: lambda input, output_size, return_indices=False: -1, + torch.nn.functional.adaptive_max_pool2d: lambda input, output_size, return_indices=False: -1, + torch.nn.functional.adaptive_max_pool2d_with_indices: lambda input, output_size, return_indices=False: -1, + torch.nn.functional.adaptive_max_pool3d: lambda input, output_size, return_indices=False: -1, + torch.nn.functional.adaptive_max_pool3d_with_indices: lambda input, output_size, return_indices=False: -1, + torch.nn.functional.affine_grid: lambda theta, size, align_corners=None: -1, + torch.nn.functional.alpha_dropout: lambda input, p=0.5, training=False, inplace=False: -1, + torch.nn.functional.avg_pool2d: (lambda input, kernel_size, stride=None, padding=0, ceil_mode=False, + count_include_pad=True, divisor_override=None: -1), + torch.nn.functional.avg_pool3d: (lambda input, kernel_size, stride=None, padding=0, ceil_mode=False, + count_include_pad=True, divisor_override=None: -1), + torch.nn.functional.batch_norm: (lambda input, running_mean, running_var, weight=None, bias=None, training=False, + momentum=0.1, eps=1e-05: -1), + torch.nn.functional.bilinear: lambda input1, input2, weight, bias=None: -1, + torch.nn.functional.binary_cross_entropy: (lambda input, target, weight=None, size_average=None, reduce=None, + reduction="mean": -1), + torch.nn.functional.binary_cross_entropy_with_logits: (lambda input, target, weight=None, size_average=None, + reduce=None, reduction="mean", pos_weight=None: -1), + torch.nn.functional.celu: lambda input, alpha=1.0, inplace=False: -1, + torch.nn.functional.cosine_embedding_loss: (lambda input1, input2, target, margin=0, size_average=None, + reduce=None, reduction='mean': -1), + torch.nn.functional.cross_entropy: (lambda input, target, weight=None, size_average=None, ignore_index=-100, + reduce=None, reduction="mean", label_smoothing=0.0: -1), + torch.nn.functional.ctc_loss: (lambda log_probs, targets, input_lengths, target_lengths, blank=0, + reduction='mean', zero_infinity=False: -1), + torch.nn.functional.dropout: lambda input, p=0.5, training=True, inplace=False: -1, + torch.nn.functional.dropout1d: lambda input, p=0.5, training=True, inplace=False: -1, + torch.nn.functional.dropout2d: lambda input, p=0.5, training=True, inplace=False: -1, + torch.nn.functional.dropout3d: lambda input, p=0.5, training=True, inplace=False: -1, + torch.nn.functional.elu: lambda input, alpha=1.0, inplace=False: -1, + torch.nn.functional.embedding: (lambda input, weight, padding_idx=None, max_norm=None, norm_type=2.0, + scale_grad_by_freq=False, sparse=False: -1), + torch.nn.functional.embedding_bag: (lambda input, weight, offsets=None, max_norm=None, norm_type=2, + scale_grad_by_freq=False, mode='mean', sparse=False, per_sample_weights=None, + include_last_offset=False, padding_idx=None: -1), + torch.nn.functional.feature_alpha_dropout: lambda input, p=0.5, training=False, inplace=False: -1, + torch.nn.functional.fold: lambda input, output_size, kernel_size, dilation=1, padding=0, stride=1: -1, + torch.nn.functional.fractional_max_pool2d: (lambda input, kernel_size, output_size=None, output_ratio=None, + return_indices=False, _random_samples=None: -1), + torch.nn.functional.fractional_max_pool2d_with_indices: ( + lambda input, kernel_size, output_size=None, output_ratio=None, return_indices=False, + _random_samples=None: -1), + torch.nn.functional.fractional_max_pool3d: (lambda input, kernel_size, output_size=None, output_ratio=None, + return_indices=False, _random_samples=None: -1), + torch.nn.functional.fractional_max_pool3d_with_indices: ( + lambda input, kernel_size, output_size=None, output_ratio=None, return_indices=False, + _random_samples=None: -1), + torch.nn.functional.gaussian_nll_loss: lambda input, target, var, full=False, eps=1e-06, reduction='mean': -1, + torch.nn.functional.gelu: lambda input, approximate='none': -1, + torch.nn.functional.glu: lambda input, dim=-1: -1, + torch.nn.functional.grid_sample: lambda input, grid, mode='bilinear', padding_mode='zeros', align_corners=None: -1, + torch.nn.functional.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05: -1, + torch.nn.functional.gumbel_softmax: lambda logits, tau=1, hard=False, eps=1e-10, dim=-1: -1, + torch.nn.functional.hardshrink: lambda input, lambd=0.5: -1, + torch.nn.functional.hardtanh: lambda input, min_val=-1., max_val=1., inplace=False: -1, + torch.nn.functional.hinge_embedding_loss: (lambda input, target, margin=1.0, size_average=None, reduce=None, + reduction='mean': -1), + torch.nn.functional.instance_norm: (lambda input, running_mean=None, running_var=None, weight=None, bias=None, + use_input_stats=True, momentum=0.1, eps=1e-05: -1), + torch.nn.functional.interpolate: (lambda input, size=None, scale_factor=None, mode='nearest', align_corners=None, + recompute_scale_factor=None, antialias=False: -1), + torch.nn.functional.kl_div: lambda input, target, size_average=None, reduce=None, reduction='mean', log_target=False: -1, + torch.nn.functional.l1_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1, + torch.nn.functional.layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1, + torch.nn.functional.leaky_relu: lambda input, negative_slope=0.01, inplace=False: -1, + torch.nn.functional.linear: lambda input, weight, bias=None: -1, + torch.nn.functional.local_response_norm: lambda input, size, alpha=0.0001, beta=0.75, k=1.0: -1, + torch.nn.functional.log_softmax: lambda input, dim=None, _stacklevel=3, dtype=None: -1, + torch.nn.functional.logsigmoid: lambda input: -1, + torch.nn.functional.lp_pool1d: lambda input, norm_type, kernel_size, stride=None, ceil_mode=False: -1, + torch.nn.functional.lp_pool2d: lambda input, norm_type, kernel_size, stride=None, ceil_mode=False: -1, + torch.nn.functional.lp_pool3d: lambda input, norm_type, kernel_size, stride=None, ceil_mode=False: -1, + torch.nn.functional.margin_ranking_loss: (lambda input1, input2, target, margin=0, size_average=None, + reduce=None, reduction='mean': -1), + torch.nn.functional.max_pool1d: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + ceil_mode=False, return_indices=False: -1), + torch.nn.functional.max_pool1d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + return_indices=False, ceil_mode=False: -1), + torch.nn.functional.max_pool2d: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + ceil_mode=False, return_indices=False: -1), + torch.nn.functional.max_pool2d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + return_indices=False, ceil_mode=False: -1), + torch.nn.functional.max_pool3d: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + return_indices=False, ceil_mode=False: -1), + torch.nn.functional.max_pool3d_with_indices: (lambda input, kernel_size, stride=None, padding=0, dilation=1, + return_indices=False, ceil_mode=False: -1), + torch.nn.functional.max_unpool1d: lambda input, indices, kernel_size, stride=None, padding=0, output_size=None: -1, + torch.nn.functional.max_unpool2d: lambda input, indices, kernel_size, stride=None, padding=0, output_size=None: -1, + torch.nn.functional.max_unpool3d: lambda input, indices, kernel_size, stride=None, padding=0, output_size=None: -1, + torch.nn.functional.mse_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1, + torch.nn.functional.multi_head_attention_forward: ( + lambda query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, + add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training=True, key_padding_mask=None, + need_weights=True, attn_mask=None, use_separate_proj_weight=False, q_proj_weight=None, k_proj_weight=None, + v_proj_weight=None, static_k=None, static_v=None, average_attn_weights=None, is_causal=False: -1), + torch.nn.functional.multi_margin_loss: (lambda input, target, p=1, margin=1.0, weight=None, size_average=None, + reduce=None, reduction='mean': -1), + torch.nn.functional.multilabel_margin_loss: (lambda input, target, size_average=None, reduce=None, + reduction='mean': -1), + torch.nn.functional.multilabel_soft_margin_loss: (lambda input, target, weight=None, size_average=None, + reduce=None, reduction='mean': -1), + torch.nn.functional.nll_loss: (lambda input, target, weight=None, size_average=None, ignore_index=-100, + reduce=None, reduction='mean': -1), + torch.nn.functional.normalize: lambda input, p=2, dim=1, eps=1e-12, out=None: -1, + torch.nn.functional.one_hot: lambda tensor, num_classes=-1: -1, + torch.nn.functional.pad: lambda input, pad, mode='constant', value=0: -1, + torch.nn.functional.pairwise_distance: lambda x1, x2, p=2.0, eps=1e-06, keepdim=False: -1, + torch.nn.functional.poisson_nll_loss: (lambda input, target, log_input=True, full=False, size_average=None, + eps=1e-08, reduce=None, reduction='mean': -1), + torch.nn.functional.prelu: lambda input, weight: -1, + torch.nn.functional.relu: lambda input, inplace=False: -1, + torch.nn.functional.relu6: lambda input, inplace=False: -1, + torch.nn.functional.rrelu: lambda input, lower=0.125, upper=0.3333333333333333, training=False, inplace=False: -1, + torch.nn.functional.selu: lambda input, inplace=False: -1, + torch.nn.functional.silu: lambda input, inplace=False: -1, + torch.nn.functional.mish: lambda input, inplace=False: -1, + torch.nn.functional.scaled_dot_product_attention: lambda query, key, value, attn_mask=None, dropout_p=0.0: -1, + torch.nn.functional.smooth_l1_loss: lambda input, target, size_average=None, reduce=None, reduction='mean', beta=1.: -1, + torch.nn.functional.huber_loss: lambda input, target, reduction='mean', delta=1.: -1, + torch.nn.functional.soft_margin_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1, + torch.nn.functional.softmax: lambda input, dim=None, _stacklevel=3, dtype=None: -1, + torch.nn.functional.softmin: lambda input, dim=None, _stacklevel=3, dtype=None: -1, + torch.nn.functional.softplus: lambda input, beta=1, threshold=20: -1, + torch.nn.functional.softshrink: lambda input, lambd=0.5: -1, + torch.nn.functional.softsign: lambda input: -1, + torch.nn.functional.tanhshrink: lambda input: -1, + torch.nn.functional.threshold: lambda input, threshold, value, inplace=False: -1, + torch.nn.functional.triplet_margin_loss: (lambda anchor, positive, negative, margin=1.0, p=2, eps=1e-06, + swap=False, size_average=None, reduce=None, reduction='mean': -1), + torch.nn.functional.triplet_margin_with_distance_loss: (lambda anchor, positive, negative, *, + distance_function=None, margin=1.0, + swap=False, reduction='mean': -1), + torch.nn.functional.unfold: lambda input, kernel_size, dilation=1, padding=0, stride=1: -1, + torch.nn.init.uniform_: lambda tensor, a=0., b=1., generator=None: -1, + torch.nn.init.normal_: lambda tensor, mean=0., std=1., generator=None: -1, + torch.nn.init.constant_: lambda tensor, val: -1, + torch.nn.init.kaiming_uniform_: lambda tensor, a=0, mode='fan_in', nonlinearity='leaky_relu', generator=None: -1, + torch.nonzero: lambda input, as_tuple=False: -1, + torch.nonzero_static: lambda input, *, size, fill_value=-1: -1, + torch.argwhere: lambda input: -1, + torch.norm: lambda input, p='fro', dim=None, keepdim=False, out=None, dtype=None: -1, + torch.linalg.norm: lambda input, ord=None, dim=None, keepdim=False, out=None, dtype=None: -1, + torch.linalg.vector_norm: lambda input, ord=2, dim=None, keepdim=False, out=None, dtype=None: -1, + torch.linalg.matrix_norm: lambda input, ord='fro', dim=(-2, -1), keepdim=False, out=None, dtype=None: -1, + torch.norm_except_dim: lambda v, pow=2, dim=0: -1, + torch.nuclear_norm: lambda input, p='fro', dim=None, keepdim=False, out=None, dtype=None: -1, + torch.numel: lambda input: -1, + torch.orgqr: lambda input, tau: -1, + torch.ormqr: lambda input, input2, input3, left=True, transpose=False: -1, + torch.pairwise_distance: lambda x1, x2, p=2.0, eps=1e-06, keepdim=False: -1, + torch.permute: lambda self, dim: -1, + torch.pca_lowrank: lambda input, q=None, center=True, niter=2: -1, + torch.pdist: lambda input, p=2: -1, + torch.pinverse: lambda input, rcond=1e-15: -1, + torch.linalg.pinv: lambda input, rcond=1e-15, hermitian=False: -1, + torch.pixel_shuffle: lambda input, upscale_factor: -1, + torch.pixel_unshuffle: lambda input, downscale_factor: -1, + torch.poisson: lambda input, generator=None: -1, + torch.poisson_nll_loss: lambda input, target, log_input, full, eps, reduction: -1, + torch.polygamma: lambda input, n, out=None: -1, + torch.positive: lambda input, out=None: -1, + torch.prelu: lambda input, weight: -1, + torch.ones_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1, + torch.pow: lambda input, exponent, out=None: -1, + torch.prod: lambda input, dtype=None: -1, + torch.put: lambda input, index, source, accumulate=False: -1, + torch.q_per_channel_axis: lambda input: -1, + torch.q_per_channel_scales: lambda input: -1, + torch.q_per_channel_zero_points: lambda input: -1, + torch.q_scale: lambda input: -1, + torch.q_zero_point: lambda input: -1, + torch.qr: lambda input, some=True, out=None: -1, + torch.linalg.qr: lambda input, mode='reduced', out=None: -1, + torch.quantile: lambda input, q, dim=None, keepdim=False, interpolation='linear', out=None: -1, + torch.nanquantile: lambda input, q, dim=None, keepdim=False, interpolation='linear', out=None: -1, + torch.quantize_per_channel: lambda input, scales, zero_points, axis, dtype: -1, + torch.quantize_per_tensor: lambda input, scale, zero_point, dtype: -1, + torch.quantize_per_tensor_dynamic: lambda input, dtype, reduce_range: -1, + torch.quantized_batch_norm: lambda input, weight, bias, mean, var, eps, output_scale, output_zero_point: -1, + torch.quantized_gru_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, + col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1), + + torch.quantized_lstm_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, + col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1), + torch.quantized_max_pool1d: (lambda input, kernel_size, stride=tuple(), padding=(0,), + dilation=(1,), ceil_mode=False: -1), + torch.quantized_max_pool2d: (lambda input, kernel_size, stride=tuple(), padding=(0, 0), + dilation=(1, 1), ceil_mode=False: -1), + torch.quantized_max_pool3d: (lambda input, kernel_size, stride=tuple(), padding=(0, 0, 0), + dilation=(1, 1, 1), ceil_mode=False: -1), + torch.quantized_rnn_relu_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, + col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1), + torch.quantized_rnn_tanh_cell: (lambda input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, + col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh: -1), + torch.rad2deg: lambda input, out=None: -1, + torch.rand_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1, + torch.randint_like: lambda input, high, dtype=None, layout=torch.strided, device=None, requires_grad=False: -1, + torch.randn_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1, + torch.ravel: lambda input: -1, + torch.real: lambda input, out=None: -1, + torch.vdot: lambda input, other, out=None: -1, + torch.linalg.vecdot: lambda input, other, dim=-1, out=None: -1, + torch.view_as_real: lambda input: -1, + torch.view_as_complex: lambda input: -1, + torch.reciprocal: lambda input, out=None: -1, + torch.relu: lambda input, inplace=False: -1, + torch.remainder: lambda input, other, out=None: -1, + torch.renorm: lambda input, p, dim, maxnorm, out=None: -1, + torch.repeat_interleave: lambda input, dim=None: -1, + torch.reshape: lambda input, shape: -1, + torch.rnn_relu: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1, + torch.rnn_relu_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1, + torch.rnn_tanh: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1, + torch.rnn_tanh_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1, + torch.roll: lambda input, shifts, dims=None: -1, + torch.rot90: lambda input, k=1, dims=(0, 1): -1, + torch.round: lambda input, out=None: -1, + torch.row_stack: lambda tensors, out=None: -1, # alias for torch.vstack + torch._rowwise_prune: (lambda weight, mask, compressed_indices_dtype: -1), + torch.rrelu: lambda input, lower=1. / 8, upper=1. / 3, training=False, inplace=False: -1, + torch.rsqrt: lambda input, out=None: -1, + torch.rsub: lambda input, other, alpha=1: -1, + torch.saddmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1, + torch.scatter: lambda input, dim, index, src: -1, + torch.scatter_add: lambda input, dim, index, src: -1, + torch.scatter_reduce: lambda input, dim, index, src, reduce, include_self=True: -1, + torch.searchsorted: lambda sorted_sequence, input, out_int32=False, right=False, out=None: -1, + torch._segment_reduce: lambda data, reduce="max", lengths=None, indices=None, offsets=None, axis=0, unsafe=False: -1, + torch.select: lambda input, dim, index: -1, + torch.select_scatter: lambda input, src, dim, index: -1, + torch.slice_inverse: lambda input, src, dim=0, start=None, end=None, step=1: -1, + torch.slice_scatter: lambda input, src, dim=0, start=None, end=None, step=1: -1, + torch.selu: lambda input, inplace=False: -1, + torch.sigmoid: lambda input, out=None: -1, + torch.sign: lambda input, out=None: -1, + torch.signbit: lambda input, out=None: -1, + torch.sgn: lambda input, out=None: -1, + torch.sin: lambda input, out=None: -1, + torch.sinc: lambda input, out=None: -1, + torch.sinh: lambda input, out=None: -1, + torch.slogdet: lambda input: -1, + torch.linalg.slogdet: lambda input: -1, + torch.smm: lambda input, mat2: -1, + torch.spmm: lambda input, mat2: -1, + torch.softmax: lambda input, dim, dtype=None: -1, + torch.linalg.solve: lambda A, B, left=True, out=None: -1, + torch.linalg.solve_ex: lambda A, B, left=True, check_errors=False, out=None: -1, + torch.sort: lambda input, dim=-1, descending=False, *, stable=False, out=None: -1, + torch.split: lambda tensor, split_size_or_sections, dim=0: -1, + torch.split_with_sizes: lambda tensor, split_size_or_sections, dim=0: -1, + torch.sqrt: lambda input, out=None: -1, + torch.square: lambda input, out=None: -1, + torch.squeeze: lambda input, dim=None, out=None: -1, + torch.sspaddmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1, + torch.stack: lambda tensors, dim=0, out=None: -1, + torch.std: lambda input, dim=None: -1, + torch.std_mean: lambda input, dim=None: -1, + torch.stft: (lambda input, n_fft, hop_length=None, win_length=None, window=None, center=True, + pad_mode='reflect', normalized=False, onesided=True, return_complex=None: -1), + torch.sub: lambda input, other, out=None: -1, + torch.subtract: lambda input, other, out=None: -1, + torch.sum: lambda input, dim=None: -1, + torch.sym_float: lambda input: -1, + torch.sym_int: lambda input: -1, + torch.sym_max: lambda a, b: -1, + torch.sym_min: lambda a, b: -1, + torch.sym_not: lambda input: -1, + torch.sym_ite: lambda a, b, c: -1, + torch._sym_sqrt: lambda input: -1, + torch._sym_cos: lambda input: -1, + torch._sym_cosh: lambda input: -1, + torch._sym_sin: lambda input: -1, + torch._sym_sinh: lambda input: -1, + torch._sym_tan: lambda input: -1, + torch._sym_tanh: lambda input: -1, + torch._sym_asin: lambda input: -1, + torch._sym_acos: lambda input: -1, + torch._sym_atan: lambda input: -1, + torch.nansum: lambda input, dim=None: -1, + torch.svd: lambda input, some=True, compute_uv=True, out=None: -1, + torch.svd_lowrank: lambda input, q=6, niter=2, M=None: -1, + torch.linalg.svd: lambda input, full_matrices=True, out=None: -1, + torch.linalg.svdvals: lambda input, out=None: -1, + torch.swapaxes: lambda input, dim0, dim1: -1, + torch.swapdims: lambda input, axis0, axis1: -1, + torch.special.airy_ai: lambda input: -1, + torch.special.bessel_j0: lambda input: -1, + torch.special.bessel_j1: lambda input: -1, + torch.special.bessel_y0: lambda input: -1, + torch.special.bessel_y1: lambda input: -1, + torch.special.chebyshev_polynomial_t: lambda input, n, out=None: -1, + torch.special.chebyshev_polynomial_u: lambda input, n, out=None: -1, + torch.special.chebyshev_polynomial_v: lambda input, n, out=None: -1, + torch.special.chebyshev_polynomial_w: lambda input, n, out=None: -1, + torch.special.digamma: lambda input: -1, + torch.special.entr: lambda input: -1, + torch.special.erf: lambda input: -1, + torch.special.erfc: lambda input: -1, + torch.special.erfcx: lambda input: -1, + torch.special.erfinv: lambda input: -1, + torch.special.exp2: lambda input: -1, + torch.special.expit: lambda input: -1, + torch.special.expm1: lambda input: -1, + torch.special.gammainc: lambda input, other, out=None: -1, + torch.special.gammaincc: lambda input, other, out=None: -1, + torch.special.gammaln: lambda input: -1, + torch.special.hermite_polynomial_h: lambda input, n, out=None: -1, + torch.special.hermite_polynomial_he: lambda input, n, out=None: -1, + torch.special.i0: lambda input: -1, + torch.special.i0e: lambda input: -1, + torch.special.i1: lambda input: -1, + torch.special.i1e: lambda input: -1, + torch.special.laguerre_polynomial_l: lambda input, n, out=None: -1, + torch.special.legendre_polynomial_p: lambda input, n, out=None: -1, + torch.special.log1p: lambda input: -1, + torch.special.log_ndtr: lambda input: -1, + torch.special.log_softmax: lambda input, dim, dtype=None: -1, + torch.special.logit: lambda input: -1, + torch.special.logsumexp: lambda input, dim, keepdim=False, out=None: -1, + torch.special.modified_bessel_i0: lambda input: -1, + torch.special.modified_bessel_i1: lambda input: -1, + torch.special.modified_bessel_k0: lambda input: -1, + torch.special.modified_bessel_k1: lambda input: -1, + torch.special.multigammaln: lambda input, p: -1, + torch.special.ndtr: lambda input: -1, + torch.special.ndtri: lambda input: -1, + torch.special.polygamma: lambda input, n, out=None: -1, + torch.special.psi: lambda input: -1, + torch.special.round: lambda input: -1, + torch.special.scaled_modified_bessel_k0: lambda input: -1, + torch.special.scaled_modified_bessel_k1: lambda input: -1, + torch.special.shifted_chebyshev_polynomial_t: lambda input, n, out=None: -1, + torch.special.shifted_chebyshev_polynomial_u: lambda input, n, out=None: -1, + torch.special.shifted_chebyshev_polynomial_v: lambda input, n, out=None: -1, + torch.special.shifted_chebyshev_polynomial_w: lambda input, n, out=None: -1, + torch.special.sinc: lambda input: -1, + torch.special.softmax: lambda input, dim, dtype=None: -1, + torch.special.spherical_bessel_j0: lambda input: -1, + torch.special.xlog1py: lambda input, other, out=None: -1, + torch.special.xlogy: lambda input, other, out=None: -1, + torch.special.zeta: lambda self, other, out=None: -1, + torch.t: lambda input: -1, + torch.take: lambda input, index: -1, + torch.take_along_dim: lambda input, indices, dim=None, out=None: -1, + torch.tan: lambda input, out=None: -1, + torch.tanh: lambda input, out=None: -1, + torch.linalg.tensorinv: lambda a, ind=2: -1, + torch.linalg.tensorsolve: lambda a, b, dims=None: -1, + torch.tensordot: lambda a, b, dims=2, out=None: -1, + torch.tensor_split: lambda input, indices_or_sections, dim=0: -1, + torch.threshold: lambda input, threshold, value, inplace=False: -1, + torch.tile: lambda input, dims: -1, + torch.topk: lambda input, k, dim=-1, descending=False, out=None: -1, + torch.trace: lambda input: -1, + torch.transpose: lambda input, dim0, dim1: -1, + torch.trapz: lambda y, x=None, dim=-1: -1, + torch.trapezoid: lambda y, x=None, dim=-1: -1, + torch.triangular_solve: lambda input, A, upper=True, transpose=False, unitriangular=False: -1, + torch.linalg.solve_triangular: lambda input, B, upper, left=True, unitriangular=False: -1, + torch.tril: lambda input, diagonal=0, out=None: -1, + torch.triplet_margin_loss: (lambda anchor, positive, negative, margin=1.0, p=2, eps=1e-06, swap=False, + + size_average=None, reduce=None, reduction='mean': -1), + torch.triu: lambda input, diagonal=0, out=None: -1, + torch.true_divide: lambda input, other: -1, + torch.trunc: lambda input, out=None: -1, + torch.unbind: lambda input, dim=0: -1, + torch.unflatten: lambda input, dim, sizes, names: -1, + torch.unique: lambda input, sorted=True, return_inverse=False, return_counts=False, dim=None: -1, + torch.unique_consecutive: lambda input, return_inverse=False, return_counts=False, dim=None: -1, + torch.unravel_index: lambda indices, shape: -1, + torch.unsafe_chunk: lambda input, chunks, dim=0: -1, + torch.unsafe_split: lambda tensor, split_size_or_sections, dim=0: -1, + torch.unsafe_split_with_sizes: lambda tensor, split_size_or_sections, dim=0: -1, + torch.unsqueeze: lambda input, dim, out=None: -1, + torch.linalg.vander: lambda x, N=None: -1, + torch.var: lambda input, dim=None: -1, + torch.var_mean: lambda input, dim=None: -1, + torch.vsplit: lambda input, indices_or_sections: -1, + torch.vstack: lambda tensors, out=None: -1, + torch.where: lambda condition, x=None, y=None: -1, + torch.zeros_like: lambda input, dtype=None, layout=None, device=None, requires_grad=False: -1, + torch._fw_primal_copy: lambda self, level: -1, + torch._make_dual_copy: lambda primal, tangent, level: -1, + torch.view_as_real_copy: lambda self: -1, + torch.view_as_complex_copy: lambda self: -1, + torch._conj_copy: lambda self: -1, + torch._neg_view_copy: lambda self: -1, + torch.as_strided_copy: lambda self, size, stride, storage_offset=None: -1, + torch._sparse_broadcast_to_copy: lambda self, size: -1, + torch.diagonal_copy: lambda self, offset=0, dim1=0, dim2=1: -1, + torch.expand_copy: lambda self, size, *, implicit=False: -1, + torch.narrow_copy: lambda self, dim, start, length: -1, + torch.permute_copy: lambda self, dims: -1, + torch._reshape_alias_copy: lambda self, size, stride: -1, + torch.select_copy: lambda self, dim, index: -1, + torch.detach_copy: lambda self: -1, + torch.slice_copy: lambda self, dim=0, start=None, end=None, step=1: -1, + torch.split_copy: lambda self, split_size, dim=0: -1, + torch.split_with_sizes_copy: lambda self, split_sizes, dim=0: -1, + torch.squeeze_copy: lambda self, dim: -1, + torch.t_copy: lambda self: -1, + torch.transpose_copy: lambda self, dim0, dim1: -1, + torch.unsqueeze_copy: lambda self, dim: -1, + torch._indices_copy: lambda self: -1, + torch._values_copy: lambda self: -1, + torch.indices_copy: lambda self: -1, + torch.values_copy: lambda self: -1, + torch.crow_indices_copy: lambda self: -1, + torch.col_indices_copy: lambda self: -1, + torch.ccol_indices_copy: lambda self: -1, + torch.row_indices_copy: lambda self: -1, + torch.unbind_copy: lambda self, dim=0: -1, + torch.view_copy: lambda self, dtype: -1, + torch.unfold_copy: lambda self, dimension, size, step: -1, + torch.alias_copy: lambda self: -1, + Tensor.__floordiv__: lambda self, other: -1, + Tensor.__rfloordiv__: lambda self, other: -1, + Tensor.__ifloordiv__: lambda self, other: -1, + Tensor.__truediv__: lambda self, other: -1, + Tensor.__rtruediv__: lambda self, other: -1, + Tensor.__itruediv__: lambda self, other: -1, + Tensor.__lshift__: lambda self, other: -1, + Tensor.__rlshift__: lambda self, other: -1, + Tensor.__ilshift__: lambda self, other: -1, + Tensor.__rshift__: lambda self, other: -1, + Tensor.__rrshift__: lambda self, other: -1, + Tensor.__irshift__: lambda self, other: -1, + Tensor.__and__: lambda self, other: -1, + Tensor.__or__: lambda self, other: -1, + Tensor.__xor__: lambda self, other: -1, + Tensor.__float__: lambda self: -1, + Tensor.__complex__: lambda self: -1, + Tensor.__array__: lambda self, dtype: -1, + Tensor.__bool__: lambda self: -1, + Tensor.__contains__: lambda self, other: -1, + Tensor.__neg__: lambda self: -1, + Tensor.__invert__: lambda self: -1, + Tensor.__mod__: lambda self, other: -1, + Tensor.__rmod__: lambda self, other: -1, + Tensor.__imod__: lambda self, other: -1, + Tensor.__array_wrap__: lambda self, array: -1, + Tensor.__getitem__: lambda self, idx: -1, + Tensor.__deepcopy__: lambda self, memo: -1, + Tensor.__int__: lambda self: -1, + Tensor.__long__: lambda self: -1, + Tensor.__index__: lambda self: -1, + Tensor.__len__: lambda self: -1, + Tensor.__format__: lambda self, format_spec: -1, + Tensor.__reduce_ex__: lambda self, proto: -1, + Tensor.__reversed__: lambda self: -1, + Tensor.__repr__: lambda self, *, tensor_contents=None: -1, + Tensor.__setitem__: lambda self, k, v: -1, + Tensor.__setstate__: lambda self, d: -1, + Tensor.T.__get__: lambda self: -1, + Tensor.H.__get__: lambda self: -1, + Tensor.mT.__get__: lambda self: -1, + Tensor.mH.__get__: lambda self: -1, + Tensor._backward_hooks.__get__: lambda self: -1, + Tensor._post_accumulate_grad_hooks.__get__: lambda self: -1, + Tensor._base.__get__: lambda self: -1, + Tensor._cdata.__get__: lambda self: -1, + Tensor.grad.__get__: lambda self: -1, + Tensor._grad.__get__: lambda self: -1, + Tensor._grad_fn.__get__: lambda self: -1, + Tensor.grad_fn.__get__: lambda self: -1, + Tensor._version.__get__: lambda self: -1, + Tensor._autocast_to_reduced_precision: lambda self, cuda_enabled, cpu_enabled, cuda_dtype, cpu_dtype: -1, + Tensor._autocast_to_full_precision: lambda self, cuda_enabled, cpu_enabled: -1, + Tensor.data.__get__: lambda self: -1, + Tensor.device.__get__: lambda self: -1, + Tensor.dtype.__get__: lambda self: -1, + Tensor.is_cuda.__get__: lambda self: -1, + Tensor.is_cpu.__get__: lambda self: -1, + Tensor.is_xla.__get__: lambda self: -1, + Tensor.is_xpu.__get__: lambda self: -1, + Tensor.is_ipu.__get__: lambda self: -1, + Tensor.is_leaf.__get__: lambda self: -1, + Tensor.retains_grad.__get__: lambda self: -1, + Tensor.is_meta.__get__: lambda self: -1, + Tensor.is_mps.__get__: lambda self: -1, + Tensor.is_mtia.__get__: lambda self: -1, + Tensor.is_nested.__get__: lambda self: -1, + Tensor.is_ort.__get__: lambda self: -1, + Tensor.is_mkldnn.__get__: lambda self: -1, + Tensor.is_quantized.__get__: lambda self: -1, + Tensor.is_sparse.__get__: lambda self: -1, + Tensor.is_sparse_csr.__get__: lambda self: -1, + Tensor.is_vulkan.__get__: lambda self: -1, + Tensor.itemsize.__get__: lambda self: -1, + Tensor.layout.__get__: lambda self: -1, + Tensor.name.__get__: lambda self: -1, + Tensor.names.__get__: lambda self: -1, + Tensor.nbytes.__get__: lambda self: -1, + Tensor.ndim.__get__: lambda self: -1, + Tensor.output_nr.__get__: lambda self: -1, + Tensor.requires_grad.__get__: lambda self: -1, + Tensor.shape.__get__: lambda self: -1, + Tensor.volatile.__get__: lambda self: -1, + Tensor.real.__get__: lambda self: -1, + Tensor.imag.__get__: lambda self: -1, + Tensor.__cuda_array_interface__.__get__: lambda self: -1, + Tensor.type: lambda self, dtype=None, non_blocking=False, **kwargs: -1, + Tensor._dimI: lambda self: -1, + Tensor._dimV: lambda self: -1, + Tensor._indices: lambda self: -1, + Tensor._is_view: lambda self: -1, + Tensor._nnz: lambda self: -1, + Tensor.crow_indices: lambda self: -1, + Tensor.col_indices: lambda self: -1, + Tensor.ccol_indices: lambda self: -1, + Tensor.row_indices: lambda self: -1, + Tensor._update_names: lambda self, names, inplace: -1, + Tensor._values: lambda self: -1, + Tensor.adjoint: lambda self: -1, + Tensor.align_as: lambda self, other: -1, + Tensor.align_to: lambda self, order, ellipsis_idx: -1, + Tensor.apply_: lambda self, callable: -1, + Tensor.as_strided: lambda self, size, stride: -1, + Tensor.as_strided_: lambda self, size, stride: -1, + Tensor.backward: lambda self, gradient=None, retain_graph=None, create_graph=False, inputs=None: -1, + Tensor.bfloat16: lambda self, memory_format=torch.preserve_format: -1, + Tensor.bool: lambda self, memory_format=torch.preserve_format: -1, + Tensor.byte: lambda self, memory_format=torch.preserve_format: -1, + Tensor.char: lambda self, memory_format=torch.preserve_format: -1, + Tensor.cauchy_: lambda self, median=0, sigma=1, *, generator=None: -1, + Tensor.coalesce: lambda self: -1, + Tensor._coalesced_: lambda self, coalesced: -1, + Tensor.contiguous: lambda self, memory_format=torch.contiguous_format: -1, + Tensor.copy_: lambda self, src, non_blocking=False: -1, + Tensor.cpu: lambda self, memory_format=torch.preserve_format: -1, + Tensor.cuda: lambda self, memory_format=torch.preserve_format: -1, + Tensor.xpu: lambda self, memory_format=torch.preserve_format: -1, + Tensor.ipu: lambda self, memory_format=torch.preserve_format: -1, + Tensor.data_ptr: lambda self: -1, + Tensor.dense_dim: lambda self: -1, + Tensor.diagonal_scatter: lambda self, src, offset=0, dim1=0, dim2=1: -1, + Tensor.dim: lambda self: -1, + Tensor.dim_order: lambda self: -1, + Tensor.double: lambda self, memory_format=torch.preserve_format: -1, + Tensor.cdouble: lambda self, memory_format=torch.preserve_format: -1, + Tensor.element_size: lambda self: -1, + Tensor.expand: lambda self, size: -1, + Tensor.expand_as: lambda self, other: -1, + Tensor.exponential_: lambda self, lambd=1, *, generator=None: -1, + Tensor.fill_: lambda self, value: -1, + Tensor.fill_diagonal_: lambda self, value: -1, + Tensor.float: lambda self, memory_format=torch.preserve_format: -1, + Tensor.cfloat: lambda self, memory_format=torch.preserve_format: -1, + Tensor.geometric_: lambda self, p, *, generator=None: -1, + Tensor.get_device: lambda self: -1, + Tensor.half: lambda self, memory_format=torch.preserve_format: -1, + Tensor.chalf: lambda self, memory_format=torch.preserve_format: -1, + Tensor.has_names: lambda self: -1, + Tensor.indices: lambda self: -1, + Tensor.int: lambda self, memory_format=torch.preserve_format: -1, + Tensor.is_coalesced: lambda self: -1, + Tensor.is_contiguous: lambda self: -1, + Tensor.is_inference: lambda self: -1, + Tensor.is_pinned: lambda self: -1, + Tensor.is_set_to: lambda self, tensor: -1, + Tensor.is_shared: lambda self: -1, + Tensor.item: lambda self: -1, + Tensor.log_normal_: lambda self, mean=1, std=2, *, generator=None: -1, + Tensor.log_softmax: lambda self, dim: -1, + Tensor.long: lambda self, memory_format=torch.preserve_format: -1, + Tensor.map_: lambda self, tensor, callable: -1, + Tensor.map2_: lambda self, x, y, callable: -1, + Tensor.mm: lambda self, mat2: -1, + Tensor.module_load: lambda self, other, assign=False: -1, + Tensor.narrow_copy: lambda self, dimension, start, length: -1, + Tensor.ndimension: lambda self: -1, + Tensor.nelement: lambda self: -1, + Tensor._nested_tensor_size: lambda self: -1, + Tensor._nested_tensor_storage_offsets: lambda self: -1, + Tensor._nested_tensor_strides: lambda self: -1, + Tensor.normal_: lambda self: -1, + Tensor.numpy: lambda self: -1, + Tensor.permute: lambda self, dim: -1, + Tensor.pin_memory: lambda self: -1, + Tensor.put_: lambda self, indices, tensor, accumulate=False: -1, + Tensor.qscheme: lambda self: -1, + Tensor.random_: lambda self, from_=0, to=None, *, generator=None: -1, + Tensor.record_stream: lambda self, stream: -1, + Tensor.refine_names: lambda self, names: -1, + Tensor.register_hook: lambda self, hook: -1, + Tensor.register_post_accumulate_grad_hook: lambda self, hook: -1, + Tensor.rename: lambda self, name: -1, + Tensor.repeat: lambda self, *size: -1, + Tensor.requires_grad_: lambda self, requires_grad=True: -1, + Tensor.reshape_as: lambda self, other: -1, + Tensor.resize: lambda self, *size: -1, + Tensor.resize_: lambda self, size: -1, + Tensor.resize_as: lambda self, other: -1, + Tensor.resize_as_sparse_: lambda self, other: -1, + Tensor.retain_grad: lambda self: -1, + Tensor.set_: lambda self, source=None, storage_offset=0, size=None, stride=None: -1, + Tensor.select_scatter: lambda self, src, dim, index: -1, + Tensor.share_memory_: lambda self: -1, + Tensor.short: lambda self, memory_format=torch.preserve_format: -1, + Tensor.size: lambda self: -1, + Tensor.slice_scatter: lambda self, src, dim=0, start=None, end=None, step=1: -1, + Tensor.sparse_dim: lambda self: -1, + Tensor.sparse_mask: lambda self, mask: -1, + Tensor._sparse_mask_projection: lambda self, mask, accumulate_matches=False: -1, + Tensor.sparse_resize_: lambda self, size1, size2, dense_dim: -1, + Tensor.sparse_resize_and_clear_: lambda self, size1, size2, dense_dim: -1, + Tensor.sspaddmm: lambda self, mat1, mat2, beta=1, alpha=1, out=None: -1, + Tensor.storage: lambda self: -1, + Tensor.untyped_storage: lambda self: -1, + Tensor.storage_offset: lambda self: -1, + Tensor.storage_type: lambda self: -1, + Tensor.sum_to_size: lambda self, size: -1, + Tensor.tile: lambda self, *reps: -1, + Tensor.to: lambda self, dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format: -1, + Tensor.to_dense: lambda self, dtype=None, *, masked_grad=None: -1, + Tensor._to_dense: lambda self, dtype=None, masked_grad=None: -1, + Tensor.to_sparse: lambda self: -1, + Tensor.tolist: lambda self: -1, + Tensor.to_mkldnn: lambda self: -1, + Tensor.type_as: lambda self, other: -1, + Tensor.unfold: lambda self, dimension, size, step: -1, + Tensor.uniform_: lambda self, from_=0, to=1: -1, + Tensor.values: lambda self: -1, + Tensor.view: lambda self, shape: -1, + Tensor.view_as: lambda self, other: -1, + Tensor.zero_: lambda self: -1, + Tensor.__dlpack__: lambda self, stream=None: -1, + Tensor.__dlpack_device__: lambda self: -1, + torch.linalg.lstsq: lambda self, b, cond=None, driver=None: -1, + } + + ret2 = {} + ignored = get_ignored_functions() + + for k, v in ret.items(): + # Generate methods like __add__ and add_ by default from add + names = [ + k.__name__, # Default method + k.__name__ + "_", # Inplace variant + "__" + k.__name__ + "__", # Dunder method + "__i" + k.__name__ + "__", # Inplace dunder method + "__r" + k.__name__ + "__", # Reverse dunder method + ] + + if k.__name__.startswith("bitwise_"): + # bitwise_ have dunder methods of the form ____ + # And so on. + subname = k.__name__[len("bitwise_"):] + names.extend([ + "__" + subname + "__", + "__i" + subname + "__", + "__r" + subname + "__" + ]) + + for name in names: + func = getattr(Tensor, name, None) + if callable(func) and func not in ret and func not in ignored: + ret2[func] = v + + ret.update(ret2) + return ret + +def wrap_torch_function(dispatcher: Callable): + """Wraps a given function with ``__torch_function__`` -related functionality. + + Parameters + ---------- + dispatcher: Callable + A callable that returns an iterable of Tensor-likes passed into the function. + + Note + ---- + This decorator may reduce the performance of your code. Generally, it's enough to express + your code as a series of functions that, themselves, support __torch_function__. If you + find yourself in the rare situation where this is not the case, e.g. if you're wrapping a + low-level library and you also need it to work for Tensor-likes, then this function is available. + + Examples + -------- + >>> def dispatcher(a): # Must have the same signature as func + ... return (a,) + >>> @torch.overrides.wrap_torch_function(dispatcher) + >>> def func(a): # This will make func dispatchable by __torch_function__ + ... return a + 0 + """ + def inner(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + relevant_args = dispatcher(*args, **kwargs) + if has_torch_function(relevant_args): + return handle_torch_function(wrapped, relevant_args, *args, **kwargs) + + return func(*args, **kwargs) + + return wrapped + + return inner + +def _get_overloaded_args(relevant_args: Iterable[Any], get_type_fn: Callable[[Any], Type] = None) -> List[Any]: + """Returns a list of arguments on which to call __torch_function__. + + Checks arguments in relevant_args for __torch_function__ implementations, + storing references to the arguments and their types in overloaded_args and + overloaded_types in order of calling precedence. Only distinct types are + considered. If a type is a subclass of another type it will have higher + precedence, otherwise the precedence order is the same as the order of + arguments in relevant_args, that is, from left-to-right in the argument list. + + The precedence-determining algorithm implemented in this function is + described in `NEP-0018`_. + + See torch::append_overloaded_arg for the equivalent function in the C++ + implementation. + + Parameters + ---------- + relevant_args : iterable of array-like + Iterable of array-like arguments to check for __torch_function__ + methods. + + get_type_fn : callable, optional + Function to call on each argument in relevant_args to get its type. + + Returns + ------- + overloaded_args : list + Arguments from relevant_args on which to call __torch_function__ + methods, in the order in which they should be called. + + .. _NEP-0018: + https://numpy.org/neps/nep-0018-array-function-protocol.html + """ + if get_type_fn is None: + get_type_fn = type + + # If torch function is not enabled, there are no overloaded types + if not torch._C._is_torch_function_enabled(): + return [] + # Runtime is O(num_arguments * num_unique_types) + overloaded_types: Set[Type] = set() + overloaded_args: List[Any] = [] + for arg in relevant_args: + arg_type = get_type_fn(arg) + # We only collect arguments if they have a unique type, which ensures + # reasonable performance even with a long list of possibly overloaded + # arguments. + # + # NB: Important to exclude _disabled_torch_function_impl, otherwise + # https://github.com/pytorch/pytorch/issues/64687 + if (arg_type not in overloaded_types and hasattr(arg_type, '__torch_function__') and + arg_type.__torch_function__ != torch._C._disabled_torch_function_impl): + # Create lists explicitly for the first type (usually the only one + # done) to avoid setting up the iterator for overloaded_args. + if overloaded_types: + overloaded_types.add(arg_type) + # By default, insert argument at the end, but if it is + # subclass of another argument, insert it before that argument. + # This ensures "subclasses before superclasses". + index = len(overloaded_args) + for i, old_arg in enumerate(overloaded_args): + if issubclass(arg_type, get_type_fn(old_arg)): + index = i + break + overloaded_args.insert(index, arg) + else: + overloaded_types = {arg_type} + overloaded_args = [arg] + return overloaded_args + + +def handle_torch_function( + public_api: Callable, relevant_args: Iterable[Any], *args, **kwargs) -> Any: + """Implement a function with checks for ``__torch_function__`` overrides. + + See torch::autograd::handle_torch_function for the equivalent of this + function in the C++ implementation. + + Arguments + --------- + public_api : function + Function exposed by the public torch API originally called like + ``public_api(*args, **kwargs)`` on which arguments are now being + checked. + relevant_args : iterable + Iterable of arguments to check for __torch_function__ methods. + args : tuple + Arbitrary positional arguments originally passed into ``public_api``. + kwargs : tuple + Arbitrary keyword arguments originally passed into ``public_api``. + + Returns + ------- + object + Result from calling ``implementation`` or an ``__torch_function__`` + method, as appropriate. + + Raises + ------ + TypeError : if no implementation is found. + + Example + ------- + >>> def func(a): + ... if has_torch_function_unary(a): + ... return handle_torch_function(func, (a,), a) + ... return a + 0 + """ + # Check for __torch_function__ methods. + overloaded_args = _get_overloaded_args(relevant_args) + # overloaded_args already have unique types. + types = tuple(map(type, overloaded_args)) + + # Check for __torch_function__ mode. + if _is_torch_function_mode_enabled(): + # if we're here, the mode must be set to a TorchFunctionStackMode + # this unsets it and calls directly into TorchFunctionStackMode's torch function + with _pop_mode_temporarily() as mode: + result = mode.__torch_function__(public_api, types, args, kwargs) + if result is not NotImplemented: + return result + + # Call overrides + for overloaded_arg in overloaded_args: + # This call needs to become a classmethod call in the future. + # See https://github.com/pytorch/pytorch/issues/63767 + torch_func_method = overloaded_arg.__torch_function__ + if hasattr(torch_func_method, "__self__") and torch_func_method.__self__ is overloaded_arg and \ + torch_func_method is not torch._C._disabled_torch_function_impl: + warnings.warn("Defining your `__torch_function__ as a plain method is deprecated and " + "will be an error in future, please define it as a classmethod.", + DeprecationWarning) + + # Use `public_api` instead of `implementation` so __torch_function__ + # implementations can do equality/identity comparisons. + result = torch_func_method(public_api, types, args, kwargs) + + if result is not NotImplemented: + return result + + func_name = f'{public_api.__module__}.{public_api.__name__}' + msg = ( + f"no implementation found for '{func_name}' on types that implement " + f'__torch_function__: {[type(arg) for arg in overloaded_args]}' + ) + if _is_torch_function_mode_enabled(): + msg += f" nor in mode {_get_current_function_mode()}" + raise TypeError(msg) + +has_torch_function = _add_docstr( + _has_torch_function, + r"""Check for __torch_function__ implementations in the elements of an iterable + or if a __torch_function__ mode is enabled. Considers exact ``Tensor`` s + and ``Parameter`` s non-dispatchable. Use this to guard a call to + :func:`handle_torch_function`; don't use it to test if something + is Tensor-like, use :func:`is_tensor_like` instead. + Arguments + --------- + relevant_args : iterable + Iterable or arguments to check for __torch_function__ methods. + Returns + ------- + bool + True if any of the elements of relevant_args have __torch_function__ + implementations, False otherwise. + See Also + ________ + torch.is_tensor_like + Checks if something is a Tensor-like, including an exact ``Tensor``. + """ +) + +has_torch_function_unary = _add_docstr( + _has_torch_function_unary, + r"""Special case of `has_torch_function` for single inputs. + Instead of: + `has_torch_function((t,))` + call: + `has_torch_function_unary(t)` + which skips unnecessary packing and unpacking work. + """ +) + +has_torch_function_variadic = _add_docstr( + _has_torch_function_variadic, + r"""Special case of `has_torch_function` that skips tuple creation. + + This uses the METH_FASTCALL protocol introduced in Python 3.7 + + Instead of: + `has_torch_function((a, b))` + call: + `has_torch_function_variadic(a, b)` + which skips unnecessary packing and unpacking work. + """ +) + +@functools.lru_cache(None) +def _get_overridable_functions() -> Tuple[Dict[Any, List[Callable]], Dict[Callable, str]]: + overridable_funcs = collections.defaultdict(list) + index = {} + tested_namespaces = [ + ("torch", torch, torch.__all__), + ("torch.functional", torch.functional, torch.functional.__all__), + ("torch.nn.functional", torch.nn.functional, dir(torch.nn.functional)), + ("torch.nn.init", torch.nn.init, dir(torch.nn.init)), + ("torch.Tensor", torch.Tensor, dir(torch.Tensor)), + ("torch.linalg", torch.linalg, dir(torch.linalg)), + ("torch.fft", torch.fft, dir(torch.fft)), + ("torch.special", torch.special, dir(torch.special)), + ] + for namespace_str, namespace, ns_funcs in tested_namespaces: + for func_name in ns_funcs: + ignore = False + # ignore private functions or functions that are deleted in torch.__init__ + if namespace is not torch.Tensor: + if func_name.startswith('__'): + continue + elif func_name.startswith('_'): + ignore = True + elif func_name.endswith('_'): + ignore = True + elif not func_name[0].islower(): + ignore = True + elif func_name == 'unique_dim': + continue + else: + func = getattr(namespace, func_name) + if getattr(object, func_name, None) == func: + continue + if func_name == '__weakref__': + continue + func = getattr(namespace, func_name) + if namespace is torch.Tensor and getattr(object, func_name, None) == func: + continue + # ignore re-exported modules + if isinstance(func, types.ModuleType): + continue + # ignore __future__ imports + if isinstance(func, __future__._Feature): + continue + + if not callable(func) and hasattr(func, "__get__"): + index[func.__get__] = f"{namespace_str}.{func_name}.__get__" + index[func.__set__] = f"{namespace_str}.{func_name}.__set__" + if ignore: + continue + if func.__get__ in get_ignored_functions(): + msg = ("{}.{} is in the tuple returned by torch._overrides.get_ignored_functions " + "but still has an explicit override") + assert func.__get__ not in get_testing_overrides(), msg.format(namespace, func.__name__) + continue + else: + overridable_funcs[func].append(func.__get__) + continue + + if not callable(func): + continue + + index[func] = f"{namespace_str}.{func_name}" + + if ignore: + continue + + # cannot be overriden by __torch_function__ + if func in get_ignored_functions(): + msg = ("{}.{} is in the tuple returned by torch._overrides.get_ignored_functions " + "but still has an explicit override") + assert func not in get_testing_overrides(), msg.format(namespace, func.__name__) + continue + overridable_funcs[namespace].append(func) + return overridable_funcs, index + +@_disable_user_warnings +def get_overridable_functions() -> Dict[Any, List[Callable]]: + """List functions that are overridable via __torch_function__ + + Returns + ------- + Dict[Any, List[Callable]] + A dictionary that maps namespaces that contain overridable functions + to functions in that namespace that can be overridden. + """ + return _get_overridable_functions()[0] + +@_disable_user_warnings +def resolve_name(f): + """Get a human readable string name for a function passed to + __torch_function__ + + Arguments + --------- + f : Callable + Function to resolve the name of. + + Returns + ------- + str + Name of the function; if eval'ed it should give back the input + function. + """ + if isinstance(f, (torch._ops.OpOverload, torch._ops.OpOverloadPacket)): + return str(f) + return _get_overridable_functions()[1].get(f) + +@functools.lru_cache(None) +def _get_tensor_methods() -> Set[Callable]: + """ Returns a set of the overridable methods on ``torch.Tensor`` """ + overridable_funcs = get_overridable_functions() + methods = set(overridable_funcs[torch.Tensor]) + return methods + +@_disable_user_warnings +def is_tensor_method_or_property(func: Callable) -> bool: + """ + Returns True if the function passed in is a handler for a + method or property belonging to ``torch.Tensor``, as passed + into ``__torch_function__``. + + .. note:: + For properties, their ``__get__`` method must be passed in. + + This may be needed, in particular, for the following reasons: + + 1. Methods/properties sometimes don't contain a `__module__` slot. + 2. They require that the first passed-in argument is an instance + of ``torch.Tensor``. + + Examples + -------- + >>> is_tensor_method_or_property(torch.Tensor.add) + True + >>> is_tensor_method_or_property(torch.add) + False + """ + return func in _get_tensor_methods() or func.__name__ == "__get__" + +def is_tensor_like(inp): + """ + Returns ``True`` if the passed-in input is a Tensor-like. + + Currently, this occurs whenever there's a ``__torch_function__`` + attribute on the type of the input. + + Examples + -------- + A subclass of tensor is generally a Tensor-like. + + >>> class SubTensor(torch.Tensor): ... + >>> is_tensor_like(SubTensor([0])) + True + + Built-in or user types aren't usually Tensor-like. + + >>> is_tensor_like(6) + False + >>> is_tensor_like(None) + False + >>> class NotATensor: ... + >>> is_tensor_like(NotATensor()) + False + + But, they can be made Tensor-like by implementing __torch_function__. + + >>> class TensorLike: + ... @classmethod + ... def __torch_function__(cls, func, types, args, kwargs): + ... return -1 + >>> is_tensor_like(TensorLike()) + True + """ + return type(inp) is torch.Tensor or hasattr(inp, "__torch_function__") + +class TorchFunctionMode: + """ + A ``TorchFunctionMode`` allows you to override the meaning of all + ``__torch_function__`` overrideable functions within a dynamic scope, + without having to actually create a tensor subclass or manually + monkey-patch functions in the PyTorch API. Some common situations + where you should use a mode: + + * You want to override the meaning of factory functions, or other + functions that do not otherwise take a tensor as an argument + (these cannot be overridden with tensor subclasses). + + * You want to override the behavior of all functions without needing + to wrap your inputs in tensor subclasses; e.g., if you are just + interested in logging intermediate computations. + + * You want to control the order of execution of various tensor + subclasses explicitly, rather than implicitly via the return of + ``NotImplemented``. + + Independent subclasses of :class:`TorchFunctionMode` are compositional: + modes can be pushed onto a stack using ``with MyMode():``. + When you call functions in the PyTorch API inside your + ``__torch_function__`` implementation, by default, they will forward on to + the next mode on the mode stack. If you want recursively call back into + your current ``__torch_function__`` implementation, either explicitly + invoke ``self.__torch_function__(...)``, or use the context manager + ``enable_torch_function_mode(self, replace=self.inner)`` to make PyTorch + API self-referential (beware of infinite loops, in this case!) + """ + inner: "TorchFunctionMode" + + # Force metaclass to generate constructor at the base of the hierarchy + def __init__(self): + pass + + def __torch_function__(self, func, types, args=(), kwargs=None): + raise NotImplementedError() + + def __enter__(self): + _push_mode(self) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + _pop_mode() + + @classmethod + def push(cls, *args, **kwargs): + warnings.warn("`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`") + instance = cls(*args, **kwargs) + return instance + + +def _get_current_function_mode(): + stack_len = _len_torch_function_stack() + return _get_function_stack_at(stack_len - 1) if stack_len > 0 else None + + +def _get_current_function_mode_stack(): + stack_len = _len_torch_function_stack() + return [_get_function_stack_at(i) for i in range(stack_len)] + +def _push_mode(mode): + _push_on_torch_function_stack(mode) + + +def _pop_mode(): + old = _pop_torch_function_stack() + return old + + +@contextlib.contextmanager +def _pop_mode_temporarily(): + old = _pop_mode() + try: + yield old + finally: + _push_mode(old) + +class BaseTorchFunctionMode(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return func(*args, **kwargs) + + +@contextlib.contextmanager +def enable_reentrant_dispatch(): + # NB: this can't simply be + # `enable_reentrant_dispatch = torch._C._RestorePythonTLSSnapshot` + # because: + # 1. torch._C._RestorePythonTLSSnapshot is unavailable when this file + # initially gets imported. Probably an import order thing. + # 2. enable_reentrant_dispatch is technically public API; assigning + # it the object would change the __module__ to look private. + with torch._C._RestorePythonTLSSnapshot(): + try: + yield + finally: + pass diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quasirandom.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quasirandom.py new file mode 100644 index 0000000000000000000000000000000000000000..1c9b949c55651c42895c1a1afb6d9050d41aca2f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quasirandom.py @@ -0,0 +1,180 @@ +import torch +from typing import Optional + + +class SobolEngine: + r""" + The :class:`torch.quasirandom.SobolEngine` is an engine for generating + (scrambled) Sobol sequences. Sobol sequences are an example of low + discrepancy quasi-random sequences. + + This implementation of an engine for Sobol sequences is capable of + sampling sequences up to a maximum dimension of 21201. It uses direction + numbers from https://web.maths.unsw.edu.au/~fkuo/sobol/ obtained using the + search criterion D(6) up to the dimension 21201. This is the recommended + choice by the authors. + + References: + - Art B. Owen. Scrambling Sobol and Niederreiter-Xing points. + Journal of Complexity, 14(4):466-489, December 1998. + + - I. M. Sobol. The distribution of points in a cube and the accurate + evaluation of integrals. + Zh. Vychisl. Mat. i Mat. Phys., 7:784-802, 1967. + + Args: + dimension (Int): The dimensionality of the sequence to be drawn + scramble (bool, optional): Setting this to ``True`` will produce + scrambled Sobol sequences. Scrambling is + capable of producing better Sobol + sequences. Default: ``False``. + seed (Int, optional): This is the seed for the scrambling. The seed + of the random number generator is set to this, + if specified. Otherwise, it uses a random seed. + Default: ``None`` + + Examples:: + + >>> # xdoctest: +SKIP("unseeded random state") + >>> soboleng = torch.quasirandom.SobolEngine(dimension=5) + >>> soboleng.draw(3) + tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.5000, 0.5000, 0.5000], + [0.7500, 0.2500, 0.2500, 0.2500, 0.7500]]) + """ + MAXBIT = 30 + MAXDIM = 21201 + + def __init__(self, dimension, scramble=False, seed=None): + if dimension > self.MAXDIM or dimension < 1: + raise ValueError("Supported range of dimensionality " + f"for SobolEngine is [1, {self.MAXDIM}]") + + self.seed = seed + self.scramble = scramble + self.dimension = dimension + + cpu = torch.device("cpu") + + self.sobolstate = torch.zeros(dimension, self.MAXBIT, device=cpu, dtype=torch.long) + torch._sobol_engine_initialize_state_(self.sobolstate, self.dimension) + + if not self.scramble: + self.shift = torch.zeros(self.dimension, device=cpu, dtype=torch.long) + else: + self._scramble() + + self.quasi = self.shift.clone(memory_format=torch.contiguous_format) + self._first_point = (self.quasi / 2 ** self.MAXBIT).reshape(1, -1) + self.num_generated = 0 + + def draw(self, n: int = 1, out: Optional[torch.Tensor] = None, + dtype: torch.dtype = torch.float32) -> torch.Tensor: + r""" + Function to draw a sequence of :attr:`n` points from a Sobol sequence. + Note that the samples are dependent on the previous samples. The size + of the result is :math:`(n, dimension)`. + + Args: + n (Int, optional): The length of sequence of points to draw. + Default: 1 + out (Tensor, optional): The output tensor + dtype (:class:`torch.dtype`, optional): the desired data type of the + returned tensor. + Default: ``torch.float32`` + """ + if self.num_generated == 0: + if n == 1: + result = self._first_point.to(dtype) + else: + result, self.quasi = torch._sobol_engine_draw( + self.quasi, n - 1, self.sobolstate, self.dimension, self.num_generated, dtype=dtype, + ) + result = torch.cat((self._first_point, result), dim=-2) + else: + result, self.quasi = torch._sobol_engine_draw( + self.quasi, n, self.sobolstate, self.dimension, self.num_generated - 1, dtype=dtype, + ) + + self.num_generated += n + + if out is not None: + out.resize_as_(result).copy_(result) + return out + + return result + + def draw_base2(self, m: int, out: Optional[torch.Tensor] = None, + dtype: torch.dtype = torch.float32) -> torch.Tensor: + r""" + Function to draw a sequence of :attr:`2**m` points from a Sobol sequence. + Note that the samples are dependent on the previous samples. The size + of the result is :math:`(2**m, dimension)`. + + Args: + m (Int): The (base2) exponent of the number of points to draw. + out (Tensor, optional): The output tensor + dtype (:class:`torch.dtype`, optional): the desired data type of the + returned tensor. + Default: ``torch.float32`` + """ + n = 2 ** m + total_n = self.num_generated + n + if not (total_n & (total_n - 1) == 0): + raise ValueError("The balance properties of Sobol' points require " + f"n to be a power of 2. {self.num_generated} points have been " + f"previously generated, then: n={self.num_generated}+2**{m}={total_n}. " + "If you still want to do this, please use " + "'SobolEngine.draw()' instead." + ) + return self.draw(n=n, out=out, dtype=dtype) + + def reset(self): + r""" + Function to reset the ``SobolEngine`` to base state. + """ + self.quasi.copy_(self.shift) + self.num_generated = 0 + return self + + def fast_forward(self, n): + r""" + Function to fast-forward the state of the ``SobolEngine`` by + :attr:`n` steps. This is equivalent to drawing :attr:`n` samples + without using the samples. + + Args: + n (Int): The number of steps to fast-forward by. + """ + if self.num_generated == 0: + torch._sobol_engine_ff_(self.quasi, n - 1, self.sobolstate, self.dimension, self.num_generated) + else: + torch._sobol_engine_ff_(self.quasi, n, self.sobolstate, self.dimension, self.num_generated - 1) + self.num_generated += n + return self + + def _scramble(self): + g: Optional[torch.Generator] = None + if self.seed is not None: + g = torch.Generator() + g.manual_seed(self.seed) + + cpu = torch.device("cpu") + + # Generate shift vector + shift_ints = torch.randint(2, (self.dimension, self.MAXBIT), device=cpu, generator=g) + self.shift = torch.mv(shift_ints, torch.pow(2, torch.arange(0, self.MAXBIT, device=cpu))) + + # Generate lower triangular matrices (stacked across dimensions) + ltm_dims = (self.dimension, self.MAXBIT, self.MAXBIT) + ltm = torch.randint(2, ltm_dims, device=cpu, generator=g).tril() + + torch._sobol_engine_scramble_(self.sobolstate, ltm, self.dimension) + + def __repr__(self): + fmt_string = [f'dimension={self.dimension}'] + if self.scramble: + fmt_string += ['scramble=True'] + if self.seed is not None: + fmt_string += [f'seed={self.seed}'] + return self.__class__.__name__ + '(' + ', '.join(fmt_string) + ')' diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/random.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/random.py new file mode 100644 index 0000000000000000000000000000000000000000..75c48dab7213bf86ff7d9443d7af369e4932de68 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/random.py @@ -0,0 +1,175 @@ +import contextlib +from typing import Generator +import warnings + +from torch._C import default_generator +import torch + + +def set_rng_state(new_state: torch.Tensor) -> None: + r"""Sets the random number generator state. + + .. note: This function only works for CPU. For CUDA, please use + torch.manual_seed(seed), which works for both CPU and CUDA. + + Args: + new_state (torch.ByteTensor): The desired state + """ + default_generator.set_state(new_state) + + +def get_rng_state() -> torch.Tensor: + r"""Returns the random number generator state as a `torch.ByteTensor`.""" + return default_generator.get_state() + + +def manual_seed(seed) -> torch._C.Generator: + r"""Sets the seed for generating random numbers. Returns a + `torch.Generator` object. + + Args: + seed (int): The desired seed. Value must be within the inclusive range + `[-0x8000_0000_0000_0000, 0xffff_ffff_ffff_ffff]`. Otherwise, a RuntimeError + is raised. Negative inputs are remapped to positive values with the formula + `0xffff_ffff_ffff_ffff + seed`. + """ + seed = int(seed) + import torch.cuda + + if not torch.cuda._is_in_bad_fork(): + torch.cuda.manual_seed_all(seed) + + import torch.mps + if not torch.mps._is_in_bad_fork(): + torch.mps.manual_seed(seed) + + import torch.xpu + if not torch.xpu._is_in_bad_fork(): + torch.xpu.manual_seed_all(seed) + + _seed_custom_device(seed) + + return default_generator.manual_seed(seed) + + +def seed() -> int: + r"""Sets the seed for generating random numbers to a non-deterministic + random number. Returns a 64 bit number used to seed the RNG. + """ + seed = default_generator.seed() + import torch.cuda + + if not torch.cuda._is_in_bad_fork(): + torch.cuda.manual_seed_all(seed) + + import torch.mps + if not torch.mps._is_in_bad_fork(): + torch.mps.manual_seed(seed) + + import torch.xpu + if not torch.xpu._is_in_bad_fork(): + torch.xpu.manual_seed_all(seed) + + _seed_custom_device(seed) + + return seed + + +def _seed_custom_device(seed) -> None: + r"""Sets the seed to generate random numbers for custom device. + + Args: + seed (int): The desired seed. + + See [Note: support the custom device with privateuse1] + """ + seed = int(seed) + custom_backend_name = torch._C._get_privateuse1_backend_name() + if hasattr(torch, custom_backend_name): + custom_device_mod = getattr(torch, custom_backend_name) + _bad_fork_name = "_is_in_bad_fork" + _seed_all_name = "manual_seed_all" + if hasattr(custom_device_mod, _bad_fork_name) and hasattr(custom_device_mod, _seed_all_name): + if not getattr(custom_device_mod, _bad_fork_name)(): + getattr(custom_device_mod, _seed_all_name)(seed) + else: + message = f"Set seed for `{custom_backend_name}` device does not take effect, please add API's " + message += f"`{_bad_fork_name}` and `{_seed_all_name}` to `{custom_backend_name}` device module." + warnings.warn(message, UserWarning, stacklevel=3) + + +def initial_seed() -> int: + r"""Returns the initial seed for generating random numbers as a + Python `long`. + """ + return default_generator.initial_seed() + + +_fork_rng_warned_already = False + + +@contextlib.contextmanager +def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices", device_type="cuda") -> Generator: + """ + Forks the RNG, so that when you return, the RNG is reset + to the state that it was previously in. + + Args: + devices (iterable of Device IDs): devices for which to fork + the RNG. CPU RNG state is always forked. By default, :meth:`fork_rng` operates + on all devices, but will emit a warning if your machine has a lot + of devices, since this function will run very slowly in that case. + If you explicitly specify devices, this warning will be suppressed + enabled (bool): if ``False``, the RNG is not forked. This is a convenience + argument for easily disabling the context manager without having + to delete it and unindent your Python code under it. + deivce_type (str): device type str, default is `cuda`. As for custom device, + see details in [Note: support the custom device with privateuse1] + """ + + device_type = torch.device(device_type).type + device_mod = getattr(torch, device_type, None) + if device_mod is None: + raise RuntimeError(f"torch has no module of `{device_type}`, you should register " + + "a module by `torch._register_device_module`.") + global _fork_rng_warned_already + + # Internal arguments: + # _caller: the function which called fork_rng, which the user used + # _devices_kw: the devices keyword of _caller + + if not enabled: + yield + return + + if devices is None: + num_devices = device_mod.device_count() + if num_devices > 1 and not _fork_rng_warned_already: + message = (f"{device_type.upper()} reports that you have {num_devices} available devices, and " + f"you have used {_caller} without explicitly specifying which devices are being used. " + f"For safety, we initialize *every* {device_type.upper()} device by default, which can " + f"be quite slow if you have a lot of {device_type.upper()}s. If you know that you are only" + f" making use of a few {device_type.upper()} devices, set the environment variable " + f"{device_type.upper()}_VISIBLE_DEVICES or the '{_devices_kw}' keyword argument of {_caller} " + "with the set of devices you are actually using. For example, if you are using CPU only, " + "set device.upper()_VISIBLE_DEVICES= or devices=[]; if you are using device 0 only, " + f"set {device_type.upper()}_VISIBLE_DEVICES=0 or devices=[0]. To initialize all devices " + f"and suppress this warning, set the '{_devices_kw}' keyword argument to " + f"`range(torch.{device_type}.device_count())`.") + warnings.warn(message) + _fork_rng_warned_already = True + devices = list(range(num_devices)) + else: + # Protect against user passing us a generator; we need to traverse this + # multiple times but a generator will be exhausted upon first traversal + devices = list(devices) + + cpu_rng_state = torch.get_rng_state() + device_rng_states = [device_mod.get_rng_state(device) for device in devices] + + try: + yield + finally: + torch.set_rng_state(cpu_rng_state) + for device, device_rng_state in zip(devices, device_rng_states): + device_mod.set_rng_state(device_rng_state, device) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aed7d735af2cc66c96224a149bdfbcc3cb86ca52 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b53b6b9d49d55864964b3395b16dde18bce16e76fdd3f1fb4a0b26048e29fd0 +size 123690 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton-2.3.0.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton-2.3.0.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..3e4e2809775aad6f54e7c9ea2a4cb9b51fd78b27 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton-2.3.0.dist-info/METADATA @@ -0,0 +1,35 @@ +Metadata-Version: 2.1 +Name: triton +Version: 2.3.0 +Summary: A language and compiler for custom Deep Learning operations +Home-page: https://github.com/openai/triton/ +Author: Philippe Tillet +Author-email: phil@openai.com +Keywords: Compiler,Deep Learning +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: Topic :: Software Development :: Build Tools +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Requires-Dist: filelock +Provides-Extra: build +Requires-Dist: cmake >=3.20 ; extra == 'build' +Requires-Dist: lit ; extra == 'build' +Provides-Extra: tests +Requires-Dist: autopep8 ; extra == 'tests' +Requires-Dist: flake8 ; extra == 'tests' +Requires-Dist: isort ; extra == 'tests' +Requires-Dist: numpy ; extra == 'tests' +Requires-Dist: pytest ; extra == 'tests' +Requires-Dist: scipy >=1.7.1 ; extra == 'tests' +Requires-Dist: torch ; extra == 'tests' +Provides-Extra: tutorials +Requires-Dist: matplotlib ; extra == 'tutorials' +Requires-Dist: pandas ; extra == 'tutorials' +Requires-Dist: tabulate ; extra == 'tutorials' +Requires-Dist: torch ; extra == 'tutorials' +