diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/MarkupSafe-2.1.5.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/MarkupSafe-2.1.5.dist-info/RECORD
new file mode 100644
index 0000000000000000000000000000000000000000..0be94001ee3939b5aa5a446da5a2d85aa39f1b66
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/MarkupSafe-2.1.5.dist-info/RECORD
@@ -0,0 +1,14 @@
+MarkupSafe-2.1.5.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+MarkupSafe-2.1.5.dist-info/LICENSE.rst,sha256=SJqOEQhQntmKN7uYPhHg9-HTHwvY-Zp5yESOf_N9B-o,1475
+MarkupSafe-2.1.5.dist-info/METADATA,sha256=2dRDPam6OZLfpX0wg1JN5P3u9arqACxVSfdGmsJU7o8,3003
+MarkupSafe-2.1.5.dist-info/RECORD,,
+MarkupSafe-2.1.5.dist-info/WHEEL,sha256=AI1yqBLEPcVKWn5Ls2uPawjbqPXPFTYdQLSdN8WFCJw,152
+MarkupSafe-2.1.5.dist-info/top_level.txt,sha256=qy0Plje5IJuvsCBjejJyhDCjEAdcDLK_2agVcex8Z6U,11
+markupsafe/__init__.py,sha256=r7VOTjUq7EMQ4v3p4R1LoVOGJg6ysfYRncLr34laRBs,10958
+markupsafe/__pycache__/__init__.cpython-311.pyc,,
+markupsafe/__pycache__/_native.cpython-311.pyc,,
+markupsafe/_native.py,sha256=GR86Qvo_GcgKmKreA1WmYN9ud17OFwkww8E-fiW-57s,1713
+markupsafe/_speedups.c,sha256=X2XvQVtIdcK4Usz70BvkzoOfjTCmQlDkkjYSn-swE0g,7083
+markupsafe/_speedups.cpython-311-x86_64-linux-gnu.so,sha256=9PMBIm-zJzHm91NC-mblTC119_dIAldSQ4xFsE1_NPc,53656
+markupsafe/_speedups.pyi,sha256=vfMCsOgbAXRNLUXkyuyonG8uEWKYU4PDqNuMaDELAYw,229
+markupsafe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/constants.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/constants.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b00abd70fea07909561d71bdd2b6bb009e8d5c9
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/constants.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/environment.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/environment.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e55be52b23296571b030af156bd339f89491f3e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/environment.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/meta.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/meta.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab1cabcfc14c57e9cb6db9243be3be5ee0b78f04
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/meta.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/LICENSE.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..42b6f17a65ef6636d00c93aefb514441145ecc94
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/LICENSE.txt
@@ -0,0 +1,37 @@
+NetworkX is distributed with the 3-clause BSD license.
+
+::
+
+   Copyright (C) 2004-2023, NetworkX Developers
+   Aric Hagberg <hagberg@lanl.gov>
+   Dan Schult <dschult@colgate.edu>
+   Pieter Swart <swart@lanl.gov>
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+     * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials provided
+       with the distribution.
+
+     * Neither the name of the NetworkX Developers nor the names of its
+       contributors may be used to endorse or promote products derived
+       from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/METADATA
new file mode 100644
index 0000000000000000000000000000000000000000..ac51fe019585ed14b015f89a8d8908ccc7c356c2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/METADATA
@@ -0,0 +1,135 @@
+Metadata-Version: 2.1
+Name: networkx
+Version: 3.2.1
+Summary: Python package for creating and manipulating graphs and networks
+Author-email: Aric Hagberg <hagberg@lanl.gov>
+Maintainer-email: NetworkX Developers <networkx-discuss@googlegroups.com>
+Project-URL: Homepage, https://networkx.org/
+Project-URL: Bug Tracker, https://github.com/networkx/networkx/issues
+Project-URL: Documentation, https://networkx.org/documentation/stable/
+Project-URL: Source Code, https://github.com/networkx/networkx
+Keywords: Networks,Graph Theory,Mathematics,network,graph,discrete mathematics,math
+Platform: Linux
+Platform: Mac OSX
+Platform: Windows
+Platform: Unix
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Classifier: Topic :: Scientific/Engineering :: Physics
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE.txt
+Provides-Extra: default
+Requires-Dist: numpy >=1.22 ; extra == 'default'
+Requires-Dist: scipy !=1.11.0,!=1.11.1,>=1.9 ; extra == 'default'
+Requires-Dist: matplotlib >=3.5 ; extra == 'default'
+Requires-Dist: pandas >=1.4 ; extra == 'default'
+Provides-Extra: developer
+Requires-Dist: changelist ==0.4 ; extra == 'developer'
+Requires-Dist: pre-commit >=3.2 ; extra == 'developer'
+Requires-Dist: mypy >=1.1 ; extra == 'developer'
+Requires-Dist: rtoml ; extra == 'developer'
+Provides-Extra: doc
+Requires-Dist: sphinx >=7 ; extra == 'doc'
+Requires-Dist: pydata-sphinx-theme >=0.14 ; extra == 'doc'
+Requires-Dist: sphinx-gallery >=0.14 ; extra == 'doc'
+Requires-Dist: numpydoc >=1.6 ; extra == 'doc'
+Requires-Dist: pillow >=9.4 ; extra == 'doc'
+Requires-Dist: nb2plots >=0.7 ; extra == 'doc'
+Requires-Dist: texext >=0.6.7 ; extra == 'doc'
+Requires-Dist: nbconvert <7.9 ; extra == 'doc'
+Provides-Extra: extra
+Requires-Dist: lxml >=4.6 ; extra == 'extra'
+Requires-Dist: pygraphviz >=1.11 ; extra == 'extra'
+Requires-Dist: pydot >=1.4.2 ; extra == 'extra'
+Requires-Dist: sympy >=1.10 ; extra == 'extra'
+Provides-Extra: test
+Requires-Dist: pytest >=7.2 ; extra == 'test'
+Requires-Dist: pytest-cov >=4.0 ; extra == 'test'
+
+NetworkX
+========
+
+
+.. image:: https://github.com/networkx/networkx/workflows/test/badge.svg?branch=main
+  :target: https://github.com/networkx/networkx/actions?query=workflow%3A%22test%22
+
+.. image:: https://codecov.io/gh/networkx/networkx/branch/main/graph/badge.svg
+   :target: https://app.codecov.io/gh/networkx/networkx/branch/main
+   
+.. image:: https://img.shields.io/github/labels/networkx/networkx/Good%20First%20Issue?color=green&label=Contribute%20&style=flat-square
+   :target: https://github.com/networkx/networkx/issues?q=is%3Aopen+is%3Aissue+label%3A%22Good+First+Issue%22
+   
+
+NetworkX is a Python package for the creation, manipulation,
+and study of the structure, dynamics, and functions
+of complex networks.
+
+- **Website (including documentation):** https://networkx.org
+- **Mailing list:** https://groups.google.com/forum/#!forum/networkx-discuss
+- **Source:** https://github.com/networkx/networkx
+- **Bug reports:** https://github.com/networkx/networkx/issues
+- **Report a security vulnerability:** https://tidelift.com/security
+- **Tutorial:** https://networkx.org/documentation/latest/tutorial.html
+- **GitHub Discussions:** https://github.com/networkx/networkx/discussions
+
+Simple example
+--------------
+
+Find the shortest path between two nodes in an undirected graph:
+
+.. code:: pycon
+
+    >>> import networkx as nx
+    >>> G = nx.Graph()
+    >>> G.add_edge("A", "B", weight=4)
+    >>> G.add_edge("B", "D", weight=2)
+    >>> G.add_edge("A", "C", weight=3)
+    >>> G.add_edge("C", "D", weight=4)
+    >>> nx.shortest_path(G, "A", "D", weight="weight")
+    ['A', 'B', 'D']
+
+Install
+-------
+
+Install the latest version of NetworkX::
+
+    $ pip install networkx
+
+Install with all optional dependencies::
+
+    $ pip install networkx[all]
+
+For additional details, please see `INSTALL.rst`.
+
+Bugs
+----
+
+Please report any bugs that you find `here <https://github.com/networkx/networkx/issues>`_.
+Or, even better, fork the repository on `GitHub <https://github.com/networkx/networkx>`_
+and create a pull request (PR). We welcome all changes, big or small, and we
+will help you make the PR if you are new to `git` (just ask on the issue and/or
+see `CONTRIBUTING.rst`).
+
+License
+-------
+
+Released under the 3-Clause BSD license (see `LICENSE.txt`)::
+
+   Copyright (C) 2004-2023 NetworkX Developers
+   Aric Hagberg <hagberg@lanl.gov>
+   Dan Schult <dschult@colgate.edu>
+   Pieter Swart <swart@lanl.gov>
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/WHEEL
new file mode 100644
index 0000000000000000000000000000000000000000..7e688737d490be3643d705bc16b5a77f7bd567b7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx-3.2.1.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.41.2)
+Root-Is-Purelib: true
+Tag: py3-none-any
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_nvrtc_cu11-11.8.89.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_nvrtc_cu11-11.8.89.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_nvrtc_cu11-11.8.89.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_nvrtc_cu11-11.8.89.dist-info/top_level.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_nvrtc_cu11-11.8.89.dist-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..862f7abf232cdfbb928609856247292e81c9decb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_nvrtc_cu11-11.8.89.dist-info/top_level.txt
@@ -0,0 +1 @@
+nvidia
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_runtime_cu11-11.8.89.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_runtime_cu11-11.8.89.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cuda_runtime_cu11-11.8.89.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/RECORD
new file mode 100644
index 0000000000000000000000000000000000000000..19624b366963bf19a272095385f1430d237e383b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/RECORD
@@ -0,0 +1,39 @@
+nvidia/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cudnn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/cudnn/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cudnn/include/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cudnn/include/cudnn.h,sha256=nDGd5AONjdibAgMINJPMvCS1Tzvb3mki4IGWsrtR8p8,2968
+nvidia/cudnn/include/cudnn_adv_infer.h,sha256=CSGiXy0NKDNSl-nkG2h6B8ssepnljQ3PiXJaiwcZ4rU,29025
+nvidia/cudnn/include/cudnn_adv_infer_v8.h,sha256=CSGiXy0NKDNSl-nkG2h6B8ssepnljQ3PiXJaiwcZ4rU,29025
+nvidia/cudnn/include/cudnn_adv_train.h,sha256=I741xrSdMEyJET6CRergT_rtVD-CXCAqAqVnI3wn2DI,27700
+nvidia/cudnn/include/cudnn_adv_train_v8.h,sha256=I741xrSdMEyJET6CRergT_rtVD-CXCAqAqVnI3wn2DI,27700
+nvidia/cudnn/include/cudnn_backend.h,sha256=qNFagb4NdM8S2d1JZFITBVLL59qxiU0_GrnN7eAl5co,24727
+nvidia/cudnn/include/cudnn_backend_v8.h,sha256=qNFagb4NdM8S2d1JZFITBVLL59qxiU0_GrnN7eAl5co,24727
+nvidia/cudnn/include/cudnn_cnn_infer.h,sha256=44Le8oYL6MlNqYPHt-gG3OJtX0pQ39zxVmwQR-mMs_I,29083
+nvidia/cudnn/include/cudnn_cnn_infer_v8.h,sha256=44Le8oYL6MlNqYPHt-gG3OJtX0pQ39zxVmwQR-mMs_I,29083
+nvidia/cudnn/include/cudnn_cnn_train.h,sha256=fMg2JXxX5cC_AuOLwmJvj3dRfI5Gxb-3_G67f-IdepQ,10217
+nvidia/cudnn/include/cudnn_cnn_train_v8.h,sha256=fMg2JXxX5cC_AuOLwmJvj3dRfI5Gxb-3_G67f-IdepQ,10217
+nvidia/cudnn/include/cudnn_ops_infer.h,sha256=_eoAj4VayT1X2xjWRIKDfyfu2X9SYKo206W2KF-5-60,49631
+nvidia/cudnn/include/cudnn_ops_infer_v8.h,sha256=_eoAj4VayT1X2xjWRIKDfyfu2X9SYKo206W2KF-5-60,49631
+nvidia/cudnn/include/cudnn_ops_train.h,sha256=Gxdqcy5CRBIjD0MRaBlNv8LGYb3lbi06ECLzPVTGwmE,25733
+nvidia/cudnn/include/cudnn_ops_train_v8.h,sha256=Gxdqcy5CRBIjD0MRaBlNv8LGYb3lbi06ECLzPVTGwmE,25733
+nvidia/cudnn/include/cudnn_v8.h,sha256=nDGd5AONjdibAgMINJPMvCS1Tzvb3mki4IGWsrtR8p8,2968
+nvidia/cudnn/include/cudnn_version.h,sha256=GDrhJZBHJ9V9raYCLgK20IwPsRbFfWar-e_aUGDz4cQ,3113
+nvidia/cudnn/include/cudnn_version_v8.h,sha256=GDrhJZBHJ9V9raYCLgK20IwPsRbFfWar-e_aUGDz4cQ,3113
+nvidia/cudnn/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cudnn/lib/libcudnn.so.8,sha256=Jqcoi3MV1lisqxBz8CxPGM0dJ-6t3hApWPAxfa1mVuA,150200
+nvidia/cudnn/lib/libcudnn_adv_infer.so.8,sha256=m4YscW0p6QtRn9nmz1BmLS5DxujMT-051PHQ9g1ZzGI,130420936
+nvidia/cudnn/lib/libcudnn_adv_train.so.8,sha256=LouBB7ZNSranPOGE0wG3HhTCsbtGnvVzShuqOhAIR9Q,121126456
+nvidia/cudnn/lib/libcudnn_cnn_infer.so.8,sha256=q_6u8ekYhJBJCD9iV7zp6rCiqJ9ty7IvRPwVBcBAmnw,662671696
+nvidia/cudnn/lib/libcudnn_cnn_train.so.8,sha256=gBiVQZ6d4k3klgmf__Pu8UPoLEh83mjE6Rl-lvvqEds,102270568
+nvidia/cudnn/lib/libcudnn_ops_infer.so.8,sha256=bmmzZRK2mZAF3kV3YzadRHQ6gPnrFBbgnE5gKeSrOA0,97560024
+nvidia/cudnn/lib/libcudnn_ops_train.so.8,sha256=VfW-n_Ut7y_5uhQ4uF-37Pv6N9CgmJt0Z4izmu-Bv-c,74719784
+nvidia_cudnn_cu11-8.7.0.84.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+nvidia_cudnn_cu11-8.7.0.84.dist-info/License.txt,sha256=Sc95vbNXNLUv5iAwE7O9dZ-B6ZjNMqosZcUduaiMYdI,18174
+nvidia_cudnn_cu11-8.7.0.84.dist-info/METADATA,sha256=O2zEaG1qbOXQi2Bl-n2ZQJxpqcBjMBGxlDqIKisoC6Y,1570
+nvidia_cudnn_cu11-8.7.0.84.dist-info/RECORD,,
+nvidia_cudnn_cu11-8.7.0.84.dist-info/WHEEL,sha256=-kQi_VMfvRQozZJT7HUPMfY-5vLo0LVTmAylNJ3Ft98,106
+nvidia_cudnn_cu11-8.7.0.84.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/WHEEL
new file mode 100644
index 0000000000000000000000000000000000000000..06e355fe0e3ed7077903f119ae6928a17da8eb6f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cudnn_cu11-8.7.0.84.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.37.1)
+Root-Is-Purelib: true
+Tag: py3-none-manylinux1_x86_64
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/AUTHORS.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/AUTHORS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8ccefbc6e59f0bc761589c7994c15c7922ce5127
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/AUTHORS.txt
@@ -0,0 +1,799 @@
+@Switch01
+A_Rog
+Aakanksha Agrawal
+Abhinav Sagar
+ABHYUDAY PRATAP SINGH
+abs51295
+AceGentile
+Adam Chainz
+Adam Tse
+Adam Wentz
+admin
+Adolfo Ochagavía
+Adrien Morison
+Agus
+ahayrapetyan
+Ahilya
+AinsworthK
+Akash Srivastava
+Alan Yee
+Albert Tugushev
+Albert-Guan
+albertg
+Alberto Sottile
+Aleks Bunin
+Ales Erjavec
+Alethea Flowers
+Alex Gaynor
+Alex Grönholm
+Alex Hedges
+Alex Loosley
+Alex Morega
+Alex Stachowiak
+Alexander Shtyrov
+Alexandre Conrad
+Alexey Popravka
+Aleš Erjavec
+Alli
+Ami Fischman
+Ananya Maiti
+Anatoly Techtonik
+Anders Kaseorg
+Andre Aguiar
+Andreas Lutro
+Andrei Geacar
+Andrew Gaul
+Andrew Shymanel
+Andrey Bienkowski
+Andrey Bulgakov
+Andrés Delfino
+Andy Freeland
+Andy Kluger
+Ani Hayrapetyan
+Aniruddha Basak
+Anish Tambe
+Anrs Hu
+Anthony Sottile
+Antoine Musso
+Anton Ovchinnikov
+Anton Patrushev
+Anton Zelenov
+Antonio Alvarado Hernandez
+Antony Lee
+Antti Kaihola
+Anubhav Patel
+Anudit Nagar
+Anuj Godase
+AQNOUCH Mohammed
+AraHaan
+arena
+arenasys
+Arindam Choudhury
+Armin Ronacher
+Arnon Yaari
+Artem
+Arun Babu Neelicattu
+Ashley Manton
+Ashwin Ramaswami
+atse
+Atsushi Odagiri
+Avinash Karhana
+Avner Cohen
+Awit (Ah-Wit) Ghirmai
+Baptiste Mispelon
+Barney Gale
+barneygale
+Bartek Ogryczak
+Bastian Venthur
+Ben Bodenmiller
+Ben Darnell
+Ben Hoyt
+Ben Mares
+Ben Rosser
+Bence Nagy
+Benjamin Peterson
+Benjamin VanEvery
+Benoit Pierre
+Berker Peksag
+Bernard
+Bernard Tyers
+Bernardo B. Marques
+Bernhard M. Wiedemann
+Bertil Hatt
+Bhavam Vidyarthi
+Blazej Michalik
+Bogdan Opanchuk
+BorisZZZ
+Brad Erickson
+Bradley Ayers
+Branch Vincent
+Brandon L. Reiss
+Brandt Bucher
+Brannon Dorsey
+Brett Randall
+Brett Rosen
+Brian Cristante
+Brian Rosner
+briantracy
+BrownTruck
+Bruno Oliveira
+Bruno Renié
+Bruno S
+Bstrdsmkr
+Buck Golemon
+burrows
+Bussonnier Matthias
+bwoodsend
+c22
+Caleb Martinez
+Calvin Smith
+Carl Meyer
+Carlos Liam
+Carol Willing
+Carter Thayer
+Cass
+Chandrasekhar Atina
+Charlie Marsh
+Chih-Hsuan Yen
+Chris Brinker
+Chris Hunt
+Chris Jerdonek
+Chris Kuehl
+Chris Markiewicz
+Chris McDonough
+Chris Pawley
+Chris Pryer
+Chris Wolfe
+Christian Clauss
+Christian Heimes
+Christian Oudard
+Christoph Reiter
+Christopher Hunt
+Christopher Snyder
+chrysle
+cjc7373
+Clark Boylan
+Claudio Jolowicz
+Clay McClure
+Cody
+Cody Soyland
+Colin Watson
+Collin Anderson
+Connor Osborn
+Cooper Lees
+Cooper Ry Lees
+Cory Benfield
+Cory Wright
+Craig Kerstiens
+Cristian Sorinel
+Cristina
+Cristina Muñoz
+ctg123
+Curtis Doty
+cytolentino
+Daan De Meyer
+Dale
+Damian
+Damian Quiroga
+Damian Shaw
+Dan Black
+Dan Savilonis
+Dan Sully
+Dane Hillard
+daniel
+Daniel Collins
+Daniel Hahler
+Daniel Holth
+Daniel Jost
+Daniel Katz
+Daniel Shaulov
+Daniele Esposti
+Daniele Nicolodi
+Daniele Procida
+Daniil Konovalenko
+Danny Hermes
+Danny McClanahan
+Darren Kavanagh
+Dav Clark
+Dave Abrahams
+Dave Jones
+David Aguilar
+David Black
+David Bordeynik
+David Caro
+David D Lowe
+David Evans
+David Hewitt
+David Linke
+David Poggi
+David Poznik
+David Pursehouse
+David Runge
+David Tucker
+David Wales
+Davidovich
+ddelange
+Deepak Sharma
+Deepyaman Datta
+Denise Yu
+dependabot[bot]
+derwolfe
+Desetude
+Devesh Kumar Singh
+devsagul
+Diego Caraballo
+Diego Ramirez
+DiegoCaraballo
+Dimitri Merejkowsky
+Dimitri Papadopoulos
+Dimitri Papadopoulos Orfanos
+Dirk Stolle
+Dmitry Gladkov
+Dmitry Volodin
+Domen Kožar
+Dominic Davis-Foster
+Donald Stufft
+Dongweiming
+doron zarhi
+Dos Moonen
+Douglas Thor
+DrFeathers
+Dustin Ingram
+Dustin Rodrigues
+Dwayne Bailey
+Ed Morley
+Edgar Ramírez
+Edgar Ramírez Mondragón
+Ee Durbin
+Efflam Lemaillet
+efflamlemaillet
+Eitan Adler
+ekristina
+elainechan
+Eli Schwartz
+Elisha Hollander
+Ellen Marie Dash
+Emil Burzo
+Emil Styrke
+Emmanuel Arias
+Endoh Takanao
+enoch
+Erdinc Mutlu
+Eric Cousineau
+Eric Gillingham
+Eric Hanchrow
+Eric Hopper
+Erik M. Bray
+Erik Rose
+Erwin Janssen
+Eugene Vereshchagin
+everdimension
+Federico
+Felipe Peter
+Felix Yan
+fiber-space
+Filip Kokosiński
+Filipe Laíns
+Finn Womack
+finnagin
+Flavio Amurrio
+Florian Briand
+Florian Rathgeber
+Francesco
+Francesco Montesano
+Fredrik Orderud
+Frost Ming
+Gabriel Curio
+Gabriel de Perthuis
+Garry Polley
+gavin
+gdanielson
+Geoffrey Sneddon
+George Song
+Georgi Valkov
+Georgy Pchelkin
+ghost
+Giftlin Rajaiah
+gizmoguy1
+gkdoc
+Godefroid Chapelle
+Gopinath M
+GOTO Hayato
+gousaiyang
+gpiks
+Greg Roodt
+Greg Ward
+Guilherme Espada
+Guillaume Seguin
+gutsytechster
+Guy Rozendorn
+Guy Tuval
+gzpan123
+Hanjun Kim
+Hari Charan
+Harsh Vardhan
+harupy
+Harutaka Kawamura
+hauntsaninja
+Henrich Hartzer
+Henry Schreiner
+Herbert Pfennig
+Holly Stotelmyer
+Honnix
+Hsiaoming Yang
+Hugo Lopes Tavares
+Hugo van Kemenade
+Hugues Bruant
+Hynek Schlawack
+Ian Bicking
+Ian Cordasco
+Ian Lee
+Ian Stapleton Cordasco
+Ian Wienand
+Igor Kuzmitshov
+Igor Sobreira
+Ikko Ashimine
+Ilan Schnell
+Illia Volochii
+Ilya Baryshev
+Inada Naoki
+Ionel Cristian Mărieș
+Ionel Maries Cristian
+Itamar Turner-Trauring
+Ivan Pozdeev
+J. Nick Koston
+Jacob Kim
+Jacob Walls
+Jaime Sanz
+jakirkham
+Jakub Kuczys
+Jakub Stasiak
+Jakub Vysoky
+Jakub Wilk
+James Cleveland
+James Curtin
+James Firth
+James Gerity
+James Polley
+Jan Pokorný
+Jannis Leidel
+Jarek Potiuk
+jarondl
+Jason Curtis
+Jason R. Coombs
+JasonMo
+JasonMo1
+Jay Graves
+Jean Abou Samra
+Jean-Christophe Fillion-Robin
+Jeff Barber
+Jeff Dairiki
+Jeff Widman
+Jelmer Vernooĳ
+jenix21
+Jeremy Fleischman
+Jeremy Stanley
+Jeremy Zafran
+Jesse Rittner
+Jiashuo Li
+Jim Fisher
+Jim Garrison
+Jinzhe Zeng
+Jiun Bae
+Jivan Amara
+Joe Bylund
+Joe Michelini
+John Paton
+John Sirois
+John T. Wodder II
+John-Scott Atlakson
+johnthagen
+Jon Banafato
+Jon Dufresne
+Jon Parise
+Jonas Nockert
+Jonathan Herbert
+Joonatan Partanen
+Joost Molenaar
+Jorge Niedbalski
+Joseph Bylund
+Joseph Long
+Josh Bronson
+Josh Cannon
+Josh Hansen
+Josh Schneier
+Joshua
+Juan Luis Cano Rodríguez
+Juanjo Bazán
+Judah Rand
+Julian Berman
+Julian Gethmann
+Julien Demoor
+Jussi Kukkonen
+jwg4
+Jyrki Pulliainen
+Kai Chen
+Kai Mueller
+Kamal Bin Mustafa
+kasium
+kaustav haldar
+keanemind
+Keith Maxwell
+Kelsey Hightower
+Kenneth Belitzky
+Kenneth Reitz
+Kevin Burke
+Kevin Carter
+Kevin Frommelt
+Kevin R Patterson
+Kexuan Sun
+Kit Randel
+Klaas van Schelven
+KOLANICH
+konstin
+kpinc
+Krishna Oza
+Kumar McMillan
+Kuntal Majumder
+Kurt McKee
+Kyle Persohn
+lakshmanaram
+Laszlo Kiss-Kollar
+Laurent Bristiel
+Laurent LAPORTE
+Laurie O
+Laurie Opperman
+layday
+Leon Sasson
+Lev Givon
+Lincoln de Sousa
+Lipis
+lorddavidiii
+Loren Carvalho
+Lucas Cimon
+Ludovic Gasc
+Luis Medel
+Lukas Geiger
+Lukas Juhrich
+Luke Macken
+Luo Jiebin
+luojiebin
+luz.paz
+László Kiss Kollár
+M00nL1ght
+Marc Abramowitz
+Marc Tamlyn
+Marcus Smith
+Mariatta
+Mark Kohler
+Mark McLoughlin
+Mark Williams
+Markus Hametner
+Martey Dodoo
+Martin Fischer
+Martin Häcker
+Martin Pavlasek
+Masaki
+Masklinn
+Matej Stuchlik
+Mathew Jennings
+Mathieu Bridon
+Mathieu Kniewallner
+Matt Bacchi
+Matt Good
+Matt Maker
+Matt Robenolt
+Matt Wozniski
+matthew
+Matthew Einhorn
+Matthew Feickert
+Matthew Gilliard
+Matthew Hughes
+Matthew Iversen
+Matthew Treinish
+Matthew Trumbell
+Matthew Willson
+Matthias Bussonnier
+mattip
+Maurits van Rees
+Max W Chase
+Maxim Kurnikov
+Maxime Rouyrre
+mayeut
+mbaluna
+mdebi
+memoselyk
+meowmeowcat
+Michael
+Michael Aquilina
+Michael E. Karpeles
+Michael Klich
+Michael Mintz
+Michael Williamson
+michaelpacer
+Michał Górny
+Mickaël Schoentgen
+Miguel Araujo Perez
+Mihir Singh
+Mike
+Mike Hendricks
+Min RK
+MinRK
+Miro Hrončok
+Monica Baluna
+montefra
+Monty Taylor
+morotti
+mrKazzila
+Muha Ajjan
+Nadav Wexler
+Nahuel Ambrosini
+Nate Coraor
+Nate Prewitt
+Nathan Houghton
+Nathaniel J. Smith
+Nehal J Wani
+Neil Botelho
+Nguyễn Gia Phong
+Nicholas Serra
+Nick Coghlan
+Nick Stenning
+Nick Timkovich
+Nicolas Bock
+Nicole Harris
+Nikhil Benesch
+Nikhil Ladha
+Nikita Chepanov
+Nikolay Korolev
+Nipunn Koorapati
+Nitesh Sharma
+Niyas Sait
+Noah
+Noah Gorny
+Nowell Strite
+NtaleGrey
+nvdv
+OBITORASU
+Ofek Lev
+ofrinevo
+Oliver Freund
+Oliver Jeeves
+Oliver Mannion
+Oliver Tonnhofer
+Olivier Girardot
+Olivier Grisel
+Ollie Rutherfurd
+OMOTO Kenji
+Omry Yadan
+onlinejudge95
+Oren Held
+Oscar Benjamin
+Oz N Tiram
+Pachwenko
+Patrick Dubroy
+Patrick Jenkins
+Patrick Lawson
+patricktokeeffe
+Patrik Kopkan
+Paul Ganssle
+Paul Kehrer
+Paul Moore
+Paul Nasrat
+Paul Oswald
+Paul van der Linden
+Paulus Schoutsen
+Pavel Safronov
+Pavithra Eswaramoorthy
+Pawel Jasinski
+Paweł Szramowski
+Pekka Klärck
+Peter Gessler
+Peter Lisák
+Peter Shen
+Peter Waller
+Petr Viktorin
+petr-tik
+Phaneendra Chiruvella
+Phil Elson
+Phil Freo
+Phil Pennock
+Phil Whelan
+Philip Jägenstedt
+Philip Molloy
+Philippe Ombredanne
+Pi Delport
+Pierre-Yves Rofes
+Pieter Degroote
+pip
+Prabakaran Kumaresshan
+Prabhjyotsing Surjit Singh Sodhi
+Prabhu Marappan
+Pradyun Gedam
+Prashant Sharma
+Pratik Mallya
+pre-commit-ci[bot]
+Preet Thakkar
+Preston Holmes
+Przemek Wrzos
+Pulkit Goyal
+q0w
+Qiangning Hong
+Qiming Xu
+Quentin Lee
+Quentin Pradet
+R. David Murray
+Rafael Caricio
+Ralf Schmitt
+Ran Benita
+Razzi Abuissa
+rdb
+Reece Dunham
+Remi Rampin
+Rene Dudfield
+Riccardo Magliocchetti
+Riccardo Schirone
+Richard Jones
+Richard Si
+Ricky Ng-Adam
+Rishi
+rmorotti
+RobberPhex
+Robert Collins
+Robert McGibbon
+Robert Pollak
+Robert T. McGibbon
+robin elisha robinson
+Roey Berman
+Rohan Jain
+Roman Bogorodskiy
+Roman Donchenko
+Romuald Brunet
+ronaudinho
+Ronny Pfannschmidt
+Rory McCann
+Ross Brattain
+Roy Wellington Ⅳ
+Ruairidh MacLeod
+Russell Keith-Magee
+Ryan Shepherd
+Ryan Wooden
+ryneeverett
+S. Guliaev
+Sachi King
+Salvatore Rinchiera
+sandeepkiran-js
+Sander Van Balen
+Savio Jomton
+schlamar
+Scott Kitterman
+Sean
+seanj
+Sebastian Jordan
+Sebastian Schaetz
+Segev Finer
+SeongSoo Cho
+Sergey Vasilyev
+Seth Michael Larson
+Seth Woodworth
+Shahar Epstein
+Shantanu
+shenxianpeng
+shireenrao
+Shivansh-007
+Shixian Sheng
+Shlomi Fish
+Shovan Maity
+Simeon Visser
+Simon Cross
+Simon Pichugin
+sinoroc
+sinscary
+snook92
+socketubs
+Sorin Sbarnea
+Srinivas Nyayapati
+Srishti Hegde
+Stavros Korokithakis
+Stefan Scherfke
+Stefano Rivera
+Stephan Erb
+Stephen Rosen
+stepshal
+Steve (Gadget) Barnes
+Steve Barnes
+Steve Dower
+Steve Kowalik
+Steven Myint
+Steven Silvester
+stonebig
+studioj
+Stéphane Bidoul
+Stéphane Bidoul (ACSONE)
+Stéphane Klein
+Sumana Harihareswara
+Surbhi Sharma
+Sviatoslav Sydorenko
+Sviatoslav Sydorenko (Святослав Сидоренко)
+Swat009
+Sylvain
+Takayuki SHIMIZUKAWA
+Taneli Hukkinen
+tbeswick
+Thiago
+Thijs Triemstra
+Thomas Fenzl
+Thomas Grainger
+Thomas Guettler
+Thomas Johansson
+Thomas Kluyver
+Thomas Smith
+Thomas VINCENT
+Tim D. Smith
+Tim Gates
+Tim Harder
+Tim Heap
+tim smith
+tinruufu
+Tobias Hermann
+Tom Forbes
+Tom Freudenheim
+Tom V
+Tomas Hrnciar
+Tomas Orsava
+Tomer Chachamu
+Tommi Enenkel | AnB
+Tomáš Hrnčiar
+Tony Beswick
+Tony Narlock
+Tony Zhaocheng Tan
+TonyBeswick
+toonarmycaptain
+Toshio Kuratomi
+toxinu
+Travis Swicegood
+Tushar Sadhwani
+Tzu-ping Chung
+Valentin Haenel
+Victor Stinner
+victorvpaulo
+Vikram - Google
+Viktor Szépe
+Ville Skyttä
+Vinay Sajip
+Vincent Philippon
+Vinicyus Macedo
+Vipul Kumar
+Vitaly Babiy
+Vladimir Fokow
+Vladimir Rutsky
+W. Trevor King
+Wil Tan
+Wilfred Hughes
+William Edwards
+William ML Leslie
+William T Olson
+William Woodruff
+Wilson Mo
+wim glenn
+Winson Luk
+Wolfgang Maier
+Wu Zhenyu
+XAMES3
+Xavier Fernandez
+Xianpeng Shen
+xoviat
+xtreak
+YAMAMOTO Takashi
+Yen Chi Hsuan
+Yeray Diaz Diaz
+Yoval P
+Yu Jian
+Yuan Jing Vincent Yan
+Yusuke Hayashi
+Zearin
+Zhiping Deng
+ziebam
+Zvezdan Petkovic
+Łukasz Langa
+Роман Донченко
+Семён Марьясин
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/LICENSE.txt b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8e7b65eaf628360e6f32f4140fcdd7ec7c2b7077
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/LICENSE.txt
@@ -0,0 +1,20 @@
+Copyright (c) 2008-present The pip developers (see AUTHORS.txt file)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/METADATA
new file mode 100644
index 0000000000000000000000000000000000000000..9e5aa3a486f7e6a60761c3b0760a62730478d741
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/METADATA
@@ -0,0 +1,90 @@
+Metadata-Version: 2.1
+Name: pip
+Version: 24.3.1
+Summary: The PyPA recommended tool for installing Python packages.
+Author-email: The pip developers <distutils-sig@python.org>
+License: MIT
+Project-URL: Homepage, https://pip.pypa.io/
+Project-URL: Documentation, https://pip.pypa.io
+Project-URL: Source, https://github.com/pypa/pip
+Project-URL: Changelog, https://pip.pypa.io/en/stable/news/
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Software Development :: Build Tools
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Requires-Python: >=3.8
+Description-Content-Type: text/x-rst
+License-File: LICENSE.txt
+License-File: AUTHORS.txt
+
+pip - The Python Package Installer
+==================================
+
+.. |pypi-version| image:: https://img.shields.io/pypi/v/pip.svg
+   :target: https://pypi.org/project/pip/
+   :alt: PyPI
+
+.. |python-versions| image:: https://img.shields.io/pypi/pyversions/pip
+   :target: https://pypi.org/project/pip
+   :alt: PyPI - Python Version
+
+.. |docs-badge| image:: https://readthedocs.org/projects/pip/badge/?version=latest
+   :target: https://pip.pypa.io/en/latest
+   :alt: Documentation
+
+|pypi-version| |python-versions| |docs-badge|
+
+pip is the `package installer`_ for Python. You can use pip to install packages from the `Python Package Index`_ and other indexes.
+
+Please take a look at our documentation for how to install and use pip:
+
+* `Installation`_
+* `Usage`_
+
+We release updates regularly, with a new version every 3 months. Find more details in our documentation:
+
+* `Release notes`_
+* `Release process`_
+
+If you find bugs, need help, or want to talk to the developers, please use our mailing lists or chat rooms:
+
+* `Issue tracking`_
+* `Discourse channel`_
+* `User IRC`_
+
+If you want to get involved head over to GitHub to get the source code, look at our development documentation and feel free to jump on the developer mailing lists and chat rooms:
+
+* `GitHub page`_
+* `Development documentation`_
+* `Development IRC`_
+
+Code of Conduct
+---------------
+
+Everyone interacting in the pip project's codebases, issue trackers, chat
+rooms, and mailing lists is expected to follow the `PSF Code of Conduct`_.
+
+.. _package installer: https://packaging.python.org/guides/tool-recommendations/
+.. _Python Package Index: https://pypi.org
+.. _Installation: https://pip.pypa.io/en/stable/installation/
+.. _Usage: https://pip.pypa.io/en/stable/
+.. _Release notes: https://pip.pypa.io/en/stable/news.html
+.. _Release process: https://pip.pypa.io/en/latest/development/release-process/
+.. _GitHub page: https://github.com/pypa/pip
+.. _Development documentation: https://pip.pypa.io/en/latest/development
+.. _Issue tracking: https://github.com/pypa/pip/issues
+.. _Discourse channel: https://discuss.python.org/c/packaging
+.. _User IRC: https://kiwiirc.com/nextclient/#ircs://irc.libera.chat:+6697/pypa
+.. _Development IRC: https://kiwiirc.com/nextclient/#ircs://irc.libera.chat:+6697/pypa-dev
+.. _PSF Code of Conduct: https://github.com/pypa/.github/blob/main/CODE_OF_CONDUCT.md
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/REQUESTED b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip-24.3.1.dist-info/REQUESTED
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..efefccffc7a1d61d478ead8475f52dffb4b832c5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/__init__.py
@@ -0,0 +1,13 @@
+from typing import List, Optional
+
+__version__ = "24.3.1"
+
+
+def main(args: Optional[List[str]] = None) -> int:
+    """This is an internal API only meant for use by pip's own console scripts.
+
+    For additional details, see https://github.com/pypa/pip/issues/7498.
+    """
+    from pip._internal.utils.entrypoints import _wrapper
+
+    return _wrapper(args)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/__pycache__/__pip-runner__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/__pycache__/__pip-runner__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a09344fa60b917e562bfe8492417ec39371bb8d4
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/__pycache__/__pip-runner__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/main.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..33c6d24cd85b55a9fb1b1e6ab784f471e2b135f0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/main.py
@@ -0,0 +1,12 @@
+from typing import List, Optional
+
+
+def main(args: Optional[List[str]] = None) -> int:
+    """This is preserved for old console scripts that may still be referencing
+    it.
+
+    For additional details, see https://github.com/pypa/pip/issues/7498.
+    """
+    from pip._internal.utils.entrypoints import _wrapper
+
+    return _wrapper(args)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/__pycache__/check.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/__pycache__/check.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bd9d8ae421ee93ddc4658b5b4afed649339e2d4
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/__pycache__/check.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/build_tracker.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/build_tracker.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d5e190316648dd90dc4c87af3d1c20a23a12a38
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/build_tracker.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/metadata_legacy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/metadata_legacy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7dd7498aaefd2729f878dc91e8cdc774c7d5e88d
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/metadata_legacy.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/wheel_editable.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/wheel_editable.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bd135d64159724139c69da99d6f998e581f2c02
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/wheel_editable.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/wheel_legacy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/wheel_legacy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..232db33959e1c0b2214dd899e42f0eed33330ef5
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/__pycache__/wheel_legacy.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/metadata.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..c66ac354deb035405fe0e4040dac539d28570257
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/metadata.py
@@ -0,0 +1,39 @@
+"""Metadata generation logic for source distributions.
+"""
+
+import os
+
+from pip._vendor.pyproject_hooks import BuildBackendHookCaller
+
+from pip._internal.build_env import BuildEnvironment
+from pip._internal.exceptions import (
+    InstallationSubprocessError,
+    MetadataGenerationFailed,
+)
+from pip._internal.utils.subprocess import runner_with_spinner_message
+from pip._internal.utils.temp_dir import TempDirectory
+
+
+def generate_metadata(
+    build_env: BuildEnvironment, backend: BuildBackendHookCaller, details: str
+) -> str:
+    """Generate metadata using mechanisms described in PEP 517.
+
+    Returns the generated metadata directory.
+    """
+    metadata_tmpdir = TempDirectory(kind="modern-metadata", globally_managed=True)
+
+    metadata_dir = metadata_tmpdir.path
+
+    with build_env:
+        # Note that BuildBackendHookCaller implements a fallback for
+        # prepare_metadata_for_build_wheel, so we don't have to
+        # consider the possibility that this hook doesn't exist.
+        runner = runner_with_spinner_message("Preparing metadata (pyproject.toml)")
+        with backend.subprocess_runner(runner):
+            try:
+                distinfo_dir = backend.prepare_metadata_for_build_wheel(metadata_dir)
+            except InstallationSubprocessError as error:
+                raise MetadataGenerationFailed(package_details=details) from error
+
+    return os.path.join(metadata_dir, distinfo_dir)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/metadata_editable.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/metadata_editable.py
new file mode 100644
index 0000000000000000000000000000000000000000..27c69f0d1eaf3e223d599e91f969d52a821426fe
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/metadata_editable.py
@@ -0,0 +1,41 @@
+"""Metadata generation logic for source distributions.
+"""
+
+import os
+
+from pip._vendor.pyproject_hooks import BuildBackendHookCaller
+
+from pip._internal.build_env import BuildEnvironment
+from pip._internal.exceptions import (
+    InstallationSubprocessError,
+    MetadataGenerationFailed,
+)
+from pip._internal.utils.subprocess import runner_with_spinner_message
+from pip._internal.utils.temp_dir import TempDirectory
+
+
+def generate_editable_metadata(
+    build_env: BuildEnvironment, backend: BuildBackendHookCaller, details: str
+) -> str:
+    """Generate metadata using mechanisms described in PEP 660.
+
+    Returns the generated metadata directory.
+    """
+    metadata_tmpdir = TempDirectory(kind="modern-metadata", globally_managed=True)
+
+    metadata_dir = metadata_tmpdir.path
+
+    with build_env:
+        # Note that BuildBackendHookCaller implements a fallback for
+        # prepare_metadata_for_build_wheel/editable, so we don't have to
+        # consider the possibility that this hook doesn't exist.
+        runner = runner_with_spinner_message(
+            "Preparing editable metadata (pyproject.toml)"
+        )
+        with backend.subprocess_runner(runner):
+            try:
+                distinfo_dir = backend.prepare_metadata_for_build_editable(metadata_dir)
+            except InstallationSubprocessError as error:
+                raise MetadataGenerationFailed(package_details=details) from error
+
+    return os.path.join(metadata_dir, distinfo_dir)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/wheel_editable.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/wheel_editable.py
new file mode 100644
index 0000000000000000000000000000000000000000..719d69dd801b78b360c6c2234080eee638b8de82
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/wheel_editable.py
@@ -0,0 +1,46 @@
+import logging
+import os
+from typing import Optional
+
+from pip._vendor.pyproject_hooks import BuildBackendHookCaller, HookMissing
+
+from pip._internal.utils.subprocess import runner_with_spinner_message
+
+logger = logging.getLogger(__name__)
+
+
+def build_wheel_editable(
+    name: str,
+    backend: BuildBackendHookCaller,
+    metadata_directory: str,
+    tempd: str,
+) -> Optional[str]:
+    """Build one InstallRequirement using the PEP 660 build process.
+
+    Returns path to wheel if successfully built. Otherwise, returns None.
+    """
+    assert metadata_directory is not None
+    try:
+        logger.debug("Destination directory: %s", tempd)
+
+        runner = runner_with_spinner_message(
+            f"Building editable for {name} (pyproject.toml)"
+        )
+        with backend.subprocess_runner(runner):
+            try:
+                wheel_name = backend.build_editable(
+                    tempd,
+                    metadata_directory=metadata_directory,
+                )
+            except HookMissing as e:
+                logger.error(
+                    "Cannot build editable %s because the build "
+                    "backend does not have the %s hook",
+                    name,
+                    e,
+                )
+                return None
+    except Exception:
+        logger.error("Failed building editable for %s", name)
+        return None
+    return os.path.join(tempd, wheel_name)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/wheel_legacy.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/wheel_legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ee2a7058d323e41f3c930f14685f68e6a599fa5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/build/wheel_legacy.py
@@ -0,0 +1,102 @@
+import logging
+import os.path
+from typing import List, Optional
+
+from pip._internal.cli.spinners import open_spinner
+from pip._internal.utils.setuptools_build import make_setuptools_bdist_wheel_args
+from pip._internal.utils.subprocess import call_subprocess, format_command_args
+
+logger = logging.getLogger(__name__)
+
+
+def format_command_result(
+    command_args: List[str],
+    command_output: str,
+) -> str:
+    """Format command information for logging."""
+    command_desc = format_command_args(command_args)
+    text = f"Command arguments: {command_desc}\n"
+
+    if not command_output:
+        text += "Command output: None"
+    elif logger.getEffectiveLevel() > logging.DEBUG:
+        text += "Command output: [use --verbose to show]"
+    else:
+        if not command_output.endswith("\n"):
+            command_output += "\n"
+        text += f"Command output:\n{command_output}"
+
+    return text
+
+
+def get_legacy_build_wheel_path(
+    names: List[str],
+    temp_dir: str,
+    name: str,
+    command_args: List[str],
+    command_output: str,
+) -> Optional[str]:
+    """Return the path to the wheel in the temporary build directory."""
+    # Sort for determinism.
+    names = sorted(names)
+    if not names:
+        msg = f"Legacy build of wheel for {name!r} created no files.\n"
+        msg += format_command_result(command_args, command_output)
+        logger.warning(msg)
+        return None
+
+    if len(names) > 1:
+        msg = (
+            f"Legacy build of wheel for {name!r} created more than one file.\n"
+            f"Filenames (choosing first): {names}\n"
+        )
+        msg += format_command_result(command_args, command_output)
+        logger.warning(msg)
+
+    return os.path.join(temp_dir, names[0])
+
+
+def build_wheel_legacy(
+    name: str,
+    setup_py_path: str,
+    source_dir: str,
+    global_options: List[str],
+    build_options: List[str],
+    tempd: str,
+) -> Optional[str]:
+    """Build one unpacked package using the "legacy" build process.
+
+    Returns path to wheel if successfully built. Otherwise, returns None.
+    """
+    wheel_args = make_setuptools_bdist_wheel_args(
+        setup_py_path,
+        global_options=global_options,
+        build_options=build_options,
+        destination_dir=tempd,
+    )
+
+    spin_message = f"Building wheel for {name} (setup.py)"
+    with open_spinner(spin_message) as spinner:
+        logger.debug("Destination directory: %s", tempd)
+
+        try:
+            output = call_subprocess(
+                wheel_args,
+                command_desc="python setup.py bdist_wheel",
+                cwd=source_dir,
+                spinner=spinner,
+            )
+        except Exception:
+            spinner.finish("error")
+            logger.error("Failed building wheel for %s", name)
+            return None
+
+        names = os.listdir(tempd)
+        wheel_path = get_legacy_build_wheel_path(
+            names=names,
+            temp_dir=tempd,
+            name=name,
+            command_args=wheel_args,
+            command_output=output,
+        )
+        return wheel_path
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/install/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/install/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..042a41abf7c6c283875638404555c2ecdd3acd33
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/install/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/install/__pycache__/wheel.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/install/__pycache__/wheel.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be5a59c902d49b8b68e501bdf0414a521f74a1cc
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/operations/install/__pycache__/wheel.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/self_outdated_check.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/self_outdated_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9a91af9d8435917377de0d8db045fadeba34a87
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_internal/self_outdated_check.py
@@ -0,0 +1,244 @@
+import datetime
+import functools
+import hashlib
+import json
+import logging
+import optparse
+import os.path
+import sys
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Optional
+
+from pip._vendor.packaging.version import Version
+from pip._vendor.packaging.version import parse as parse_version
+from pip._vendor.rich.console import Group
+from pip._vendor.rich.markup import escape
+from pip._vendor.rich.text import Text
+
+from pip._internal.index.collector import LinkCollector
+from pip._internal.index.package_finder import PackageFinder
+from pip._internal.metadata import get_default_environment
+from pip._internal.models.selection_prefs import SelectionPreferences
+from pip._internal.network.session import PipSession
+from pip._internal.utils.compat import WINDOWS
+from pip._internal.utils.entrypoints import (
+    get_best_invocation_for_this_pip,
+    get_best_invocation_for_this_python,
+)
+from pip._internal.utils.filesystem import adjacent_tmp_file, check_path_owner, replace
+from pip._internal.utils.misc import ensure_dir
+
+_WEEK = datetime.timedelta(days=7)
+
+logger = logging.getLogger(__name__)
+
+
+def _get_statefile_name(key: str) -> str:
+    key_bytes = key.encode()
+    name = hashlib.sha224(key_bytes).hexdigest()
+    return name
+
+
+def _convert_date(isodate: str) -> datetime.datetime:
+    """Convert an ISO format string to a date.
+
+    Handles the format 2020-01-22T14:24:01Z (trailing Z)
+    which is not supported by older versions of fromisoformat.
+    """
+    return datetime.datetime.fromisoformat(isodate.replace("Z", "+00:00"))
+
+
+class SelfCheckState:
+    def __init__(self, cache_dir: str) -> None:
+        self._state: Dict[str, Any] = {}
+        self._statefile_path = None
+
+        # Try to load the existing state
+        if cache_dir:
+            self._statefile_path = os.path.join(
+                cache_dir, "selfcheck", _get_statefile_name(self.key)
+            )
+            try:
+                with open(self._statefile_path, encoding="utf-8") as statefile:
+                    self._state = json.load(statefile)
+            except (OSError, ValueError, KeyError):
+                # Explicitly suppressing exceptions, since we don't want to
+                # error out if the cache file is invalid.
+                pass
+
+    @property
+    def key(self) -> str:
+        return sys.prefix
+
+    def get(self, current_time: datetime.datetime) -> Optional[str]:
+        """Check if we have a not-outdated version loaded already."""
+        if not self._state:
+            return None
+
+        if "last_check" not in self._state:
+            return None
+
+        if "pypi_version" not in self._state:
+            return None
+
+        # Determine if we need to refresh the state
+        last_check = _convert_date(self._state["last_check"])
+        time_since_last_check = current_time - last_check
+        if time_since_last_check > _WEEK:
+            return None
+
+        return self._state["pypi_version"]
+
+    def set(self, pypi_version: str, current_time: datetime.datetime) -> None:
+        # If we do not have a path to cache in, don't bother saving.
+        if not self._statefile_path:
+            return
+
+        # Check to make sure that we own the directory
+        if not check_path_owner(os.path.dirname(self._statefile_path)):
+            return
+
+        # Now that we've ensured the directory is owned by this user, we'll go
+        # ahead and make sure that all our directories are created.
+        ensure_dir(os.path.dirname(self._statefile_path))
+
+        state = {
+            # Include the key so it's easy to tell which pip wrote the
+            # file.
+            "key": self.key,
+            "last_check": current_time.isoformat(),
+            "pypi_version": pypi_version,
+        }
+
+        text = json.dumps(state, sort_keys=True, separators=(",", ":"))
+
+        with adjacent_tmp_file(self._statefile_path) as f:
+            f.write(text.encode())
+
+        try:
+            # Since we have a prefix-specific state file, we can just
+            # overwrite whatever is there, no need to check.
+            replace(f.name, self._statefile_path)
+        except OSError:
+            # Best effort.
+            pass
+
+
+@dataclass
+class UpgradePrompt:
+    old: str
+    new: str
+
+    def __rich__(self) -> Group:
+        if WINDOWS:
+            pip_cmd = f"{get_best_invocation_for_this_python()} -m pip"
+        else:
+            pip_cmd = get_best_invocation_for_this_pip()
+
+        notice = "[bold][[reset][blue]notice[reset][bold]][reset]"
+        return Group(
+            Text(),
+            Text.from_markup(
+                f"{notice} A new release of pip is available: "
+                f"[red]{self.old}[reset] -> [green]{self.new}[reset]"
+            ),
+            Text.from_markup(
+                f"{notice} To update, run: "
+                f"[green]{escape(pip_cmd)} install --upgrade pip"
+            ),
+        )
+
+
+def was_installed_by_pip(pkg: str) -> bool:
+    """Checks whether pkg was installed by pip
+
+    This is used not to display the upgrade message when pip is in fact
+    installed by system package manager, such as dnf on Fedora.
+    """
+    dist = get_default_environment().get_distribution(pkg)
+    return dist is not None and "pip" == dist.installer
+
+
+def _get_current_remote_pip_version(
+    session: PipSession, options: optparse.Values
+) -> Optional[str]:
+    # Lets use PackageFinder to see what the latest pip version is
+    link_collector = LinkCollector.create(
+        session,
+        options=options,
+        suppress_no_index=True,
+    )
+
+    # Pass allow_yanked=False so we don't suggest upgrading to a
+    # yanked version.
+    selection_prefs = SelectionPreferences(
+        allow_yanked=False,
+        allow_all_prereleases=False,  # Explicitly set to False
+    )
+
+    finder = PackageFinder.create(
+        link_collector=link_collector,
+        selection_prefs=selection_prefs,
+    )
+    best_candidate = finder.find_best_candidate("pip").best_candidate
+    if best_candidate is None:
+        return None
+
+    return str(best_candidate.version)
+
+
+def _self_version_check_logic(
+    *,
+    state: SelfCheckState,
+    current_time: datetime.datetime,
+    local_version: Version,
+    get_remote_version: Callable[[], Optional[str]],
+) -> Optional[UpgradePrompt]:
+    remote_version_str = state.get(current_time)
+    if remote_version_str is None:
+        remote_version_str = get_remote_version()
+        if remote_version_str is None:
+            logger.debug("No remote pip version found")
+            return None
+        state.set(remote_version_str, current_time)
+
+    remote_version = parse_version(remote_version_str)
+    logger.debug("Remote version of pip: %s", remote_version)
+    logger.debug("Local version of pip:  %s", local_version)
+
+    pip_installed_by_pip = was_installed_by_pip("pip")
+    logger.debug("Was pip installed by pip? %s", pip_installed_by_pip)
+    if not pip_installed_by_pip:
+        return None  # Only suggest upgrade if pip is installed by pip.
+
+    local_version_is_older = (
+        local_version < remote_version
+        and local_version.base_version != remote_version.base_version
+    )
+    if local_version_is_older:
+        return UpgradePrompt(old=str(local_version), new=remote_version_str)
+
+    return None
+
+
+def pip_self_version_check(session: PipSession, options: optparse.Values) -> None:
+    """Check for an update for pip.
+
+    Limit the frequency of checks to once per week. State is stored either in
+    the active virtualenv or in the user's USER_CACHE_DIR keyed off the prefix
+    of the pip script path.
+    """
+    installed_dist = get_default_environment().get_distribution("pip")
+    if not installed_dist:
+        return
+
+    upgrade_prompt = _self_version_check_logic(
+        state=SelfCheckState(cache_dir=options.cache_dir),
+        current_time=datetime.datetime.now(datetime.timezone.utc),
+        local_version=installed_dist.version,
+        get_remote_version=functools.partial(
+            _get_current_remote_pip_version, session, options
+        ),
+    )
+    if upgrade_prompt is not None:
+        logger.warning("%s", upgrade_prompt, extra={"rich": True})
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..493b53e4e7a3984ddd49780313bf3bd9901dc1e0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/py.typed
@@ -0,0 +1,4 @@
+pip is a command line program. While it is implemented in Python, and so is
+available for import, you must not use pip's internal APIs in this way. Typing
+information is provided as a convenience only and is not a guarantee. Expect
+unannounced changes to the API and types in releases.
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/dispatcher.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/dispatcher.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb374dd39f93c994ad6ba02d3fcd5aea1df0c5b1
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/dispatcher.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/functionalization.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/functionalization.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f30b4c15e5f3e8e7f0f30873fce7a956987cbce1
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/functionalization.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/native.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/native.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84e824e3bfa682301c0b158a1b9889bf48abda3b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/native.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/python.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/python.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b50198c0b74ea987b457f311d0107ebc59354fe
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/python.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/ufunc.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/ufunc.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e3edf1d80297b2f27ececfd59f799c1bd799566
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/ufunc.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/unboxing.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/unboxing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbe83a992e4cd15c3ca2d3a50a7a153875690c9f
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/__pycache__/unboxing.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/autograd.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/autograd.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a55211b9990251a9cbd82050e79e2f7c2c10a1e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/autograd.py
@@ -0,0 +1,853 @@
+import re
+from dataclasses import dataclass
+from typing import cast, Dict, List, Match, Optional, Sequence, Set, Tuple
+
+from torchgen import local
+
+from torchgen.api import cpp
+from torchgen.api.types import BaseCType, Binding, NamedCType, tensorListT
+from torchgen.model import (
+    BaseTy,
+    BaseType,
+    FunctionSchema,
+    ListType,
+    NativeFunction,
+    NativeFunctionsViewGroup,
+    SchemaKind,
+    Type,
+)
+from torchgen.utils import IDENT_REGEX
+
+
+# Represents a saved attribute involved in backward calculation.
+# Note that it can be a derived property of an input argument, e.g.:
+# we could save `other.scalar_type()` instead of the entire `other` tensor.
+@dataclass(frozen=True)
+class SavedAttribute:
+    # The NamedCType holds the updated name and cpp type of the attribute
+    # for the name, Suffix is appended if it's derived property, e.g.: `other_scalar_type`
+    nctype: NamedCType
+
+    # The expression to read the derived property at save time, e.g.:
+    # `other.scalar_type()`.
+    expr: str
+
+
+# Represents a backward formula that calculates derivatives for one
+# or more tensors.
+@dataclass(frozen=True)
+class Derivative:
+    # The formula string (legit C++ expression).
+    # Note that expressions against input arguments have been replaced with the
+    # corresponding saved attributes.
+    # E.g.:
+    #  raw formula: `mul_tensor_backward(grad, self, other.scalar_type())`
+    #         here: `mul_tensor_backward(grad, self, other_scalar_type)`
+    formula: str
+
+    # The formula string before input argument replacement
+    original_formula: str
+
+    # Names of the arguments for which this formula calculates derivatives.
+    var_names: Tuple[str, ...]
+
+    # Saved inputs that are referenced by the formula.
+    saved_inputs: Tuple[SavedAttribute, ...]
+
+    # Saved outputs that are referenced by the formula.
+    saved_outputs: Tuple[SavedAttribute, ...]
+
+    # Gradients that are referenced by name in the formula.
+    named_gradients: Set[str]
+
+
+# Represents a forward formula that calculates forward derivatives
+# for one tensor.
+@dataclass(frozen=True)
+class ForwardDerivative:
+    # The formula string (legit C++ expression).
+    # Note that special keywords such as "linear" or "element_wise" have been
+    # replaced by the automatically generated formula.
+    formula: str
+
+    # Name of the output arguments for which this formula calculates forward
+    # derivatives
+    var_names: Tuple[str, ...]
+
+    # Type of the output arguments for which this formula calculates forward
+    # derivatives
+    var_types: Tuple[Type, ...]
+
+    # Inputs for which the forward derivatives are required for this formula
+    required_inputs_fw_grad: Optional[Tuple[str, ...]]
+
+    # Inputs for which the primal is required for this formula
+    required_inputs_primal: Optional[Tuple[str, ...]]
+
+    # Flag to specify if this formula requires the original value of self
+    # This is only used by inplace operations
+    required_original_self_value: bool
+
+    # If this formula is specified in derivatives.yaml or if we are re-using the
+    # out of place formula for inplace
+    is_reusing_outplace_formula: bool
+
+
+# Represents differentiability info for a NativeFunction.
+@dataclass(frozen=True)
+class DifferentiabilityInfo:
+    # The base name read from derivatives.yaml.
+    name: str
+
+    # The matching native function.
+    #
+    # There can be multiple NativeFunction having the same base name:
+    #  - different overloads with different types of input arguments;
+    #  - in-place/out/functional variants of the same function;
+    #
+    # We first use the schema string (under the 'name' key) in derivatives.yaml
+    # to find the NativeFunction having the same schema string.
+    # Then we find the in-place/out/functional variants of the matching function.
+    # Among these variants, we choose the one having the same name as the
+    # derivatives.yaml entry. If there is no exact match, then we choose the
+    # in-place variant.
+    # TODO: maybe the logic to search for all variants is no longer necessary?
+    func: NativeFunction
+
+    # The name of the generated autograd function.
+    # It's set only if we will calculate a derivative, i.e.
+    # 'args_with_derivatives' is not empty.
+    op: Optional[str]
+
+    # The derivatives formulae for this function.
+    # Note that the length of this sequence is the number of differentiable inputs
+    derivatives: Sequence[Derivative]
+
+    # The forward derivatives formulae for this function.
+    # Note that the length of this sequence is the number of differentiable outputs
+    forward_derivatives: Sequence[ForwardDerivative]
+
+    # The union of 'saved_inputs' of all 'derivatives'.
+    all_saved_inputs: Sequence[SavedAttribute]
+
+    # The union of 'saved_outputs' of all 'derivatives'.
+    all_saved_outputs: Sequence[SavedAttribute]
+
+    # All named gradients that are available for use, in the same
+    # order as in the grads vector.
+    available_named_gradients: Sequence[str]
+
+    # The named gradients that are used in any of the derivatives.
+    # Invariant: all(name in available_named_gradients for name in used_named_gradients)
+    used_named_gradients: Set[str]
+
+    # The function's input arguments for which it calculates derivatives.
+    # It's the union of 'var_names' of all 'derivatives', sorted by the
+    # argument order in the function schema.
+    args_with_derivatives: Sequence[Binding]
+
+    # Names of arguments whose derivative formula is 'non_differentiable'.
+    non_differentiable_arg_names: Sequence[str]
+
+    # Raw data read from derivatives.yaml.
+    output_differentiability: Optional[List[bool]]
+
+    # output_differentiability in derivatives.yaml can be a list of
+    # conditions that express if the output is differentiable. In this case,
+    # the number of conditions must match the number of outputs
+    # (NB: we only support one condition right now).
+    # output_differentiability gets populated with True for each condition,
+    # while output_differentiability_conditions gets populated with the conditions
+    output_differentiability_conditions: Optional[List[str]]
+
+    @property
+    def has_derivatives(self) -> bool:
+        return len(self.args_with_derivatives) > 0
+
+    # Generates a new DifferentiabilityInfo using the exact same set of derivative information,
+    # but with a new operator name.
+    # This is used when generating "copy" variants of view ops,
+    # which are able to use the exact same derivative formula as the original view op
+    # See Note [Codegen'd {view}_copy Operators]
+    def create_view_copy_from_view_derivative(
+        self, g: NativeFunctionsViewGroup
+    ) -> Optional["DifferentiabilityInfo"]:
+        if g.view_copy is None:
+            return None
+        f = g.view_copy
+
+        name_split_by_period = self.name.split(".", maxsplit=2)
+        # Append a "_copy" to the base name of the operator (but keep the overload name the same)
+        view_copy_name = f"{name_split_by_period[0]}_copy." + ".".join(
+            name_split_by_period[1:]
+        )
+        view_copy_op_name = None if self.op is None else f"{self.op}_copy"
+
+        return DifferentiabilityInfo(
+            # Use the "_copy" version of name/func/op
+            name=view_copy_name,
+            func=f,
+            op=view_copy_op_name,
+            # But keep all derivative info the same
+            derivatives=self.derivatives,
+            forward_derivatives=self.forward_derivatives,
+            all_saved_inputs=self.all_saved_inputs,
+            all_saved_outputs=self.all_saved_outputs,
+            available_named_gradients=self.available_named_gradients,
+            used_named_gradients=self.used_named_gradients,
+            args_with_derivatives=self.args_with_derivatives,
+            non_differentiable_arg_names=self.non_differentiable_arg_names,
+            output_differentiability=self.output_differentiability,
+            output_differentiability_conditions=self.output_differentiability_conditions,
+        )
+
+
+def uses_ident(info: Optional[DifferentiabilityInfo], ident: str) -> bool:
+    if info is None:
+        return False
+    for derivative in info.derivatives:
+        formula = derivative.formula
+        if re.search(IDENT_REGEX.format(ident), formula):
+            return True
+    return False
+
+
+def uses_retain_variables(info: Optional[DifferentiabilityInfo]) -> bool:
+    return uses_ident(info, "retain_variables")
+
+
+def uses_single_grad(info: Optional[DifferentiabilityInfo]) -> bool:
+    return uses_ident(info, "grad")
+
+
+# Represents a differentiable `Argument`.
+# How is it different from the `Argument` type?
+# - It's processed Arguments which are differentiable and only used in the
+#   context of the autograd codegen;
+# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument;
+@dataclass(frozen=True)
+class DifferentiableInput:
+    name: str
+    type: Type
+
+    # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove.
+    cpp_type: str
+
+
+# Represents a differentiable `Return`.
+# How it it different from the `Return` type?
+# - The name in `Return` is optional. Here it is always populated using the same
+#   `cpp.return_names()` method.
+#   TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant?
+# - It's processed Returns which are differentiable, in compliance with the
+#   `output_differentiability` field defined in derivatives.yaml (if specified),
+#   and are only used in the context of the autograd codegen;
+@dataclass(frozen=True)
+class DifferentiableOutput:
+    name: str
+    type: Type
+
+    # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove.
+    cpp_type: str
+
+
+@dataclass(frozen=True)
+class NativeFunctionWithDifferentiabilityInfo:
+    func: NativeFunction
+    info: Optional[Dict[str, DifferentiabilityInfo]]
+    fw_derivatives: Optional[Dict[str, Sequence[ForwardDerivative]]]
+
+
+# TODO: Update comment below since it is out of date.
+def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str:
+    """How are we going to call the underlying implementation of a
+    declaration?  There are two strategies:
+        - use_derived: we want to call the implementation on CPUDoubleType
+          (or a similar, derived Type instance).  Because these derived
+          instances deal in Tensors, not Variables (it's a completely different
+          object, so it doesn't dispatch back to VariableType), code on
+          this dispatch path needs to wrap/unwrap tensors.  If the
+          derived implementation takes and returns tensors, the
+          implementation is usually differentiable (although we also use
+          the derived dispatch path for non-differentiable functions
+          that we still want to dispatch on the derived Type instance;
+          e.g., size())
+        - use_type: we want to call the implementation on Type, because
+          it is implemented concretely, and the functions it invokes will
+          get dispatched back to VariableType (which will ensure that they
+          are differentiable.)
+    """
+    # fn is derived as long as any of its per-key differentiability infos
+    # has_derivatives. dispatch_strategy() is used to guard generation of fns in VariableType
+    # and ADInplaceOrViewType. We want to generate these functions as long as a
+    # derivative is defined for ANY dispatch key.
+    if fn.func.is_abstract or (
+        fn.info is not None and any(info.has_derivatives for info in fn.info.values())
+    ):
+        # If the function is abstract (not implemented on at::Type), we must
+        # call the implementation on the derived type with unpacked tensors.
+
+        # If the function has a derivative specified and is concrete, we could
+        # call either implementation. We prefer the calling the derived
+        # type's implementation with unpacked tensors because it is more
+        # performant in some cases: any internal calls to other ATen functions
+        # won't have the history tracked.
+
+        # If the function has a type dispatched argument (i.e. is a factory),
+        # we prefer calling the derived type's implementation both because it is
+        # more performant and to ensure factory functions return tensors with _version
+        # of 0 (probably not strictly necessary, but nice to have to keeps versions simple
+        # to understand.
+
+        return "use_derived"
+    else:
+        # If the function is concrete (we don't have to override it) and we
+        # didn't declare it in derivatives.yaml, we'll assume that it is
+        # actually implemented out of differentiable functions. (This
+        # assumption might not hold, but then you'll see gradcheck fail.)
+        return "use_type"
+
+
+def is_foreach_func(f: NativeFunction) -> bool:
+    return f.func.name.name.base.startswith("_foreach_")
+
+
+# note(crcrpar): Most foreach functions can reference an out-place `torch` function whose schema kind
+# is functional for their backward derivatives (and forward derivatives in the future), i.e.,
+# they would find such one in `functional_info_by_signature`. There however are some exceptions:
+_foreach_with_inplace_ref = {"_foreach_zero_"}
+_foreach_with_tensor_overload = {
+    "_foreach_add.Tensor",
+    "_foreach_mul.Tensor",
+    "_foreach_div.Tensor",
+}
+
+
+# Checks if `function_schema` is a native, non-foreach function which `f`, a foreach function
+# reference to generate derivatives.
+def is_reference_for_foreach(
+    f: NativeFunction,
+    function_schema: FunctionSchema,
+) -> bool:
+    return (
+        f.func.name.name.base.split("_foreach_")[-1] == function_schema.name.name.base
+        and (
+            not function_schema.name.name.inplace
+            or str(f.func.name) in _foreach_with_inplace_ref
+        )
+        and all(
+            ref_arg.type in (arg.type, getattr(arg.type, "elem", None))
+            for arg, ref_arg in zip(
+                f.func.arguments.flat_non_out,
+                function_schema.arguments.flat_non_out,
+            )
+        )
+    )
+
+
+# TODO(crcrpar): Avoid hard coding "Default" ideally.
+def gen_foreach_derivativeinfo(
+    foreach_function: NativeFunction,
+    functional_info_by_signature: Dict[
+        FunctionSchema, Dict[str, DifferentiabilityInfo]
+    ],
+    non_functional_info_by_signature: Dict[
+        FunctionSchema, Dict[str, DifferentiabilityInfo]
+    ],
+    dispatch_key: str = "Default",
+) -> Tuple[Optional[DifferentiabilityInfo], bool]:
+    """Generate DifferentiabilityInfo for out-place foreach function, return the existing one for in-place.
+
+    The second return value indicates whether the info is generated in this function.
+    """
+    ref_diff_info: Optional[DifferentiabilityInfo] = None
+
+    for function_schema, diff_info in functional_info_by_signature.items():
+        if not is_reference_for_foreach(foreach_function, function_schema):
+            continue
+        ref_diff_info = diff_info[dispatch_key]
+        if ref_diff_info is not None:
+            break
+    # note(crcrpar): It seems like `zero`'s info isn't available in functional_info_by_signature
+    # while the info of `zero_` is in non_functional_info_by_signature
+    if (
+        ref_diff_info is None
+        and foreach_function.func.kind() == SchemaKind.inplace
+        and str(foreach_function.func.name) in _foreach_with_inplace_ref
+    ):
+        for function_schema, diff_info in non_functional_info_by_signature.items():
+            if not is_reference_for_foreach(foreach_function, function_schema):
+                continue
+            ref_diff_info = diff_info[dispatch_key]
+            if ref_diff_info is not None:
+                break
+    if ref_diff_info is None:
+        return None, False
+
+    # non out-place uses the existing Derivative.
+    if foreach_function.func.kind() == SchemaKind.inplace:
+        return ref_diff_info, False
+
+    map_refarg2foreacharg, map_name2arg = {}, {}
+    for i, (arg, ref_arg) in enumerate(
+        zip(
+            foreach_function.func.arguments.flat_non_out,
+            function_schema.arguments.flat_non_out,
+        )
+    ):
+        map_refarg2foreacharg[ref_arg.name] = arg.name
+        map_name2arg[arg.name] = arg
+
+    all_saved_inputs, all_saved_outputs, all_var_names = [], [], []
+    modified_derivative_formulas = []
+    for i, derivative in enumerate(ref_diff_info.derivatives):
+        modified_formula = derivative.formula.replace("grad", "grads[i]").replace(
+            "result", "result[i]"
+        )
+        saved_inputs, saved_outputs = [], []
+        # note(crcrpar): This context seems necessary to call `cpp.argument_type`
+        with local.parametrize(
+            use_const_ref_for_mutable_tensors=foreach_function.use_const_ref_for_mutable_tensors,
+            use_ilistref_for_tensor_lists=foreach_function.part_of_structured_group,
+        ):
+            for ref_input in derivative.saved_inputs:
+                ref_input_jit_name = ref_input.expr.split(".")[0]
+                mapped_name = map_refarg2foreacharg[ref_input_jit_name]
+                if isinstance(map_name2arg[mapped_name].type, ListType):
+                    mapped_expr = mapped_name + "[i]"
+                else:
+                    mapped_expr = mapped_name
+                new_expr = ref_input.expr.replace(ref_input_jit_name, mapped_expr)
+                modified_formula = modified_formula.replace(
+                    cast(str, ref_input.nctype.name), new_expr
+                )
+
+                nctype = cpp.argument_type(map_name2arg[mapped_name], binds=mapped_name)
+                canonical_nctype = NamedCType(
+                    nctype.name, nctype.type.remove_const_ref()
+                )
+                saved_inputs.append(
+                    SavedAttribute(nctype=canonical_nctype, expr=mapped_name)
+                )
+            for ref_output in derivative.saved_outputs:
+                if ref_output.nctype.name == "result":
+                    saved_outputs.append(
+                        SavedAttribute(
+                            nctype=NamedCType(
+                                name="result", type=BaseCType(tensorListT)
+                            ),
+                            expr="result",
+                        )
+                    )
+                else:
+                    raise RuntimeError("")
+        var_names = [map_refarg2foreacharg[var] for var in derivative.var_names]
+        all_var_names.extend(var_names)
+        all_saved_inputs.extend(saved_inputs)
+        all_saved_outputs.extend(saved_outputs)
+        modified_derivative = Derivative(
+            formula=modified_formula,
+            original_formula=derivative.formula,
+            var_names=tuple(var_names),
+            saved_inputs=tuple(saved_inputs),
+            saved_outputs=tuple(saved_outputs),
+            named_gradients=set(),
+        )
+        modified_derivative_formulas.append(modified_derivative)
+
+    with local.parametrize(
+        use_const_ref_for_mutable_tensors=foreach_function.use_const_ref_for_mutable_tensors,
+        use_ilistref_for_tensor_lists=foreach_function.part_of_structured_group,
+    ):
+        args_with_derivatives = [
+            Binding(
+                name=arg.name,
+                nctype=cpp.argument_type(arg, binds=arg.name),
+                argument=arg,
+                default=None,
+            )
+            for arg in foreach_function.func.arguments.flat_non_out
+            if arg.name in all_var_names
+        ]
+
+    forward_derivatives: List[ForwardDerivative] = []
+    fw_derivative: ForwardDerivative
+    for fw_derivative in ref_diff_info.forward_derivatives:
+        var_names: List[str] = list(fw_derivative.var_names)  # type: ignore[no-redef]
+        var_types: List[Type] = list(fw_derivative.var_types)
+        required_inputs_fw_grad: List[str] = []
+        required_inputs_primal: List[str] = []
+        if fw_derivative.required_inputs_fw_grad is not None:
+            required_inputs_fw_grad = list(fw_derivative.required_inputs_fw_grad)
+        if fw_derivative.required_inputs_primal:
+            required_inputs_primal = list(fw_derivative.required_inputs_primal)
+        modified_formula = fw_derivative.formula
+
+        # Foreach's result is TensorList
+        if "result" in modified_formula:
+            modified_formula = fw_derivative.formula.replace("result", "result[i]")
+
+        for foreach_arg, ref_arg in zip(
+            foreach_function.func.arguments.flat_non_out,
+            ref_diff_info.func.func.arguments.flat_non_out,
+        ):
+            # Modify reference forward formula
+            if (
+                isinstance(foreach_arg.type, ListType)
+                and not foreach_arg.type.is_tensor_like()
+            ):
+                # Assuming ScalarList
+                modified_formula = modified_formula.replace(
+                    ref_arg.name, foreach_arg.name + "[i]"
+                )
+            elif foreach_arg.type.is_tensor_like():
+                # Assuming TensorList / Tensor
+                # assert isinstance(foreach_arg.type, ListType), f"{foreach_function.func.name}, {foreach_arg.type}"
+                assert isinstance(foreach_arg.type, ListType) or (
+                    foreach_arg.type == BaseType(BaseTy.Tensor)
+                    and str(foreach_function.func.name) in _foreach_with_tensor_overload
+                ), f"{foreach_function.func.name}, {foreach_arg.type}"
+                for suffix in ("_p", "_t"):
+                    curr_expr = ref_arg.name + suffix
+                    if curr_expr in modified_formula:
+                        new_expr = foreach_arg.name + suffix
+                        modified_formula = modified_formula.replace(curr_expr, new_expr)
+            else:
+                # Assuming Scalar
+                if foreach_arg.name != ref_arg.name:
+                    modified_formula = modified_formula.replace(
+                        ref_arg.name, foreach_arg.name
+                    )
+
+            # note(crcrpar): there should exist a cooler way...
+            for i, name in enumerate(var_names):
+                if name == ref_arg.name:
+                    var_names[i] = foreach_arg.name
+                    var_types[i] = foreach_arg.type
+            for i, name in enumerate(required_inputs_fw_grad):
+                if name == ref_arg.name:
+                    required_inputs_fw_grad[i] = foreach_arg.name
+            for i, name in enumerate(required_inputs_primal):
+                if name == ref_arg.name:
+                    required_inputs_primal[i] = foreach_arg.name
+        forward_derivatives.append(
+            ForwardDerivative(
+                formula=modified_formula,
+                var_names=tuple(var_names),
+                var_types=tuple(var_types),
+                required_inputs_fw_grad=tuple(required_inputs_fw_grad),
+                required_inputs_primal=tuple(required_inputs_primal),
+                required_original_self_value=fw_derivative.required_original_self_value,
+                is_reusing_outplace_formula=fw_derivative.is_reusing_outplace_formula,
+            )
+        )
+
+    return (
+        DifferentiabilityInfo(
+            name=foreach_function.func.name.name.base,
+            func=foreach_function,
+            op=f"Foreach{ref_diff_info.op}{foreach_function.func.name.overload_name}",
+            derivatives=modified_derivative_formulas,
+            forward_derivatives=forward_derivatives,
+            all_saved_inputs=tuple(set(all_saved_inputs)),
+            all_saved_outputs=tuple(set(all_saved_outputs)),
+            available_named_gradients=(),
+            used_named_gradients=set(),
+            args_with_derivatives=args_with_derivatives,
+            non_differentiable_arg_names=[],
+            output_differentiability=None,
+            output_differentiability_conditions=None,
+        ),
+        True,
+    )
+
+
+def match_differentiability_info(
+    native_functions: List[NativeFunction],
+    differentiability_infos: Dict[FunctionSchema, Dict[str, DifferentiabilityInfo]],
+) -> List[NativeFunctionWithDifferentiabilityInfo]:
+    """Sets the "derivative" key on declarations to matching autograd function
+    In-place functions will use the out-of-place derivative definition if there
+    is no in-place specific derivative.
+    """
+
+    functional_info_by_signature = {
+        schema.signature(strip_default=True): info_dict
+        for schema, info_dict in differentiability_infos.items()
+        if schema.kind() == SchemaKind.functional
+    }
+    non_functional_info_by_signature = {
+        schema.signature(strip_default=True): info_dict
+        for schema, info_dict in differentiability_infos.items()
+        if schema.kind() != SchemaKind.functional
+    }
+
+    def find_info(
+        f: NativeFunction,
+    ) -> Tuple[Optional[Dict[str, DifferentiabilityInfo]], bool]:
+        # Don't bother matching info to generated out= variants
+        if "generated" in f.tags and f.func.kind() == SchemaKind.out:
+            return None, False
+
+        # (1) Check for an exact match
+        if f.func in differentiability_infos:
+            return differentiability_infos[f.func], True
+
+        # (2) If no exact match, check if the out-of-place variant
+        # of this operator has a match.
+        # i.e mul() for mul_() or mul_out()
+        # note(crcrpar): Check foreach or not because in-place foreach functions use backward defined for the existing
+        # native functions instead of the out-place counterparts.
+        f_sig = f.func.signature(strip_default=True)
+        if f_sig in functional_info_by_signature and not is_foreach_func(f):
+            return functional_info_by_signature[f_sig], False
+
+        # (3) Some operators have a derivative explicitly defined for the mutable
+        # variant, but get a code-generated out-of-place variant which does *not*
+        # come with a derivative formula.
+        # For the generated out-of-place variant, use the mutable variant's formula
+        # if it exists.
+        if "generated" in f.tags and f_sig in non_functional_info_by_signature:
+            info_dict = non_functional_info_by_signature[f_sig]
+            # See https://github.com/pytorch/pytorch/pull/76320/files#r874816389
+            assert not any(
+                any("self" in str(inpt.nctype.name) for inpt in info.all_saved_inputs)
+                for info in info_dict.values()
+            ), f"""\
+Attempted to convert a derivative formula for a mutable operator
+ to be used by automatically by its functional variant ("{str(f.func)}").
+ this is not currently supported (we'd need to fix up the formula in the codegen)."""
+            return info_dict, False
+
+        # (4) Generate derivative information of foreach functions if none is defined in `derivatives.yaml`
+        if is_foreach_func(f):
+            assert f.func not in differentiability_infos
+            diff_info, is_generated = gen_foreach_derivativeinfo(
+                f,
+                functional_info_by_signature,
+                non_functional_info_by_signature,
+            )
+            if diff_info is None:
+                return None, False
+            # TODO(crcrpar): Avoid hard coding "Default" ideally.
+            diff_info_dict = {"Default": diff_info}
+            if is_generated:
+                differentiability_infos[f.func] = diff_info_dict
+                functional_info_by_signature[f.func] = diff_info_dict
+            return diff_info_dict, is_generated
+
+        return None, False
+
+    result: List[NativeFunctionWithDifferentiabilityInfo] = []
+    for f in native_functions:
+        info_dict, is_exact_match = find_info(f)
+
+        # Currently, the '.strides()' to 'strides_or_error' replacement does not support
+        # 'self' derivatives of an inplace function, so we must check for this case.
+        if f.func.kind() == SchemaKind.inplace and (info_dict is not None):
+            for info in info_dict.values():
+                for derivative in info.derivatives:
+                    if "self" in derivative.var_names:
+                        for saved_input in derivative.saved_inputs:
+                            assert "strides_or_error" not in saved_input.expr, (
+                                "Calling '.strides()' in the 'self' derivative formula of an "
+                                f"in-place function is not supported: {f.func}"
+                            )
+
+        if not info_dict:
+            result.append(
+                NativeFunctionWithDifferentiabilityInfo(
+                    func=f, info=None, fw_derivatives=None
+                )
+            )
+            continue
+
+        fw_derivative_dict: Dict[str, Sequence[ForwardDerivative]] = {}
+        for key, info in info_dict.items():
+            if not info.forward_derivatives:
+                fw_derivative_dict[key] = []
+                continue
+
+            forward_derivatives = info.forward_derivatives
+
+            # For functions that have a single def for out-of-place and inplace (like abs())
+            if f.func.kind() == SchemaKind.inplace:
+                # For inplace functions there is a little bit of work to do:
+                #  1) Validate the formula and make sure the input that is modified in not used:
+                #    - If there is a formula for the inplace variant of the function (is_exact_match == True) then
+                #      we make sure that the original value of the input that is being modified inplace (self_p) is
+                #      not used in the formula. Note that the formula can use "original_self_p" here and that would
+                #      trigger a clone of the original input.
+                #    - If we are re-using the out of place formula (is_exact_match == False) then we replace every
+                #      occurrence of self_p and self_t by original_self_p and original_self_t. These will be
+                #      populated by cloned version of the original input (either the clone done by the backward AD
+                #      logic if self is also used in a backward formula or a special clone that we add).
+                #  2) At this point, there cannot be a self_p in the formula.
+                #  3) Change "result" into "self_p" as by design, in the inplace function codegen, the result is
+                #     simply called self (as it is modified inplace).
+                #  4) Update the required primals data in case it used to contain "result" but should now contain
+                #     "self"
+                #  5) If it is not an exact match, the user formula is not modifying the existing forward grad
+                #     inplace as it should. So add some code that makes sure that we do so if the forward grad
+                #     already exists.
+
+                assert (
+                    len(info.forward_derivatives) == 1
+                )  # Only single output inplace should exist
+                fw_info = info.forward_derivatives[0]
+                formula = fw_info.formula
+
+                def replace_self_with_original_self(formula: str, postfix: str) -> str:
+                    def repl(m: Match[str]) -> str:
+                        return f"{m.group(1)}original_self{postfix}{m.group(2)}"
+
+                    return re.sub(IDENT_REGEX.format(f"self{postfix}"), repl, formula)
+
+                if re.search(IDENT_REGEX.format("self_p"), formula):
+                    if is_exact_match:
+                        # For manually defined formulas, don't allow the original value to be used
+                        raise RuntimeError(
+                            f'The formula for "{f.func.name}" is using the original value of self '
+                            "that is being modified inplace. This would lead to wrong forward gradients. "
+                            'Please use "result" in the formula only.'
+                        )
+                    else:
+                        # When the original formula is out of place, we save a clone of the primal
+                        # value to be able to access this value if needed
+                        # replace "self_p"/"self_t" from the formula by "original_self_p"/"original_self_t"
+                        formula = replace_self_with_original_self(formula, "_p")
+                        formula = replace_self_with_original_self(formula, "_t")
+
+                # replace "result" from the formula by "self_p"
+                def repl(m: Match[str]) -> str:
+                    return f"{m.group(1)}self_p{m.group(2)}"
+
+                formula = re.sub(IDENT_REGEX.format("result"), repl, formula)
+
+                required_primals = fw_info.required_inputs_primal
+                if re.search(IDENT_REGEX.format("self_p"), formula):
+                    required_primals = (
+                        required_primals + ("self",) if required_primals else ("self",)
+                    )
+
+                if not is_exact_match:
+                    # NOTE [In-place forward AD formula Optimization]
+                    #
+                    # This optimization transforms the formula to directly do inplace, i.e.
+                    # instead of self_t.copy_(self_t.op()) we do self_t.op_() when the following are met:
+                    #
+                    # 1) the formula satisfies the pattern: "self_t.op(*args)"
+                    # 2) "op" in (1) needs to be the same as the op the derivative is for
+                    #
+                    # (2) may seem too strict, but currently the only ops that satisfy (1) also satisfy (2)
+                    # If there is a need, we can relax (2) to allow any op that has an in-place variant
+                    is_single_method_on_self_t = False
+                    directly_do_inplace = False
+                    op_name: Optional[str] = None
+                    between_parens: Optional[str] = None
+                    match = re.fullmatch(r"self_t.([\w]*)\((.*)\)", formula)
+                    if match:
+                        op_name, between_parens = match.group(1), match.group(2)
+
+                        # We want to...
+                        #   Match: self_t.op1(other_p.op2(arg))
+                        #   Avoid: self_t.op1(args) + self_t.op2(args)
+                        #   Avoid: self_t.op1(other_p.op2(arg)) + self_t.op2(args)
+                        def check_parens_nest_level_gt_zero(s: str) -> bool:
+                            level = 1
+                            for ch in s:
+                                if ch == ")":
+                                    level -= 1
+                                    if level == 0:
+                                        return False
+                                if ch == "(":
+                                    level += 1
+                            return True
+
+                        is_single_method_on_self_t = check_parens_nest_level_gt_zero(
+                            between_parens
+                        )
+                        directly_do_inplace = (
+                            is_single_method_on_self_t and op_name == info.name
+                        )
+
+                    if directly_do_inplace:
+                        assert op_name is not None
+                        assert between_parens is not None
+                        formula = f"self_t_raw.defined() ? self_t_raw.{op_name}_({between_parens}) : {formula}"
+                    else:
+                        # Make sure that the forward grad is modified inplace when the original formula
+                        # is out of place
+                        formula = f"self_t_raw.defined() ? self_t_raw.copy_({formula}) : {formula}"
+
+                required_original_self_value = bool(
+                    re.search(IDENT_REGEX.format("original_self_p"), formula)
+                ) or bool(re.search(IDENT_REGEX.format("original_self_t"), formula))
+
+                forward_derivatives = [
+                    ForwardDerivative(
+                        formula=formula,
+                        var_names=("self",),
+                        var_types=fw_info.var_types,
+                        required_inputs_fw_grad=fw_info.required_inputs_fw_grad,
+                        required_inputs_primal=required_primals,
+                        required_original_self_value=required_original_self_value,
+                        is_reusing_outplace_formula=not is_exact_match,
+                    ),
+                ]
+
+            fw_derivative_dict[key] = forward_derivatives
+
+        result.append(
+            NativeFunctionWithDifferentiabilityInfo(
+                func=f, info=info_dict, fw_derivatives=fw_derivative_dict
+            )
+        )
+
+    return result
+
+
+def is_differentiable(
+    name: str, type: Type, info: Optional[DifferentiabilityInfo]
+) -> bool:
+    return type.is_tensor_like() and (
+        info is None or name not in info.non_differentiable_arg_names
+    )
+
+
+def gen_differentiable_outputs(
+    fn: NativeFunctionWithDifferentiabilityInfo, key: str = "Default"
+) -> List[DifferentiableOutput]:
+    f = fn.func
+    info = fn.info[key] if fn.info else None
+    outputs: List[DifferentiableOutput] = [
+        DifferentiableOutput(
+            name=name,
+            type=ret.type,
+            cpp_type=cpp.return_type(ret, symint=True).cpp_type(),
+        )
+        for name, ret in zip(cpp.return_names(f), f.func.returns)
+    ]
+    output_differentiability = info.output_differentiability if info else None
+    if output_differentiability is not None:
+        if len(output_differentiability) != len(outputs):
+            raise RuntimeError(
+                f"The length of output_differentiability ({len(output_differentiability)}), "
+                f"does not match the number of outputs ({len(outputs)})."
+            )
+        differentiable_outputs: List[DifferentiableOutput] = []
+        if False in output_differentiability and f.func.kind() == SchemaKind.inplace:
+            raise RuntimeError(
+                "output_differentiability=False for inplace operation (version_counter won't get updated)"
+            )
+        for differentiable, output in zip(output_differentiability, outputs):
+            if differentiable:
+                differentiable_outputs.append(output)
+        return differentiable_outputs
+    candidate_differentiable_outputs = list(
+        filter(lambda r: is_differentiable(r.name, r.type, info), outputs)
+    )
+    if uses_single_grad(info):
+        return candidate_differentiable_outputs[:1]
+    else:
+        return candidate_differentiable_outputs
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/cpp.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5466030daa6baba3899373d7af220bdfa102b72
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/cpp.py
@@ -0,0 +1,467 @@
+from typing import List, Optional, Sequence, Set, Union
+
+from torchgen import local
+from torchgen.api.types import (
+    ArgName,
+    ArrayCType,
+    ArrayRefCType,
+    BaseCType,
+    BaseTypeToCppMapping,
+    Binding,
+    boolT,
+    ConstRefCType,
+    CType,
+    dimnameListT,
+    intArrayRefT,
+    iTensorListRefT,
+    ListCType,
+    longT,
+    MutRefCType,
+    NamedCType,
+    OptionalCType,
+    optionalIntArrayRefT,
+    optionalSymIntArrayRefT,
+    scalarT,
+    SpecialArgName,
+    symIntArrayRefT,
+    SymIntT,
+    tensorListT,
+    tensorOptionsT,
+    tensorT,
+    TupleCType,
+    VectorCType,
+    voidT,
+)
+from torchgen.model import (
+    Argument,
+    Arguments,
+    BaseTy,
+    BaseType,
+    FunctionSchema,
+    ListType,
+    NativeFunction,
+    OptionalType,
+    Return,
+    SelfArgument,
+    TensorOptionsArguments,
+    Type,
+)
+from torchgen.utils import assert_never
+
+# This file describes the translation of JIT schema to the public C++
+# API, which is what people use when they call functions like at::add.
+#
+# Prominent characteristics of the C++ API:
+#
+#   - dtype, layout, device and pin_memory are collected into
+#     a single C++ type TensorOptions  (the native functions API
+#     also has this, but tensor options is really most relevant
+#     for the C++ API; it makes calling kwarg factory functions
+#     pleasant)
+#
+#   - defaulting lives here (in fact, the dispatcher is completely
+#     oblivious of defaults!)
+#
+# BTW: policy on name collisions: we try not to have types with
+# collisions, but functions are fair game to collide
+
+
+def name(
+    func: FunctionSchema,
+    *,
+    faithful_name_for_out_overloads: bool = False,
+    symint_overload: bool = False,
+) -> str:
+    name = str(func.name.name)
+    if symint_overload:
+        name += "_symint"
+    if func.is_out_fn():
+        if faithful_name_for_out_overloads:
+            name += "_outf"
+        else:
+            name += "_out"
+
+    return name
+
+
+# Translation of "value types" in JIT schema to C++ API type.  Value
+# types look the same no matter if they are argument types or return
+# types.  Returns None if the type in question is not a value type.
+def valuetype_type(
+    t: Type,
+    *,
+    binds: ArgName,
+    remove_non_owning_ref_types: bool = False,
+    symint: bool = False,
+) -> Optional[NamedCType]:
+    if isinstance(t, BaseType):
+        if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar:
+            return None
+        elif str(t) == "SymInt":
+            if symint:
+                return NamedCType(binds, BaseCType(SymIntT))
+            else:
+                return NamedCType(binds, BaseCType(longT))
+        if remove_non_owning_ref_types:
+            if t.name == BaseTy.str:
+                raise AssertionError(
+                    "string ref->value conversion: not implemented yet"
+                )
+        # All other BaseType currently map directly to BaseCppTypes.
+        return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name]))
+    elif isinstance(t, OptionalType):
+        elem = valuetype_type(t.elem, binds=binds, symint=symint)
+        if elem is None:
+            return None
+        return NamedCType(binds, OptionalCType(elem.type))
+    elif isinstance(t, ListType):
+        if str(t.elem) == "bool":
+            assert t.size is not None
+            return NamedCType(binds, ArrayCType(BaseCType(boolT), t.size))
+        else:
+            return None
+    else:
+        raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+# Translation of types occurring in JIT arguments to a C++ argument type.
+# If remove_non_owning_ref_types is set, we'll guarantee that the outputed CType is not a non-owning reference type.
+# For example, we'll return std::vector<int> instead of IntArrayRef.
+# See Note [translation from C++ reference to value types]
+def argumenttype_type(
+    t: Type,
+    *,
+    mutable: bool,
+    binds: ArgName,
+    remove_non_owning_ref_types: bool = False,
+    symint: bool = False,
+) -> NamedCType:
+    # If it's a value type, do the value type translation
+    r = valuetype_type(
+        t,
+        binds=binds,
+        symint=symint,
+        remove_non_owning_ref_types=remove_non_owning_ref_types,
+    )
+    if r is not None:
+        return r
+
+    if isinstance(t, BaseType):
+        if t.name == BaseTy.Tensor:
+            if mutable and not local.use_const_ref_for_mutable_tensors():
+                return NamedCType(binds, MutRefCType(BaseCType(tensorT)))
+            else:
+                return NamedCType(binds, ConstRefCType(BaseCType(tensorT)))
+        elif t.name == BaseTy.Scalar:
+            return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+        else:
+            raise AssertionError(f"base type should have been value type {t}")
+    elif isinstance(t, OptionalType):
+        if str(t.elem) == "Tensor":
+            if mutable and not local.use_const_ref_for_mutable_tensors():
+                return NamedCType(
+                    binds, MutRefCType(BaseCType(tensorT))
+                )  # TODO: fix this discrepancy
+            else:
+                return NamedCType(
+                    binds, ConstRefCType(OptionalCType(BaseCType(tensorT)))
+                )
+        elif str(t.elem) == "Scalar":
+            return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT))))
+        elif isinstance(t.elem, ListType) and str(t.elem.elem) == "int":
+            return NamedCType(binds, BaseCType(optionalIntArrayRefT))
+        elif isinstance(t.elem, ListType) and str(t.elem.elem) == "SymInt":
+            if symint:
+                return NamedCType(binds, BaseCType(optionalSymIntArrayRefT))
+            else:
+                return NamedCType(binds, BaseCType(optionalIntArrayRefT))
+        elem = argumenttype_type(t.elem, mutable=mutable, binds=binds, symint=symint)
+        return NamedCType(binds, OptionalCType(elem.type))
+    elif isinstance(t, ListType):
+        # TODO: remove these special cases, ArrayRef fallthrough works fine
+        if str(t.elem) == "int":
+            if remove_non_owning_ref_types:
+                return NamedCType(binds, VectorCType(BaseCType(longT)))
+            else:
+                return NamedCType(binds, BaseCType(intArrayRefT))
+        if str(t.elem) == "SymInt":
+            if remove_non_owning_ref_types:
+                if symint:
+                    return NamedCType(binds, VectorCType(BaseCType(SymIntT)))
+                else:
+                    return NamedCType(binds, VectorCType(BaseCType(longT)))
+            else:
+                if symint:
+                    return NamedCType(binds, BaseCType(symIntArrayRefT))
+                else:
+                    return NamedCType(binds, BaseCType(intArrayRefT))
+        if str(t.elem) == "Tensor":
+            if local.use_ilistref_for_tensor_lists():
+                return NamedCType(binds, ConstRefCType(BaseCType(iTensorListRefT)))
+            else:
+                return NamedCType(binds, BaseCType(tensorListT))
+        elif str(t.elem) == "Scalar":
+            return NamedCType(binds, ArrayRefCType(BaseCType(scalarT)))
+        elif str(t.elem) == "Dimname":
+            return NamedCType(binds, BaseCType(dimnameListT))
+        elif str(t.elem) == "Tensor?":
+            return NamedCType(
+                binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))
+            )
+        elem = argumenttype_type(t.elem, mutable=mutable, binds=binds, symint=symint)
+        return NamedCType(binds, ArrayRefCType(elem.type))
+    else:
+        raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+# Translate a JIT argument into its C++ type
+def argument_type(a: Argument, *, binds: ArgName, symint: bool = False) -> NamedCType:
+    return argumenttype_type(a.type, mutable=a.is_write, symint=symint, binds=binds)
+
+
+# Translation of a (non-multi) return type from JIT to C++
+# N.B: returntype_type returns a CType, not a NamedCType.
+# This is mostly because of the mismatch between return types and return names.
+# e.g. a function with a return type of 'void' has 0 return names,
+# and a function with a return type of 'std::tuple' has >1 return name.
+def returntype_type(t: Type, *, mutable: bool, symint: bool = False) -> CType:
+    # placeholder is ignored
+    # NB: symint is ALWAYS respected for return types.  So symint argument
+    # here is IGNORED
+    r = valuetype_type(t, binds="__placeholder__", symint=True)
+    if r is not None:
+        return r.type
+
+    if isinstance(t, BaseType):
+        if t.name == BaseTy.Tensor:
+            if mutable:
+                if local.use_const_ref_for_mutable_tensors():
+                    return ConstRefCType(BaseCType(tensorT))
+                else:
+                    return MutRefCType(BaseCType(tensorT))
+            else:
+                # Note [Tensor Copy Returns]
+                # Currently, we use "Argument.is_write" to determine
+                # whether or not Tensor return types should be copies or references.
+                # If that ever changes, take a look at other locations of this note!
+                return BaseCType(tensorT)
+        elif t.name == BaseTy.Scalar:
+            return BaseCType(scalarT)
+    elif isinstance(t, ListType):
+        assert (
+            not mutable
+        ), "Native functions should never return a mutable tensor list. They should return void."
+        elem = returntype_type(t.elem, mutable=False)
+        assert t.size is None, f"fixed size list returns not supported: {t}"
+        return VectorCType(elem)
+    elif isinstance(t, OptionalType):
+        elem = returntype_type(t.elem, mutable=mutable)
+        if str(t.elem) == "Tensor":
+            return OptionalCType(elem)
+
+    raise AssertionError(f"unrecognized return type {t}")
+
+
+# Translation of a single return to its C++ type
+def return_type(r: Return, *, symint: bool = False) -> CType:
+    return returntype_type(r.type, mutable=r.is_write, symint=symint)
+
+
+# Translation of a full (possibly multi) return from JIT to its C++ type
+def returns_type(rs: Sequence[Return], *, symint: bool = False) -> CType:
+    if len(rs) == 0:
+        return BaseCType(voidT)
+    elif len(rs) == 1:
+        return return_type(rs[0], symint=symint)
+    else:
+        return TupleCType([return_type(r, symint=symint) for r in rs])
+
+
+def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequence[str]:
+    returns: List[str] = []
+    for i, r in enumerate(f.func.returns):
+        # If we have an inplace function, the return argument is
+        # implicitly named self.
+        # TODO: Consider incorporating this into the data model
+        if f.func.name.name.inplace:
+            assert i == 0, "illegal inplace function with multiple returns"
+            name = "self"
+        # If we are out function, the name is the name of the
+        # corresponding output function (r.name will get recorded
+        # in field_name later.)
+        elif f.func.is_out_fn():
+            name = f.func.arguments.out[i].name
+        # If the return argument is explicitly named...
+        elif r.name:
+            name_conflict = any(
+                r.name == a.name for a in f.func.schema_order_arguments()
+            )
+            if name_conflict and not f.func.is_out_fn():
+                name = f"{r.name}_return"
+            else:
+                name = r.name
+        # If there is no explicit name and no fallback name was passed in, we just name the output result,
+        # unless it's a multi-return, in which case it's result0,
+        # result1, etc (zero-indexed)
+        else:
+            name = fallback_name if len(f.func.returns) == 1 else f"{fallback_name}{i}"
+        returns.append(name)
+    return returns
+
+
+JIT_TO_CPP_DEFAULT = {
+    "False": "false",
+    "True": "true",
+    "None": "c10::nullopt",  # UGH this one is type directed
+    "Mean": "at::Reduction::Mean",
+    "[]": "{}",
+    "contiguous_format": "MemoryFormat::Contiguous",
+    "long": "at::kLong",
+}
+
+
+# Convert a JIT default into C++ expression representing the default
+def default_expr(d: str, t: Type, *, symint: bool) -> str:
+    if d == "None" and str(t) == "Tensor?":
+        return "{}"
+    if isinstance(t, BaseType) and t.name is BaseTy.str:
+        # Schema allows single quotes but C++ needs double
+        if len(d) >= 2 and d[0] == "'" and d[-1] == "'":
+            s = ""
+            i = 1
+            while i + 1 < len(d):
+                if d[i] != "\\":
+                    if d[i] == '"':
+                        s += '\\"'
+                    else:
+                        s += d[i]
+                    i += 1
+                else:
+                    if d[i + 1] == "'":
+                        s += "'"
+                    else:
+                        s += d[i : i + 2]
+                    i += 2
+
+            return f'"{s}"'
+
+    if isinstance(t, OptionalType):
+        if d == "None":
+            return "c10::nullopt"
+
+        return default_expr(d, t.elem, symint=symint)
+
+    if isinstance(t, ListType):
+        if d.startswith("[") and d.endswith("]"):
+            return "{" + d[1:-1] + "}"
+        elif symint and d.isdigit() and str(t.elem) == "SymInt":
+            return f"c10::SymInt({d})"
+        elif t.size is None:
+            # NOTE: Sized lists can have scalar defaults
+            raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
+
+    return JIT_TO_CPP_DEFAULT.get(d, d)
+
+
+# Convert an argument into its C++ API form
+
+
+def argument(
+    a: Union[Argument, TensorOptionsArguments, SelfArgument],
+    *,
+    cpp_no_default_args: Set[str],
+    method: bool,
+    faithful: bool,
+    symint: bool = False,
+    has_tensor_options: bool,
+) -> List[Binding]:
+    def sub_argument(
+        a: Union[Argument, TensorOptionsArguments, SelfArgument]
+    ) -> List[Binding]:
+        return argument(
+            a,
+            cpp_no_default_args=cpp_no_default_args,
+            method=method,
+            faithful=faithful,
+            symint=symint,
+            has_tensor_options=has_tensor_options,
+        )
+
+    if isinstance(a, Argument):
+        binds: ArgName
+        if a.name == "memory_format" and has_tensor_options:
+            binds = SpecialArgName.possibly_redundant_memory_format
+        else:
+            binds = a.name
+        default: Optional[str] = None
+        if a.name not in cpp_no_default_args and a.default is not None:
+            default = default_expr(a.default, a.type, symint=symint)
+        return [
+            Binding(
+                nctype=argument_type(a, binds=binds, symint=symint),
+                name=a.name,
+                default=default,
+                argument=a,
+            )
+        ]
+    elif isinstance(a, TensorOptionsArguments):
+        if faithful:
+            return (
+                sub_argument(a.dtype)
+                + sub_argument(a.layout)
+                + sub_argument(a.device)
+                + sub_argument(a.pin_memory)
+            )
+        else:
+            default = None
+            # Enforced by NativeFunction.__post_init__
+            assert "options" not in cpp_no_default_args
+            if all(x.default == "None" for x in a.all()):
+                default = "{}"
+            elif a.dtype.default == "long":
+                default = "at::kLong"  # TODO: this is wrong
+            return [
+                Binding(
+                    nctype=NamedCType("options", BaseCType(tensorOptionsT)),
+                    name="options",
+                    default=default,
+                    argument=a,
+                )
+            ]
+    elif isinstance(a, SelfArgument):
+        if method:
+            # Caller is responsible for installing implicit this in context!
+            return []
+        else:
+            return sub_argument(a.argument)
+    else:
+        assert_never(a)
+
+
+def arguments(
+    arguments: Arguments,
+    *,
+    faithful: bool,
+    symint: bool = False,
+    method: bool,
+    cpp_no_default_args: Set[str],
+) -> List[Binding]:
+    args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+    if faithful:
+        args.extend(arguments.non_out)
+        args.extend(arguments.out)
+    else:
+        args.extend(arguments.out)
+        args.extend(arguments.non_out)
+    return [
+        r.no_default() if faithful else r
+        for a in args
+        for r in argument(
+            a,
+            faithful=faithful,
+            symint=symint,
+            method=method,
+            has_tensor_options=arguments.tensor_options is not None,
+            cpp_no_default_args=cpp_no_default_args,
+        )
+    ]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/dispatcher.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..58816959f7cd2276274d71436c46d2c36315c631
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/dispatcher.py
@@ -0,0 +1,118 @@
+import itertools
+from typing import List, Sequence, Union
+
+from torchgen.api import cpp
+
+from torchgen.api.types import ArgName, Binding, CType, NamedCType
+from torchgen.model import (
+    Argument,
+    FunctionSchema,
+    Return,
+    SelfArgument,
+    TensorOptionsArguments,
+    Type,
+)
+from torchgen.utils import assert_never, concatMap
+
+# This file describes the translation of JIT schema to the dispatcher
+# API, the *unboxed* calling convention by which invocations through
+# the dispatcher are made.  Historically, the dispatcher API matched
+# the C++ API, but with the establishment of the boxed API, we've
+# made changes to the dispatcher API to so that the unboxed API
+# better aligns with the boxed API.  The dispatcher API hooks heavily
+# into our template based boxing/unboxing machinery, so changes
+# to this convention will usually need template updates too.
+#
+# Prominent characteristics of the dispatcher API:
+#
+#   - dtype, layout, device and pin_memory are represented as separate
+#     arguments.
+#
+
+
+def name(func: FunctionSchema) -> str:
+    return cpp.name(func)
+
+
+def argumenttype_type(
+    t: Type,
+    *,
+    mutable: bool,
+    binds: ArgName,
+    remove_non_owning_ref_types: bool = False,
+    symint: bool = True,
+) -> NamedCType:
+    # This is a faux amis.  If it makes sense in the future to add
+    # more special cases here, or invert things so cpp.argument_type
+    # calls this, or just completely inline the function, please do
+    # it.
+    return cpp.argumenttype_type(
+        t,
+        mutable=mutable,
+        binds=binds,
+        symint=symint,
+        remove_non_owning_ref_types=remove_non_owning_ref_types,
+    )
+
+
+def argument_type(
+    a: Argument,
+    *,
+    binds: ArgName,
+    remove_non_owning_ref_types: bool = False,
+    symint: bool = True,
+) -> NamedCType:
+    return argumenttype_type(
+        a.type,
+        mutable=a.is_write,
+        binds=binds,
+        remove_non_owning_ref_types=remove_non_owning_ref_types,
+        symint=symint,
+    )
+
+
+def returns_type(rs: Sequence[Return], *, symint: bool = True) -> CType:
+    # At present, there is no difference. But there could be!
+    return cpp.returns_type(rs, symint=symint)
+
+
+def jit_arguments(func: FunctionSchema) -> List[Argument]:
+    def to_argument(
+        a: Union[Argument, TensorOptionsArguments, SelfArgument]
+    ) -> List[Argument]:
+        if isinstance(a, Argument):
+            return [a]
+        elif isinstance(a, SelfArgument):
+            return [a.argument]
+        elif isinstance(a, TensorOptionsArguments):
+            return [a.dtype, a.layout, a.device, a.pin_memory]
+        else:
+            assert_never(a)
+
+    return list(
+        concatMap(
+            to_argument,
+            itertools.chain(
+                func.arguments.positional, func.arguments.kwarg_only, func.arguments.out
+            ),
+        )
+    )
+
+
+def argument(
+    a: Argument, *, remove_non_owning_ref_types: bool = False, symint: bool = True
+) -> Binding:
+    return Binding(
+        nctype=argument_type(
+            a,
+            binds=a.name,
+            remove_non_owning_ref_types=remove_non_owning_ref_types,
+            symint=symint,
+        ),
+        name=a.name,
+        argument=a,
+    )
+
+
+def arguments(func: FunctionSchema, *, symint: bool = True) -> List[Binding]:
+    return [argument(a, symint=symint) for a in jit_arguments(func)]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/lazy.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/lazy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b14e910be0b8abbdfaa838849a28e381ed34fab3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/lazy.py
@@ -0,0 +1,464 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from torchgen.api.types import (
+    BaseCppType,
+    BaseCType,
+    boolT,
+    CType,
+    deviceT,
+    doubleT,
+    generatorT,
+    layoutT,
+    ListCType,
+    longT,
+    memoryFormatT,
+    NamedCType,
+    OptionalCType,
+    scalarT,
+    scalarTypeT,
+    stringT,
+    SymIntT,
+    VectorCType,
+)
+
+from torchgen.model import (
+    Argument,
+    BaseTy,
+    BaseType,
+    FunctionSchema,
+    ListType,
+    OperatorName,
+    OptionalType,
+    Return,
+    TensorOptionsArguments,
+    Type,
+)
+
+
+_valueT: Optional[BaseCppType] = None
+
+
+# A ValueT is an IR type which represents the computation of a Tensor.  In other
+# words, a PyTorch user will do operations on lazy tensors, and each output lazy
+# tensor internally tracks a ValueT representing the IR node that would have
+# actually produced the value of this tensor for real.
+#
+# This is configurable because different lazy tensor backends (LTC vs XLA) will
+# have different IR representations.  (Though, arguably, after unification they
+# shouldn't!)
+def getValueT() -> BaseCppType:
+    global _valueT
+    if not _valueT:
+        raise NotImplementedError(
+            "The value type needs to be set with setValueT() in run_gen_lazy_tensor()"
+        )
+
+    return _valueT
+
+
+def setValueT(val: BaseCppType) -> None:
+    global _valueT
+    _valueT = val
+
+
+# this is a bad hack. I need to refactor the data model to represent each arg in the schema as an object,
+# making it easier to represent special properties of an arg.
+tensorListValueT = BaseCppType("torch::lazy", "Value")
+
+
+def process_ir_type(
+    typ: Type, properties: "LazyIrProperties", *, symint: bool
+) -> Union[BaseCType, VectorCType, OptionalCType, ListCType]:
+    """
+    This function takes a type from NativeFunctions and converts it for use with
+    lazy tensor codegen.
+
+    Type conversion for lazy currently consists of
+     (1) changing at::Tensors into lazy::Values
+     (2) wrapping everything in a BaseCType
+     (3) making cpp-reference types into cpp-value types (e.g. vector instead of IntArrayRef)
+
+    (1) converts at::Tensors to lazy::Values (which wrap lazy::Nodes, with which Lazy IR represents tensors.)
+    There is special handling for Optional[Tensor] or List[Tensor], etc- hence 'tensor-like'
+
+    This is incomplete- there are assertions in places that it's expected to need to add
+    more types as the codegen is used with more operators.
+    """
+    if isinstance(typ, BaseType):
+        if typ.name == BaseTy.Tensor:
+            return BaseCType(getValueT())
+        elif typ.name == BaseTy.Scalar:
+            if properties.TreatScalarsAsConstants:
+                return BaseCType(scalarT)
+            # at::scalar has special handling,
+            # and is wrapped in an lazy::Value just like at::tensor
+            return BaseCType(getValueT())
+        elif typ.name == BaseTy.ScalarType:
+            return BaseCType(scalarTypeT)
+        elif typ.name == BaseTy.int:
+            return BaseCType(longT)
+        elif typ.name == BaseTy.SymInt:
+            if symint:
+                return BaseCType(getValueT())
+            else:
+                return BaseCType(longT)
+        elif typ.name == BaseTy.bool:
+            return BaseCType(boolT)
+        elif typ.name == BaseTy.float:
+            return BaseCType(doubleT)
+        elif typ.name == BaseTy.str:
+            return BaseCType(stringT)
+        elif typ.name == BaseTy.Device:
+            return BaseCType(deviceT)
+        elif typ.name == BaseTy.Generator:
+            return BaseCType(generatorT)
+        elif typ.name == BaseTy.Layout:
+            return BaseCType(layoutT)
+        elif typ.name == BaseTy.MemoryFormat:
+            return BaseCType(memoryFormatT)
+        else:
+            raise AssertionError(f"TODO add support for type {repr(typ)}")
+    elif isinstance(typ, OptionalType):
+        return OptionalCType(process_ir_type(typ.elem, properties, symint=symint))
+    elif isinstance(typ, ListType):
+        if str(typ.elem) == "Tensor?":
+            # TODO(whc) is this actually correct? or should it use a Vector like above
+            return ListCType(OptionalCType(BaseCType(getValueT())))
+        elif str(typ.elem) == "Tensor":
+            # this is a TensorList which comes in from GetTensorList as a Value
+            return BaseCType(tensorListValueT)
+        elif typ.elem == BaseType(BaseTy.SymInt):
+            # TODO: return a value type.  The problem here is analogous to
+            # the problem with tensorListValueT: if you have SymInt[] you
+            # cannot conveniently save the list of Value directly, as nodes
+            # expect to save values as a vector for ALL arguments.  So you
+            # need a separate IR node that represents all of the size nodes
+            # assembled into a list.  I'm not an LTC dev so I don't want to
+            # figure it out right now.  Y'all figure it out...
+            return VectorCType(BaseCType(longT))
+
+        else:
+            return VectorCType(process_ir_type(typ.elem, properties, symint=symint))
+    else:
+        raise AssertionError(f"unrecognized type {repr(typ)}")
+
+
+# TODO: Determining this based off of CType is bad; this should be computed
+# from Type directly; then the same logic as process_ir_type can be used
+#
+# Invariant: passed typ should be an *owning* CType (e.g., we will report
+# that ArrayRef<Value> is NOT a value type)
+def isValueType(typ: CType, properties: "Optional[LazyIrProperties]" = None) -> bool:
+    """
+    Given a type, determine if it is a Value-like type.  This is equivalent to
+    being Tensor-like, but assumes the type has already been transformed.
+    """
+    if isinstance(typ, BaseCType):
+        # I am regretting my naming conventions, but now we are wrapping at::scalar in
+        # lazy value, while preserving other 'scalar' types as scalars in the IR
+        treat_scalars_as_constants = properties and properties.TreatScalarsAsConstants
+        return (
+            typ.type == getValueT()
+            or (typ.type == scalarT and not treat_scalars_as_constants)
+            or typ.type == SymIntT
+        )
+    elif typ == VectorCType(BaseCType(SymIntT)):
+        # TODO: report True for this
+        return False
+    elif isinstance(typ, (OptionalCType, ListCType, VectorCType)):
+        return isValueType(typ.elem, properties)
+    return False
+
+
+def isSymIntType(typ: Type) -> bool:
+    return isinstance(typ, BaseType) and typ.name == BaseTy.SymInt
+
+
+def isWrappedScalarType(typ: Type) -> bool:
+    """
+    Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value.
+    Since we literally change the type from scalarT to valueT, information is lost.
+    This function helps build a list of wrapped scalars to save that information
+    """
+    if isinstance(typ, BaseType):
+        # I am regretting my naming conventions, but now we are wrapping at::scalar in
+        # lazy value, while preserving other 'scalar' types as scalars in the IR
+        return typ.name == BaseTy.Scalar
+    elif isinstance(typ, (OptionalType, ListType)):
+        return isWrappedScalarType(typ.elem)
+    return False
+
+
+# TODO: dedupe with Type.is_generator_like
+def isGeneratorType(typ: Type) -> bool:
+    if isinstance(typ, BaseType):
+        return typ.name == BaseTy.Generator
+    elif isinstance(typ, (OptionalType)):
+        return isGeneratorType(typ.elem)
+    return False
+
+
+# This class caches a few derived properties computed from an Argument
+# and LazyIrProperties
+class LazyArgument:
+    name: str
+    orig_type: Type
+    lazy_type_: Optional[CType]
+    is_wrapped_scalar: bool
+    is_generator: bool
+    # TODO: this is lies, it is false for symint list
+    is_symint_or_list: bool
+
+    # Whether or not we are treating this as symint or not
+    symint: bool
+
+    # true if this argument is or contains a lazy IR value
+    is_lazy_value: bool
+
+    def __init__(self, arg: Argument, properties: "LazyIrProperties", *, symint: bool):
+        self.name = arg.name
+        self.orig_type = arg.type
+        self.symint = symint
+        self.is_optional = isinstance(arg.type, OptionalType)
+        self.is_generator = isGeneratorType(arg.type)
+        self.lazy_type_ = process_ir_type(arg.type, properties, symint=symint)
+        self.is_wrapped_scalar = isWrappedScalarType(arg.type)
+        self.is_symint_or_list = symint and (
+            isSymIntType(arg.type)
+            or (isinstance(arg.type, OptionalType) and isSymIntType(arg.type.elem))
+            # TODO: lists of symints are not currently treated as value types
+            # or (isinstance(arg.type, ListType) and isSymIntType(arg.type.elem))
+        )
+
+        self.is_lazy_value = isValueType(self.lazy_type, properties)
+
+    @property
+    def lazy_type(self) -> CType:
+        assert (
+            self.lazy_type_ is not None
+        ), f"Attempted to access lazy_type for invalid argument {self.name}"
+        return self.lazy_type_
+
+
+class LazyIrProperties:
+    """Collection of properties for an IR node
+
+    The property groups are listed below. Each group is mutually
+    exclusive, meaning that only one property from each group can be True
+    at any one time. The properties can be accessed as if they were normal
+    attributes. The mutual exclusivity is automatically handled.
+    """
+
+    Properties: Tuple[Tuple[str, ...], ...] = (
+        (
+            "ShapePrecompute",  # Assume shape has been precomputed
+            "ShapeCompute",  # Need to compute the shape on construction
+            "ShapeCache",  # Utilize the shape cache to defer computation
+        ),
+        (
+            "Lower",  # Codegen full lower function
+            "LowerDeclOnly",  # Codegen only lower function declaration
+        ),
+        (
+            "CanBeReused",  # Codegen full reuse function
+            "CanBeReusedDeclOnly",  # Codegen only reuse function declaration
+        ),
+        (
+            "CreateFn",  # Codegen full create function
+            "CreateFnDeclOnly",  # Codegen only create function declaration
+        ),
+        (
+            "TreatScalarsAsConstants",  # Treat Scalars as constants instead of handling like values
+        ),
+    )
+
+    def __init__(self, *default_properties: str):
+        properties: Dict[Tuple[str, ...], Optional[str]] = dict.fromkeys(
+            LazyIrProperties.Properties
+        )
+        self.__dict__["properties"] = properties
+        for p in default_properties:
+            setattr(self, p, True)
+
+    def __getattr__(self, key: str) -> Any:
+        properties = self.__dict__["properties"]
+        for values in LazyIrProperties.Properties:
+            if key in values:
+                return properties[values] == key
+
+        return self.__getattribute__(key)
+
+    def __setattr__(self, key: str, value: Any) -> Any:
+        properties = self.__dict__["properties"]
+        for values in LazyIrProperties.Properties:
+            if key in values:
+                properties[values] = key if value else None
+                return value
+
+        raise KeyError(f"Invalid property: {key}")
+
+
+# Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node.
+# Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML),
+# but carries type information from a native FunctionSchema modified for use with IR nodes,
+# and preserving original argument names.
+#
+# TODO: This is not idiomatic with how other torchgen APIs transform on schema.
+class LazyIrSchema:
+    # The name of the operator this function schema describes.
+    name: "OperatorName"
+
+    positional_args: Tuple[LazyArgument, ...]
+    keyword_args: Tuple[LazyArgument, ...]
+
+    # TODO: Need to handle collisions with argument names at some point
+    returns: Tuple["Return", ...]
+
+    # if this schema has a Generator arg, list its orig ctype/name but don't
+    # build a LazyArgument since lazy IR doesn't support it
+    generator_arg: Optional[NamedCType] = None
+
+    # original function schema
+    func: FunctionSchema
+
+    # Whether or not we are code-genning for SymInt or not
+    symint: bool
+
+    properties: LazyIrProperties = LazyIrProperties(
+        # default properties
+        "ShapePrecompute",
+        "Lower",
+        "CanBeReused",
+    )
+    opkind: Optional[str] = None
+
+    def __init__(
+        self,
+        func: FunctionSchema,
+        properties: Optional[LazyIrProperties] = None,
+        *,
+        symint: bool,
+    ):
+        if properties:
+            self.properties = properties
+
+        self.func = func
+        self.symint = symint
+        positional_args: List[LazyArgument] = []
+        for arg_field in ["pre_self_positional", "self_arg", "post_self_positional"]:
+            if arg_field == "self_arg" and func.arguments.self_arg is not None:
+                arg = func.arguments.self_arg.argument
+                positional_args.append(
+                    LazyArgument(arg, self.properties, symint=symint)
+                )
+            elif getattr(func.arguments, arg_field) is not None:
+                positional_args.extend(
+                    LazyArgument(arg, self.properties, symint=symint)
+                    for arg in getattr(func.arguments, arg_field)
+                )
+        self.positional_args = tuple(positional_args)
+
+        keyword_args: List[LazyArgument] = []
+        for arg_field in [
+            "pre_tensor_options_kwarg_only",
+            "tensor_options",
+            "post_tensor_options_kwarg_only",
+            "out",
+        ]:
+            curr_args = getattr(func.arguments, arg_field)
+            if curr_args is not None:
+                if isinstance(curr_args, TensorOptionsArguments):
+                    curr_args = curr_args.all()
+                for arg in curr_args:
+                    if isGeneratorType(arg.type):
+                        assert (
+                            self.generator_arg is None
+                        ), "We expect there is only one generator arg"
+                        self.generator_arg = NamedCType(
+                            arg.name, arg.type  # type:ignore[arg-type]
+                        )
+                keyword_args.extend(
+                    LazyArgument(arg, self.properties, symint=symint)
+                    for arg in curr_args
+                )
+        self.keyword_args = tuple(keyword_args)
+        self.name = func.name
+        self.returns = func.returns
+
+    @property
+    def node_name(self) -> str:
+        """
+        Return camel-case version of op in node.
+
+        Note: This function also appends any `overload_name` in the operation.
+        For example, if the op is `bitwise_and.Tensor`, the returned name
+        will be `BitwiseAndTensor`.
+        """
+        op_name = f"{self.name.name}_{self.name.overload_name}".lower()
+        return "".join(word.capitalize() or "" for word in op_name.split("_"))
+
+    @property
+    def aten_name(self) -> str:
+        return str(self.name.name)
+
+    @property
+    def base_name(self) -> str:
+        return f"{self.name.name.base}"
+
+    def filtered_args(
+        self,
+        positional: bool = True,
+        keyword: bool = True,
+        values: bool = True,
+        scalars: bool = True,
+        generator: bool = True,
+    ) -> List[LazyArgument]:
+        # This function maintains the sorted order of arguments but provides different filtered views.
+        # Some parts of the code care about kwargs vs args (TS lowerings),
+        # other parts care about whether they need to wrap the arg in a lazy value or leave it alone.
+        # Generators are special cased, as they are needed for fallback/shape-inference but not supported
+        # in TS lowerings and therefore also omitted from lazy IR.
+        args: List[LazyArgument] = []
+        if positional:
+            args.extend(self.positional_args)
+        if keyword:
+            args.extend(self.keyword_args)
+
+        if values and scalars and generator:
+            return args
+        elif values and scalars:
+            return [a for a in args if not a.is_generator]
+        elif values:
+            return [a for a in args if a.is_lazy_value]
+        elif scalars:
+            return [
+                a
+                for a in args
+                if not a.is_lazy_value and (generator or not a.is_generator)
+            ]
+
+        return []
+
+    @property
+    def positional_values(self) -> List[LazyArgument]:
+        return self.filtered_args(
+            positional=True, keyword=False, values=True, scalars=False
+        )
+
+    @property
+    def positional_scalars(self) -> List[LazyArgument]:
+        return self.filtered_args(
+            positional=True, keyword=False, values=False, scalars=True
+        )
+
+    @property
+    def keyword_values(self) -> List[LazyArgument]:
+        return self.filtered_args(
+            positional=False, keyword=True, values=True, scalars=False
+        )
+
+    @property
+    def keyword_scalars(self) -> List[LazyArgument]:
+        return self.filtered_args(
+            positional=False, keyword=True, values=False, scalars=True
+        )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/meta.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/meta.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad488d303d46329ba198d7f077b617704655b3b6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/meta.py
@@ -0,0 +1,12 @@
+from torchgen.model import NativeFunctionsGroup
+
+# Follows dispatcher calling convention, but:
+#   - Mutable arguments not allowed.  Meta functions are always
+#     written in functional form.  Look at FunctionSchema.signature()
+#   - No tensor returns; instead we return a TensorMeta describing
+#     the tensor in question
+
+
+def name(g: NativeFunctionsGroup) -> str:
+    # use the overload name from the functional version
+    return str(g.functional.func.name).replace(".", "_")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/native.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/native.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f8b3eb3af2e7e90ade39afb0f3c559951b69b99
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/native.py
@@ -0,0 +1,153 @@
+from typing import List, Optional, Sequence, Union
+
+from torchgen import local
+from torchgen.api import cpp
+
+from torchgen.api.types import (
+    ArgName,
+    BaseCType,
+    Binding,
+    boolT,
+    ConstRefCType,
+    CType,
+    deviceT,
+    layoutT,
+    ListCType,
+    MutRefCType,
+    NamedCType,
+    OptionalCType,
+    scalarT,
+    scalarTypeT,
+    tensorT,
+)
+from torchgen.model import (
+    Argument,
+    FunctionSchema,
+    Return,
+    SelfArgument,
+    TensorOptionsArguments,
+    Type,
+)
+from torchgen.utils import assert_never
+
+# This file describes the translation of JIT schema to the native functions API.
+# This looks a lot like the C++ API (which makes historical sense, because the
+# idea was you wrote native functions to implement functions in the C++ API),
+# but over time we have evolved the C++ API without actually changing our
+# native:: kernels.  The intention is to make native API and dispatcher API
+# line up as closely as possible, since this results in the least overhead
+# (no translation is needed from dispatcher API to native API).
+#
+# NB: this is symint aware, you will get the non-SymInt variant for some
+# dispatch entries and SymInt for others.
+
+
+def name(func: FunctionSchema) -> str:
+    name = str(func.name.name)
+    # TODO: delete this!
+    if func.is_out_fn():
+        name += "_out"
+    if func.name.overload_name:
+        name += f"_{func.name.overload_name}"
+    return name
+
+
+def argumenttype_type(
+    t: Type, *, mutable: bool, binds: ArgName, symint: bool
+) -> NamedCType:
+    if str(t) == "Tensor?":
+        tensor_type: OptionalCType = OptionalCType(BaseCType(tensorT))
+        if mutable and not local.use_const_ref_for_mutable_tensors():
+            return NamedCType(binds, MutRefCType(tensor_type))
+        else:
+            return NamedCType(binds, ConstRefCType(tensor_type))
+    elif str(t) == "Tensor?[]":
+        return NamedCType(
+            binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))
+        )
+    elif str(t) == "Scalar":
+        return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+    elif str(t) == "Scalar?":
+        return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT))))
+    return cpp.argumenttype_type(t, mutable=mutable, binds=binds, symint=symint)
+
+
+def returns_type(rs: Sequence[Return], *, symint: bool) -> CType:
+    return cpp.returns_type(rs, symint=symint)
+
+
+def argument_type(a: Argument, *, binds: ArgName, symint: bool) -> NamedCType:
+    return argumenttype_type(a.type, mutable=a.is_write, binds=binds, symint=symint)
+
+
+def argument(
+    a: Union[Argument, SelfArgument, TensorOptionsArguments],
+    *,
+    is_out: bool,
+    symint: bool,
+) -> List[Binding]:
+    # Ideally, we NEVER default native functions.  However, there are a number
+    # of functions that call native:: directly and rely on the defaulting
+    # existing.  So for BC, we generate defaults for non-out variants (but not
+    # for out variants, where it is impossible to generate an appropriate
+    # default)
+    should_default = not is_out
+    if isinstance(a, Argument):
+        default: Optional[str] = None
+        if should_default and a.default is not None:
+            default = cpp.default_expr(a.default, a.type, symint=symint)
+        return [
+            Binding(
+                nctype=argument_type(a, binds=a.name, symint=symint),
+                name=a.name,
+                default=default,
+                argument=a,
+            )
+        ]
+    elif isinstance(a, SelfArgument):
+        # Erase SelfArgument from the distinction
+        return argument(a.argument, is_out=is_out, symint=symint)
+    elif isinstance(a, TensorOptionsArguments):
+        default = None
+        if should_default:
+            default = "{}"
+        # TODO: Not sure why the arguments assigned here are for
+        # TensorOptionsArguments and not the constituent pieces.  It seems
+        # to matter
+        return [
+            Binding(
+                nctype=NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))),
+                name="dtype",
+                default=default,
+                argument=a,
+            ),
+            Binding(
+                nctype=NamedCType("layout", OptionalCType(BaseCType(layoutT))),
+                name="layout",
+                default=default,
+                argument=a,
+            ),
+            Binding(
+                nctype=NamedCType("device", OptionalCType(BaseCType(deviceT))),
+                name="device",
+                default=default,
+                argument=a,
+            ),
+            Binding(
+                nctype=NamedCType("pin_memory", OptionalCType(BaseCType(boolT))),
+                name="pin_memory",
+                default=default,
+                argument=a,
+            ),
+        ]
+    else:
+        assert_never(a)
+
+
+def arguments(func: FunctionSchema, *, symint: bool) -> List[Binding]:
+    args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+    args.extend(func.arguments.non_out)
+    args.extend(func.arguments.out)
+    return [
+        r for arg in args for r in argument(arg, symint=symint, is_out=func.is_out_fn())
+    ]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/structured.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/structured.py
new file mode 100644
index 0000000000000000000000000000000000000000..392b8a67e01e80ae7d4d15aafe99e6e6fe75cff6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/structured.py
@@ -0,0 +1,157 @@
+from typing import List, Union
+
+from torchgen.api import cpp
+
+from torchgen.api.types import (
+    ArgName,
+    ArrayRefCType,
+    BaseCType,
+    Binding,
+    ConstRefCType,
+    dimnameListT,
+    intArrayRefT,
+    iOptTensorListRefT,
+    iTensorListRefT,
+    NamedCType,
+    OptionalCType,
+    optionalIntArrayRefT,
+    optionalScalarRefT,
+    optionalTensorRefT,
+    scalarT,
+    tensorT,
+)
+from torchgen.model import (
+    Argument,
+    BaseTy,
+    BaseType,
+    ListType,
+    NativeFunctionsGroup,
+    OptionalType,
+    SelfArgument,
+    TensorOptionsArguments,
+    Type,
+)
+from torchgen.utils import assert_never
+
+# This file describes the translation of JIT schema to the structured functions API.
+# This is similar to native API, but a number of historical problems with native
+# API have been fixed.
+
+
+# Translation of types occurring in JIT arguments to a C++ argument type.
+# NB: For now, mutable doesn't do anything; but it could if we make
+# some more nominal types
+def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> NamedCType:
+    # If it's a value type, do the value type translation
+    # NB: structured kernels ALWAYS have symint off, since they involve actual
+    # kernels that require real ints.  The one exception is the
+    # CompositeExplicitAutograd and the meta function (which could
+    # hypothetically be SymInt), but for simplicity we plan for these to just
+    # be handled in Python
+    r = cpp.valuetype_type(t, symint=False, binds=binds)
+    if r is not None:
+        return r
+
+    if isinstance(t, BaseType):
+        if t.name == BaseTy.Tensor:
+            return NamedCType(binds, ConstRefCType(BaseCType(tensorT)))
+        elif t.name == BaseTy.Scalar:
+            return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
+        else:
+            raise AssertionError(f"base type should have been value type {t}")
+    elif isinstance(t, OptionalType):
+        if t.elem == BaseType(BaseTy.Tensor):
+            return NamedCType(binds, BaseCType(optionalTensorRefT))
+        elif t.elem == BaseType(BaseTy.Scalar):
+            return NamedCType(binds, BaseCType(optionalScalarRefT))
+        elif isinstance(t.elem, ListType) and str(t.elem.elem) == "int":
+            return NamedCType(binds, BaseCType(optionalIntArrayRefT))
+        elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
+        return NamedCType(binds, OptionalCType(elem.type))
+    elif isinstance(t, ListType):
+        if t.elem == BaseType(BaseTy.Tensor):
+            return NamedCType(binds, ConstRefCType(BaseCType(iTensorListRefT)))
+        elif t.elem == OptionalType(BaseType(BaseTy.Tensor)):
+            return NamedCType(binds, BaseCType(iOptTensorListRefT))
+        # TODO: delete these special cases; see torchgen.api.cpp--these
+        # must be changed in tandem, but there are problems; see
+        # https://github.com/pytorch/pytorch/pull/51485
+        elif str(t.elem) == "int":
+            return NamedCType(binds, BaseCType(intArrayRefT))
+        elif str(t.elem) == "Dimname":
+            return NamedCType(binds, BaseCType(dimnameListT))
+        elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
+        return NamedCType(binds, ArrayRefCType(elem.type))
+    else:
+        raise AssertionError(f"unrecognized type {repr(t)}")
+
+
+def argument_type(a: Argument, *, binds: ArgName) -> NamedCType:
+    return argumenttype_type(a.type, mutable=a.is_write, binds=binds)
+
+
+# returns_type intentionally omitted, because structured kernels never "return";
+# instead, they always indirectly report their outputs (in the case of a meta
+# function, by calling set_output; in the case of an impl function, by writing
+# directly into the provided out argument).
+
+
+# Structured kernels are never defaulted
+def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[Binding]:
+    if isinstance(a, Argument):
+        return [
+            Binding(
+                nctype=argument_type(a, binds=a.name),
+                name=a.name,
+                default=None,
+                argument=a,
+            )
+        ]
+    elif isinstance(a, SelfArgument):
+        return argument(a.argument)
+    elif isinstance(a, TensorOptionsArguments):
+        raise AssertionError("structured kernels don't support TensorOptions yet")
+    else:
+        assert_never(a)
+
+
+def impl_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+    args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+
+    if g.out.precomputed:
+        # A list of parameters for the impl function with
+        # certain parameters replaced with precomputed counterparts
+        # as specified in native_functions.yaml.
+        non_out_args_replaced: List[
+            Union[Argument, TensorOptionsArguments, SelfArgument]
+        ] = []
+        for a in g.out.func.arguments.non_out:
+            if isinstance(a, Argument) and a.name in g.out.precomputed.replace:
+                # If a is in precompute.replace, append the parameters
+                # that should replace it onto non_out_args_replaced.
+                non_out_args_replaced.extend(g.out.precomputed.replace[a.name])
+            else:
+                # If not, push a as it is.
+                non_out_args_replaced.append(a)
+
+        args.extend(non_out_args_replaced)
+        # g.out.precomputed.add is the list of parameters that are added
+        # without replacement after the non out args and just before the out args
+        args.extend(g.out.precomputed.add)
+    else:
+        args.extend(g.out.func.arguments.non_out)
+
+    args.extend(g.out.func.arguments.out)
+    return [r for arg in args for r in argument(arg)]
+
+
+def meta_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+    args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+    args.extend(g.functional.func.arguments.non_out)
+    return [r for arg in args for r in argument(arg)]
+
+
+def out_arguments(g: NativeFunctionsGroup) -> List[Binding]:
+    args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+    args.extend(g.out.func.arguments.out)
+    return [r for arg in args for r in argument(arg)]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/translate.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/translate.py
new file mode 100644
index 0000000000000000000000000000000000000000..f59b6eab24d6a031738fe33dfaff30c1f3a36ad5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/translate.py
@@ -0,0 +1,430 @@
+from typing import Dict, List, NoReturn, Sequence, Union
+
+from torchgen.api.types import (
+    ArrayRefCType,
+    BaseCType,
+    Binding,
+    boolT,
+    ConstRefCType,
+    deviceT,
+    Expr,
+    intArrayRefT,
+    iOptTensorListRefT,
+    layoutT,
+    ListCType,
+    longT,
+    memoryFormatT,
+    MutRefCType,
+    NamedCType,
+    opmath_t,
+    OptionalCType,
+    optionalIntArrayRefT,
+    optionalScalarRefT,
+    optionalSymIntArrayRefT,
+    optionalTensorRefT,
+    scalar_t,
+    scalarT,
+    scalarTypeT,
+    SpecialArgName,
+    symIntArrayRefT,
+    SymIntT,
+    tensorOptionsT,
+    tensorT,
+    VectorCType,
+)
+
+# This file implements a small program synthesis engine that implements
+# conversions between one API to another.
+#
+# The key data type in this file in NamedCType, short for Named C++ semantic type.  A NamedCType
+# represents a C++ type, plus semantic information about what it represents.
+# For example, consider the argument "bool pin_memory"; its normal C++ type is
+# "bool", but its C++ semantic type also keeps track that this represents a
+# "pin_memory"; you can't just use a random other boolean in a context where you
+# need a "pin_memory"!
+#
+# The translator takes a list of needed NamedCTypes, and then figures out how
+# to construct expressions with these NamedCTypes from the given bindings.  Many
+# of these expressions are trivial (I need a Tensor other; there's a Tensor
+# other scope); others are more nontrivial and may require packing/unpacking.
+# Some examples of non-trivial action:
+#
+#   - Need the "dtype" binding?  Well, maybe "dtype" isn't available
+#     in the context, instead, "options" is, and you need to extract
+#     it from there.  (Gather)
+#
+#   - Need the "context" binding?  Well, maybe "context" isn't available
+#     in the context, and you need to construct it from "dtype", "device",
+#     etc.  (Scatter)
+#
+#   - Need the "memory_format" binding?  Well, actually, it's available
+#     from both "memory_format" and "options", so you had better make sure
+#     they are consistent.  (Join)
+
+options_ctype = NamedCType("options", ConstRefCType(BaseCType(tensorOptionsT)))
+
+out_tensor_ctype = NamedCType("out", ConstRefCType(BaseCType(tensorT)))
+
+longVec_ctype = VectorCType(BaseCType(longT))
+longSymVec_ctype = VectorCType(BaseCType(SymIntT))
+optionalLongVec_ctype = OptionalCType(VectorCType(BaseCType(longT)))
+optionalScalar_ctype = OptionalCType(BaseCType(scalarT))
+optionalTensor_ctype = OptionalCType(BaseCType(tensorT))
+
+
+class UnsatError(RuntimeError):
+    pass
+
+
+# Given a set of in-scope bindings and a set of target bindings, synthesize
+# a list of expressions that uses only the in-scope bindings (bindings) that
+# have all of the types of goals.  You may want to use this function if
+# you're generating code for a function like:
+#
+#   void f({args}) {
+#     g({exprs}); // g is a different API
+#   }
+#
+# and you need to generate "exprs".
+#
+# Typically, a list of Bindings is convenient to get (you usually call something
+# like arguments() to get them); but technically you only need less information:
+# for 'bindings' an (un-ordered) list of Exprs is sufficient; similarly, for
+# 'goals', an (ordered) list of NamedCType goals is sufficient.  If you are doing
+# something more complicated, e.g., tracking the set of bindings in a context,
+# you may find using these smaller types more convenient.
+def translate(
+    bindings: Sequence[Union[Expr, Binding]],
+    goals: Sequence[Union[NamedCType, Binding]],
+    *,
+    method: bool = False,
+    allow_expensive_conversions: bool = False,
+) -> List[Expr]:
+    binding_exprs: List[Expr] = []
+    for b in bindings:
+        if isinstance(b, Binding):
+            binding_exprs.append(
+                Expr(
+                    expr=b.name,
+                    type=b.nctype,
+                )
+            )
+        else:
+            binding_exprs.append(b)
+
+    goal_ctypes: List[NamedCType] = []
+    for g in goals:
+        if isinstance(g, Binding):
+            goal_ctypes.append(g.nctype)
+        else:
+            goal_ctypes.append(g)
+
+    # Add all the bindings to the context
+    ctx: Dict[NamedCType, str] = {}
+    for b in binding_exprs:
+        ctx[b.type] = b.expr
+
+        # While we're at it, do some simple forward inference, looking through
+        # constructors.
+        #
+        # NB: When should you do forward inference versus backward inference?
+        # The general idea:
+        #
+        #   - Backward inference WHEN the goal gets smaller
+        #   - Forward inference WHEN the hypothesis gets smaller
+        #
+        # This helps ensure termination: backward inference starts with a goal
+        # and tries to make it simpler and simpler until it's trivial; if the
+        # goal can grow in size, we blow up to a really huge goal size.
+        # Similarly, with forward inference we take hypotheses and decompose
+        # them into simpler hypotheses; if hypotheses could expand in size,
+        # we also have potential nontermination.  (In the code below, forward
+        # inference is only ever carried out at a single step, but you could
+        # imagine repeated application of forward inference being profitable.)
+        #
+        # A good starting point in the literature for exploring more about proof
+        # search are these lecture notes
+        # https://www.cs.cmu.edu/~fp/courses/oregon-m10/04-focusing.pdf
+        #
+        # TODO: My kingdom for a pattern matcher
+        # https://www.python.org/dev/peps/pep-0634/
+        #
+        # TODO: This could get us in recomputation trouble if b.expr is nontrivial.
+        # Fix this by implementing some sort of sharing so that if multiple
+        # goals share the same expression, we only compute it once.  This seems
+        # to matter in practice as compiler is often unwilling to CSE nontrivial
+        # expressions like scalar.to<scalar_t>()
+        t = b.type
+        if (
+            isinstance(t, ConstRefCType)
+            and isinstance(t.elem, OptionalCType)
+            and isinstance(t.elem.elem, BaseCType)
+            and str(t.elem.elem.type) == "at::Tensor"
+        ):
+            ctx[
+                NamedCType(t.elem.elem.name, ConstRefCType(BaseCType(tensorT)))
+            ] = f"({b.expr}.has_value() ? *{b.expr} : at::Tensor())"
+
+        if t.type == ConstRefCType(OptionalCType(BaseCType(tensorT))):
+            ctx[
+                NamedCType(t.name, BaseCType(optionalTensorRefT))
+            ] = f"(({b.expr}.has_value() && (*{b.expr}).defined()) ? at::OptionalTensorRef(*{b.expr}) : at::OptionalTensorRef())"
+
+        if t.type == ConstRefCType(BaseCType(scalarT)):
+            ctx[NamedCType(t.name, BaseCType(opmath_t))] = f"({b.expr}).to<opmath_t>()"
+
+        if t.type == ConstRefCType(OptionalCType(BaseCType(scalarT))):
+            ctx[
+                NamedCType(t.name, BaseCType(optionalScalarRefT))
+            ] = f"({b.expr}.has_value() ? at::OptionalScalarRef(&({b.expr}.value())) : at::OptionalScalarRef())"
+
+        if t.type == BaseCType(scalar_t):
+            ctx[
+                NamedCType(t.name, BaseCType(opmath_t))
+            ] = f"static_cast<opmath_t>({b.expr})"
+
+        # [Note: IOptTensorListRef]
+        if t.type == ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT)))):
+            ctx[
+                NamedCType(t.name, BaseCType(iOptTensorListRefT))
+            ] = f"at::IOptTensorListRef({b.expr})"
+
+    # Add implicit bindings if the generated code is inside a Tensor method
+    if method:
+        ctx[
+            NamedCType("self", MutRefCType(BaseCType(tensorT)))
+        ] = "const_cast<Tensor&>(*this)"
+        ctx[
+            NamedCType("self", ConstRefCType(BaseCType(tensorT)))
+        ] = "const_cast<Tensor&>(*this)"
+        # This is better!  Byte-for-byte compat
+        # ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "*this"
+
+    def unsat(goal: NamedCType) -> NoReturn:
+        ctx_desc = "\n".join(
+            f"  {t.cpp_type()} {t.name}; // {e}" for t, e in ctx.items()
+        )
+        raise UnsatError(
+            f"""
+Failed to synthesize the expression "{goal.cpp_type()} {goal.name}".
+When I failed, the following bindings were available in the context:
+
+{ctx_desc}
+
+This probably means there is a missing rule in the rules of torchgen.api.translate.
+Check this module for more information.
+"""
+        )
+
+    # A shitty backtracking search implementation.  It's shitty because it
+    # does backtracking via stack (bad idea!) and for the most part tries to
+    # avoid backtracking.  In particular, if
+    # direct=True, we won't try to do any fancy synthesis, just trivial
+    # conversions (e.g., "T a" is OK for "const T& a").  So all of the
+    # existing rules in this function simply try to solve immediately,
+    # and bail if things don't work out.
+    def solve(goal: NamedCType, *, direct: bool) -> str:
+        def direct_solve(goal: NamedCType) -> str:
+            return solve(goal, direct=True)
+
+        if goal in ctx:
+            # Trivial
+            return ctx[goal]
+
+        # const & is satisfied with mutable &
+        if isinstance(goal.type, ConstRefCType):
+            try:
+                # WARNING: not strictly decreasing; be careful not
+                # to add a direct conversion that goes satisfies
+                # mutable& with const&
+                return solve(
+                    NamedCType(goal.name, MutRefCType(goal.type.elem)), direct=direct
+                )
+            except UnsatError:
+                pass
+
+        # mutable & is satisfied with value
+        if isinstance(goal.type, MutRefCType):
+            try:
+                return solve(NamedCType(goal.name, goal.type.elem), direct=direct)
+            except UnsatError:
+                pass
+
+        # TODO: These are referentially equal, shouldn't have to do this;
+        # ensuring we don't use type synonym IntArrayRef in codegen would
+        # help
+        if goal.type == ArrayRefCType(BaseCType(longT)):
+            return solve(NamedCType(goal.name, BaseCType(intArrayRefT)), direct=direct)
+
+        if direct:
+            unsat(goal)
+
+        # For now, all of these rules are mutually exclusive.
+        if goal == NamedCType("memory_format", OptionalCType(BaseCType(memoryFormatT))):
+            memory_format = direct_solve(
+                NamedCType(
+                    SpecialArgName.possibly_redundant_memory_format,
+                    OptionalCType(BaseCType(memoryFormatT)),
+                )
+            )
+            # No need to join "memory_format" and "options" if the target API takes "options" directly.
+            # Otherwise it will cause the redundant memory_format error.
+            if options_ctype in goal_ctypes:
+                return memory_format
+            try:
+                options = direct_solve(options_ctype)
+                return f"c10::impl::check_tensor_options_and_extract_memory_format({options}, {memory_format})"
+            except UnsatError:
+                return memory_format
+        elif goal == NamedCType("options", BaseCType(tensorOptionsT)):
+            dtype = direct_solve(
+                NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT)))
+            )
+            pin_memory = direct_solve(
+                NamedCType("pin_memory", OptionalCType(BaseCType(boolT)))
+            )
+            device = direct_solve(
+                NamedCType("device", OptionalCType(BaseCType(deviceT)))
+            )
+            layout = direct_solve(
+                NamedCType("layout", OptionalCType(BaseCType(layoutT)))
+            )
+            return f"TensorOptions().dtype({dtype}).layout({layout}).device({device}).pinned_memory({pin_memory})"
+
+        elif goal == NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))):
+            try:
+                options = direct_solve(options_ctype)
+                return f"c10::optTypeMetaToScalarType({options}.dtype_opt())"
+            except UnsatError:
+                out_tensor = direct_solve(out_tensor_ctype)
+                return f"{out_tensor}.scalar_type()"
+
+        elif goal == NamedCType("layout", OptionalCType(BaseCType(layoutT))):
+            try:
+                options = direct_solve(options_ctype)
+                return f"{options}.layout_opt()"
+            except UnsatError:
+                out_tensor = direct_solve(out_tensor_ctype)
+                return f"{out_tensor}.layout()"
+
+        elif goal == NamedCType("device", OptionalCType(BaseCType(deviceT))):
+            try:
+                options = direct_solve(options_ctype)
+                return f"{options}.device_opt()"
+            except UnsatError:
+                out_tensor = direct_solve(out_tensor_ctype)
+                return f"{out_tensor}.device()"
+
+        elif goal == NamedCType("pin_memory", OptionalCType(BaseCType(boolT))):
+            try:
+                options = direct_solve(options_ctype)
+                return f"{options}.pinned_memory_opt()"
+            except UnsatError:
+                # If we're calling a factory op from its out= variant,
+                # We don't actually care about the value of pin_memory.
+                out_tensor = direct_solve(out_tensor_ctype)
+                return "c10::nullopt"
+
+        # We can always do translations from value types to reference types, like vector<int> -> IntArrayRef
+        elif goal.type == BaseCType(intArrayRefT):
+            try:
+                return direct_solve(NamedCType(goal.name, longVec_ctype))
+            except UnsatError:
+                # We can also go SymIntArrayRef -> IntArrayRef
+                symIntArrayRef_type = direct_solve(
+                    NamedCType(goal.name, BaseCType(symIntArrayRefT))
+                )
+                return f"C10_AS_INTARRAYREF_SLOW({symIntArrayRef_type})"
+        elif goal.type == BaseCType(symIntArrayRefT):
+            try:
+                r = direct_solve(NamedCType(goal.name, BaseCType(intArrayRefT)))
+                return f"c10::fromIntArrayRefSlow({r})"
+            except UnsatError:
+                return direct_solve(NamedCType(goal.name, longSymVec_ctype))
+        elif goal.type == BaseCType(SymIntT):
+            return direct_solve(NamedCType(goal.name, BaseCType(longT)))
+        elif goal.type == OptionalCType(BaseCType(SymIntT)):
+            argname = direct_solve(
+                NamedCType(goal.name, OptionalCType(BaseCType(longT)))
+            )
+            return f"{argname}.has_value() ? c10::make_optional(c10::SymInt(*{argname})) : c10::nullopt"
+        elif goal.type == BaseCType(longT):
+            symInt_type = direct_solve(NamedCType(goal.name, BaseCType(SymIntT)))
+            return f"{symInt_type}.guard_int(__FILE__, __LINE__)"
+        elif goal.type == OptionalCType(BaseCType(longT)):
+            argname = direct_solve(
+                NamedCType(goal.name, OptionalCType(BaseCType(SymIntT)))
+            )
+            return f"{argname}.has_value() ? c10::make_optional({argname}->guard_int(__FILE__, __LINE__)) : c10::nullopt"
+        elif goal.type == BaseCType(optionalIntArrayRefT):
+            try:
+                return direct_solve(NamedCType(goal.name, optionalLongVec_ctype))
+            except UnsatError:
+                argname = direct_solve(
+                    NamedCType(goal.name, BaseCType(optionalSymIntArrayRefT))
+                )
+                return f"{argname}.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*{argname})) : c10::nullopt"
+        elif goal.type == BaseCType(optionalSymIntArrayRefT):
+            # TODO: You might also want to solve this from longSymVec_ctype or
+            # an optional version of it
+            argname = direct_solve(
+                NamedCType(goal.name, BaseCType(optionalIntArrayRefT))
+            )
+            return f"{argname}.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*{argname})) : c10::nullopt"
+        elif goal.type == BaseCType(optionalScalarRefT):
+            return direct_solve(NamedCType(goal.name, optionalScalar_ctype))
+        elif goal.type == BaseCType(optionalTensorRefT):
+            return direct_solve(NamedCType(goal.name, optionalTensor_ctype))
+
+        # Note [translation from C++ reference to value types]
+        # The below cases are all for when we have an argument with a reference type,
+        # and a corresponding goal with a value type.
+        # These are needed when we populate the inputs to a lambda capture and we need
+        # to guarantee the lifetime of each captured argument.
+        # We guard it with an explicit kwarg because converting to a value type is expensive
+        # (O(n)) to convert from IntArrayRef to vector<int>),
+        # so the caller of translate() should be explicit that they need it.
+        if allow_expensive_conversions:
+            if goal.type == VectorCType(BaseCType(longT)):
+                intArrayRef_ctype = NamedCType(goal.name, BaseCType(intArrayRefT))
+                argname = direct_solve(intArrayRef_ctype)
+                return f"{argname}.vec()"
+            if goal.type == VectorCType(BaseCType(SymIntT)):
+                symIntArrayRef_ctype = NamedCType(goal.name, BaseCType(symIntArrayRefT))
+                argname = direct_solve(symIntArrayRef_ctype)
+                return f"{argname}.vec()"
+            elif goal.type == OptionalCType(VectorCType(BaseCType(longT))):
+                optionalIntArrayRef_ctype = NamedCType(
+                    goal.name, BaseCType(optionalIntArrayRefT)
+                )
+                argname = direct_solve(optionalIntArrayRef_ctype)
+                return f"{argname}.has_value() ? c10::make_optional({argname}->vec()) : c10::nullopt"
+            elif goal.type == OptionalCType(BaseCType(scalarT)):
+                optionalScalarRef_ctype = NamedCType(
+                    goal.name, BaseCType(optionalScalarRefT)
+                )
+                argname = direct_solve(optionalScalarRef_ctype)
+                return f"{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt"
+            elif goal.type == OptionalCType(BaseCType(scalarT)):
+                optionalTensorRef_ctype = NamedCType(
+                    goal.name, BaseCType(optionalTensorRefT)
+                )
+                argname = direct_solve(optionalTensorRef_ctype)
+                return f"{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt"
+            # Technically, we also need to handle cases of C++ containers holding reference types.
+            # But there currently aren't any ops that require lambda capture codegen
+            # With arguments like std::vector<IntArrayRef>.
+            # If that changes, we'll have to add the translation here.
+
+        # We allow const casting on tensors, since const-correctness is a bit broken for at::Tensor.
+        # We could probably generalize this to non-tensor types too.
+        if goal.type == MutRefCType(BaseCType(tensorT)):
+            const_ref_tensor_ctype = NamedCType(
+                goal.name, ConstRefCType(BaseCType(tensorT))
+            )
+            argname = direct_solve(const_ref_tensor_ctype)
+            return f"const_cast<Tensor&>({argname})"
+
+        unsat(goal)
+
+    return [Expr(solve(g, direct=False), g) for g in goal_ctypes]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c54b2c33d264569cee44f767ec3c6c53df5fc988
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/__pycache__/types.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/__pycache__/types.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cda2412cc55fe1875665abfb7c15432bf118e58
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/__pycache__/types.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/types.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..16eff73638e4693bbc29e4476fa2486dfd6ca0fb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/api/types/types.py
@@ -0,0 +1,190 @@
+"""
+Where should I add a new type? `types_base.py` vs `types.py`
+
+This file defines data model classes for torchgen typing system, as well as some base types such as int32_t.
+
+`types.py` defines ATen Tensor type and some c10 types, along with signatures that use these types.
+
+The difference between these two files, is `types_base.py` should be implementation-agnostic, meaning it shouldn't
+contain any type definition that is tight to a specific C++ library (e.g., ATen), so that it can be easily reused
+if we want to generate code for another C++ library.
+
+Add new types to `types.py` if these types are ATen/c10 related.
+Add new types to `types_base.py` if they are basic and not attached to ATen/c10.
+"""
+from dataclasses import dataclass
+from typing import Dict
+
+from torchgen.model import BaseTy, ScalarType
+
+from .types_base import (
+    BaseCppType,
+    BaseCType,
+    boolT,
+    byteT,
+    charT,
+    CType,
+    doubleT,
+    floatT,
+    int32T,
+    longT,
+    shortT,
+)
+
+
+TENSOR_LIST_LIKE_CTYPES = [
+    "at::TensorList",
+    "const c10::List<c10::optional<at::Tensor>> &",
+    "const at::ITensorListRef &",
+]
+
+
+halfT = BaseCppType("at", "Half")
+complexHalfT = BaseCppType(
+    "c10", "complex<c10::Half>"
+)  # stuffing template param here is an abuse
+complexFloatT = BaseCppType("c10", "complex<float>")
+complexDoubleT = BaseCppType("c10", "complex<double>")
+bfloat16T = BaseCppType("at", "BFloat16")
+float8_e5m2T = BaseCppType("at", "Float8_e5m2")
+float8_e5m2fnuzT = BaseCppType("at", "Float8_e5m2fnuz")
+float8_e4m3fnT = BaseCppType("at", "Float8_e4m3fn")
+float8_e4m3fnuzT = BaseCppType("at", "Float8_e4m3fnuz")
+stringT = BaseCppType("c10", "string_view")
+generatorT = BaseCppType("at", "Generator")
+scalarTypeT = BaseCppType("at", "ScalarType")
+tensorT = BaseCppType("at", "Tensor")
+optionalTensorRefT = BaseCppType("at", "OptionalTensorRef")
+tensorListT = BaseCppType("at", "TensorList")
+iTensorListRefT = BaseCppType("at", "ITensorListRef")
+iOptTensorListRefT = BaseCppType("at", "IOptTensorListRef")
+dimnameT = BaseCppType("at", "Dimname")
+dimnameListT = BaseCppType("at", "DimnameList")
+dimVectorT = BaseCppType("at", "DimVector")
+layoutT = BaseCppType("at", "Layout")
+deviceT = BaseCppType("at", "Device")
+deviceIndexT = BaseCppType("at", "DeviceIndex")
+scalarT = BaseCppType("at", "Scalar")
+optionalScalarRefT = BaseCppType("at", "OptionalScalarRef")
+memoryFormatT = BaseCppType("at", "MemoryFormat")
+qschemeT = BaseCppType("at", "QScheme")
+storageT = BaseCppType("at", "Storage")
+streamT = BaseCppType("at", "Stream")
+intArrayRefT = BaseCppType("at", "IntArrayRef")
+optionalIntArrayRefT = BaseCppType("at", "OptionalIntArrayRef")
+optionalSymIntArrayRefT = BaseCppType("at", "OptionalSymIntArrayRef")
+tensorOptionsT = BaseCppType("at", "TensorOptions")
+typeAndSizeT = BaseCppType("torch::autograd::generated", "TypeAndSize")
+tensorGeometryT = BaseCppType("at", "TensorGeometry")
+SymIntT = BaseCppType("c10", "SymInt")
+symIntArrayRefT = BaseCppType("c10", "SymIntArrayRef")
+
+# Types representing template parameters.  Technically, we probably shouldn't
+# represent them this way in codegen, but it was pretty convenient.
+scalar_t = BaseCppType("", "scalar_t")
+opmath_t = BaseCppType("", "opmath_t")
+
+ScalarTypeToCppMapping: Dict[ScalarType, BaseCppType] = {
+    ScalarType.Byte: byteT,
+    ScalarType.Char: charT,
+    ScalarType.Short: shortT,
+    ScalarType.Int: int32T,
+    ScalarType.Long: longT,
+    ScalarType.Half: halfT,
+    ScalarType.Float: floatT,
+    ScalarType.Double: doubleT,
+    ScalarType.ComplexHalf: complexHalfT,
+    ScalarType.ComplexFloat: complexFloatT,
+    ScalarType.ComplexDouble: complexDoubleT,
+    ScalarType.Bool: boolT,
+    ScalarType.Float8_e5m2: float8_e5m2T,
+    ScalarType.Float8_e5m2fnuz: float8_e5m2fnuzT,
+    ScalarType.Float8_e4m3fn: float8_e4m3fnT,
+    ScalarType.Float8_e4m3fnuz: float8_e4m3fnuzT,
+}
+
+BaseTypeToCppMapping: Dict[BaseTy, BaseCppType] = {
+    BaseTy.int: longT,
+    BaseTy.float: doubleT,
+    BaseTy.bool: boolT,
+    BaseTy.str: stringT,
+    BaseTy.Generator: generatorT,
+    BaseTy.ScalarType: scalarTypeT,
+    BaseTy.Tensor: tensorT,
+    BaseTy.Dimname: dimnameT,
+    BaseTy.DimVector: dimVectorT,
+    BaseTy.Layout: layoutT,
+    BaseTy.Device: deviceT,
+    BaseTy.DeviceIndex: deviceIndexT,
+    BaseTy.Scalar: scalarT,
+    BaseTy.MemoryFormat: memoryFormatT,
+    BaseTy.QScheme: qschemeT,
+    BaseTy.Storage: storageT,
+    BaseTy.Stream: streamT,
+    BaseTy.SymInt: SymIntT,
+}
+
+# CTypes encode C++ type structure as needed for translation.
+
+
+@dataclass(frozen=True)
+class OptionalCType(CType):
+    elem: "CType"
+
+    def cpp_type(self, *, strip_ref: bool = False) -> str:
+        # Do not pass `strip_ref` recursively.
+        return f"c10::optional<{self.elem.cpp_type()}>"
+
+    def cpp_type_registration_declarations(self) -> str:
+        return f"c10::optional<{self.elem.cpp_type_registration_declarations()}>"
+
+    def remove_const_ref(self) -> "CType":
+        return OptionalCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class ListCType(CType):
+    elem: "CType"
+
+    def cpp_type(self, *, strip_ref: bool = False) -> str:
+        # Do not pass `strip_ref` recursively.
+        return f"c10::List<{self.elem.cpp_type()}>"
+
+    def cpp_type_registration_declarations(self) -> str:
+        return f"c10::List<{self.elem.cpp_type_registration_declarations()}>"
+
+    def remove_const_ref(self) -> "CType":
+        return ListCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class ArrayRefCType(CType):
+    elem: "CType"
+
+    def cpp_type(self, *, strip_ref: bool = False) -> str:
+        # Do not pass `strip_ref` recursively.
+        return f"at::ArrayRef<{self.elem.cpp_type()}>"
+
+    def cpp_type_registration_declarations(self) -> str:
+        return f"ArrayRef<{self.elem.cpp_type_registration_declarations()}>"
+
+    def remove_const_ref(self) -> "CType":
+        return ArrayRefCType(self.elem.remove_const_ref())
+
+
+@dataclass(frozen=True)
+class VectorizedCType(CType):
+    # This template is explicitly specialized, so the only valid
+    # elems are those we have specializations for (e.g., float, double, ...)
+    # scalar_t is also a common argument here (when we are codegen in
+    # a templated context)
+    elem: BaseCType
+
+    def cpp_type(self, *, strip_ref: bool = False) -> str:
+        return f"at::vec::Vectorized<{self.elem.cpp_type()}>"
+
+    def cpp_type_registration_declarations(self) -> str:
+        raise NotImplementedError
+
+    def remove_const_ref(self) -> "CType":
+        return self
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/native/native_functions.yaml b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/native/native_functions.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dae8d951268975af65479c7479cca47f9e022e89
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/native/native_functions.yaml
@@ -0,0 +1,15514 @@
+# See README.md in this directory for more guidance
+
+# *********NB: _cast_* operators are DEPRECATED and will be removed
+# eventually. These were previously used before TorchScript IR supported
+# representing ScalarType's. They are now superseded by usage of
+# `aten::to()`. The ops remain here for backward compatibility purposes.
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# DEPRECATED. DO NOT USE
+- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
+  variants: function
+
+# Computes the gradient of current tensor w.r.t. graph leaves.
+- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
+  manual_cpp_binding: True
+  variants: method
+
+# DEPRECATED. Sets the tensor data held by this `Variable` to be the same as
+# `new_data`.  It requires that `new_data` and `Variable` have compatible tensor
+# type, by checking `_has_compatible_shallow_copy_type(this, new_data)`.
+#
+# This function is deprecated because it doesn't really make sense in a world
+# where Variables *are* Tensors (as opposed to them containing tensors, which
+# is what the previous interpretation was.)
+- func: set_data(Tensor(a!) self, Tensor new_data) -> ()
+  manual_cpp_binding: True
+  variants: method
+
+- func: data(Tensor self) -> Tensor
+  manual_cpp_binding: True
+  variants: method
+
+# True if this `Variable` is a leaf and thus does not have a `grad_fn`.
+- func: is_leaf(Tensor self) -> bool
+  manual_cpp_binding: True
+  variants: method
+
+# Returns the output index of this variable from the forward operation that
+# produced it.  Conversely, it returns the input index of the gradient `Node` to
+# which this `Variable` is connected (because in the gradient computation,
+# inputs and outputs switch meaning).  For example:
+#
+#   y0, y1, y2 = f(x)
+#   assert y0.output_nr == 0
+#   assert y1.output_nr == 1
+#   assert y2.output_nr == 2
+#
+- func: output_nr(Tensor self) -> int
+  manual_cpp_binding: True
+  variants: method
+
+- func: _version(Tensor self) -> int
+  manual_cpp_binding: True
+  variants: method
+
+- func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!)
+  manual_cpp_binding: True
+  variants: method
+
+# Enables .grad attribute for non-leaf Tensors.
+- func: retain_grad(Tensor(a!) self) -> ()
+  manual_cpp_binding: True
+  variants: method
+
+- func: retains_grad(Tensor self) -> bool
+  manual_cpp_binding: True
+  variants: method
+
+- func: _fw_primal(Tensor(a) self, int level) -> Tensor(a)
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: _fw_primal
+
+- func: _make_dual(Tensor(a) primal, Tensor tangent, int level) -> Tensor(a)
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _make_dual
+
+- func: _unpack_dual(Tensor(a) dual, int level) -> (Tensor(a) primal, Tensor tangent)
+  variants: function
+
+# NOTE: [_new_zeros_with_same_feature_meta]
+# This function creates a new tensor with the layout and TensorOptions
+# of `other` but also takes into account the batch dimensions of `self`
+#
+# This function has a couple extra constraints because it is also used for `jvp`
+# in functorch.
+# - is used for forward AD because there is the restriction
+#   that the primal and tangent must have the same layout
+# - We cannot assume that `self` and `other` have the same sizes or even dim
+#   because in the inplace over view case, `other` is the base tensor, and
+#   `self` is the forward grad with respect to the view, which can have an
+#   entirely different shape
+# - takes the number of batch dims for `self` because we also handle
+#   some batching logic. We handle that here instead of a batching rule because
+#   we'd like to avoid calling as_strided in the batching rule (as to enable
+#   nested vmap in functorch).
+# - needs to be CompositeExplicitAutograd for jvp support in functorch.
+#   functorch currently relies on TensorWrapper which does not have storage
+#   CompositeExplicitAutograd makes sure the TensorWrapper is unwrapped.
+# - this function may eventually take on another int argument to store the
+#   the number of batch dims for other once we support that use case
+- func: _new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _new_zeros_with_same_feature_meta
+  autogen: _new_zeros_with_same_feature_meta.out
+
+# This function compares the storage numel of self with that of other, where
+# storage numel is computed as: `other.storage().nbytes() / other.itemsize()`.
+# We create this function for composite compliance purposes. The batching rule
+# always returns true because vmapped as_strided does not support accessing
+# storage locations not indexable by the input tensor.
+# See the note above for more information.
+- func: _has_same_storage_numel(Tensor self, Tensor other) -> bool
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _has_same_storage_numel
+
+- func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)
+  variants: method
+  tags: inplace_view
+
+- func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)
+  variants: method
+
+- func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a)
+  variants: method
+
+- func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a)
+  variants: method
+
+- func: align_as(Tensor self, Tensor other) -> Tensor
+  variants: method
+
+- func: align_tensors(Tensor[] tensors) -> Tensor[]
+
+# Not assert because it's a keyword; not Assert because FX already
+# took that syntax
+# TODO: need to specify this is side-effectful somehow
+- func: _assert_async(Tensor self) -> ()
+  dispatch:
+    CPU: _assert_async_cpu
+    CUDA: _assert_async_cuda
+
+- func: _assert_async.msg(Tensor self, str assert_msg) -> ()
+  dispatch:
+    CPU: _assert_async_msg_cpu
+    CUDA: _assert_async_msg_cuda
+
+- func: _assert_scalar(Scalar self, str assert_msg) -> ()
+  dispatch:
+    CompositeExplicitAutograd: _assert_scalar
+
+- func: _functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _functional_assert_scalar
+
+- func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
+  dispatch:
+    CPU: _functional_assert_async_msg_cpu
+
+- func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
+
+- func: _print(str s) -> ()
+  dispatch:
+    CompositeExplicitAutograd: _print
+
+- func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
+  dispatch:
+    CompositeExplicitAutograd: sym_constrain_range
+
+- func: sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> ()
+  dispatch:
+    CompositeExplicitAutograd: sym_constrain_range_for_size
+
+- func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _functional_sym_constrain_range
+
+- func: _functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _functional_sym_constrain_range_for_size
+
+- func: _make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  dispatch:
+    CPU: _make_dep_token_cpu
+
+- func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
+  variants: method
+
+- func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
+  device_check: NoCheck  # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss
+  dispatch:
+    CUDA: _use_cudnn_ctc_loss
+
+- func: _use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool
+  device_check: NoCheck  # Tensor arguments allowed to be on different devices, see also _cudnn_ctc_loss
+  dispatch:
+    CUDA: _use_cudnn_ctc_loss_tensor
+
+- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
+  device_check: NoCheck  # log_probs is expected to be on CUDA while targets is expected to be on CPU
+  dispatch:
+    CUDA: _cudnn_ctc_loss
+  autogen: _cudnn_ctc_loss.out
+
+- func: _cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
+  device_check: NoCheck  # log_probs is expected to be on CUDA while targets is expected to be on CPU
+  dispatch:
+    CUDA: _cudnn_ctc_loss_tensor
+
+- func: _use_cudnn_rnn_flatten_weight() -> bool
+
+- func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor
+  dispatch:
+    CUDA: _cudnn_rnn_flatten_weight
+  autogen: _cudnn_rnn_flatten_weight.out
+
+- func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+  # rnn_tanh may or may not redispatch to _cudnn_rnn based on algorithm and build. Thus it might hit dispatch or kernel device check.
+  # Disable dispatch time device check for consistent behavior.
+  device_check: NoCheck
+  dispatch:
+    CUDA: _cudnn_rnn
+  autogen: _cudnn_rnn.out
+  tags: nondeterministic_seeded
+
+- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
+  dispatch:
+    CUDA: _cudnn_rnn_backward
+  autogen: _cudnn_rnn_backward.out
+
+- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+  dispatch:
+    CUDA: _cudnn_init_dropout_state
+  autogen: _cudnn_init_dropout_state.out
+  tags: nondeterministic_seeded
+
+- func: _debug_has_internal_overlap(Tensor self) -> int
+  variants: function
+
+- func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: fused_dropout_cuda
+  tags: nondeterministic_seeded
+  autogen: _fused_dropout.out
+
+- func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: masked_scale_cuda
+  autogen: _masked_scale.out
+
+- func: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: native_dropout_cpu
+    CUDA: native_dropout_cuda
+    NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
+  tags: [nondeterministic_seeded, core]
+  autogen: native_dropout.out
+
+- func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
+  dispatch:
+    CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward
+    CUDA: native_dropout_backward_cuda
+  autogen: native_dropout_backward.out
+  tags: pointwise
+
+- func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor)
+
+- func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)
+
+- func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!)
+
+- func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!)
+
+- func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor
+
+- func: _shape_as_tensor(Tensor self) -> Tensor
+
+- func: dropout(Tensor input, float p, bool train) -> Tensor
+  tags: nondeterministic_seeded
+
+- func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: feature_dropout(Tensor input, float p, bool train) -> Tensor
+  tags: nondeterministic_seeded
+
+- func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: alpha_dropout(Tensor input, float p, bool train) -> Tensor
+  tags: nondeterministic_seeded
+
+- func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor
+  tags: nondeterministic_seeded
+
+- func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: abs(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: abs
+    SparseCPU, SparseCUDA: abs_sparse
+    SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
+  tags: [core, pointwise]
+
+- func: abs_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: abs_
+    SparseCPU, SparseCUDA: abs_sparse_
+    SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
+
+- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: abs_out
+    MPS: abs_out_mps
+    SparseCPU, SparseCUDA: abs_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out
+  tags: pointwise
+
+# Note [Adding an alias]
+# To add an alias do the following:
+#
+# 1) Copy the original functions native_functions.yaml entry, but replace the
+#      original function's name with their own and delete any dispatch
+#      keys for the aliases. Specifying a dispatch key will prevent
+#      autograd from recording the operations the alias performs, which
+#      will stop it from "inheriting" the original operation's autograd behavior.
+# 2) Implement the corresponding functions and have them redispatch to the
+#      original function.
+# 3) Add docstrings to the new function that reference the original function,
+#      and document the method as usual (if it exists.)
+#    (See torch/_torch_docs.py and docs/source/torch.rst if adding a function,
+#     torch/_tensor_docs.py and docs/source/tensors.rst if adding a method,
+#     or module-specific doc bindings (like torch/linalg/__init__.py) if
+#     adding an alias in a namespace.)
+# 4) Update torch/overrides.py consistent with the original function.
+# 5) Update the alias_map in torch/csrc/jit/passes/normalize_ops.cpp.
+# 6) Add aliases argument to existing OpInfo/UnaryUfuncInfo or create new OpInfo/UnaryUfuncInfo entry
+# in op_db list in torch/testing/_internal/common_methods_invocations.py
+#
+# See torch.absolute, an alias for torch.abs, as an example.
+# Absolute, alias for abs
+
+- func: absolute(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: absolute_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: angle(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: angle
+    SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr
+  tags: pointwise
+
+- func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: angle_out
+    SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr_out
+  tags: pointwise
+
+- func: view_as_real(Tensor(a) self) -> Tensor(a)
+  variants: function
+  dispatch:
+    CPU, CUDA, MPS, Meta: view_as_real
+
+- func: view_as_complex(Tensor(a) self) -> Tensor(a)
+  variants: function
+  dispatch:
+    CPU, CUDA, MPS, Meta: view_as_complex
+
+- func: sgn(Tensor self) -> Tensor
+  variants: function, method
+  structured_delegate: sgn.out
+  dispatch:
+    SparseCPU, SparseCUDA: sgn_sparse
+    SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
+  tags: pointwise
+
+- func: sgn_(Tensor(a!) self) -> Tensor(a!)
+  variants: method
+  structured_delegate: sgn.out
+  dispatch:
+    SparseCPU, SparseCUDA: sgn_sparse_
+    SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
+  tags: pointwise
+
+- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sgn_out
+    MPS: sgn_out_mps
+    SparseCPU, SparseCUDA: sgn_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out
+  tags: pointwise
+
+- func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+  variants: method
+
+- func: real(Tensor(a) self) -> Tensor(a)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: imag(Tensor(a) self) -> Tensor(a)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: _conj(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _conj
+
+- func: conj(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  manual_cpp_binding: True
+
+- func: _conj_physical(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _conj_physical
+    SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr
+  autogen: _conj_physical.out
+
+- func: conj_physical(Tensor self) -> Tensor
+  variants: function, method
+  tags: pointwise
+
+- func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: conj_physical_out
+    MPS: conj_physical_out_mps
+    SparseCPU, SparseCUDA: conj_physical_out_sparse
+    SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out
+  tags: pointwise
+
+- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: conj_physical_
+    SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_
+  tags: pointwise
+
+- func: resolve_conj(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+
+- func: resolve_neg(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+
+- func: _neg_view(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _neg_view
+
+- func: acos(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: acos.out
+  tags: [core, pointwise]
+
+- func: acos_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: acos.out
+  tags: pointwise
+
+- func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: acos_out
+    MPS: acos_out_mps
+  tags: pointwise
+
+# arccos, alias of acos
+- func: arccos(Tensor self) -> Tensor
+  variants: function, method
+
+- func: arccos_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor
+  tags: core
+
+- func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor
+  tags: core
+
+# Return: (Tensor output, Tensor indices)
+- func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
+
+- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: add.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: add_sparse
+    SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
+    MkldnnCPU: mkldnn_add
+    ZeroTensor: add_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
+  tags: [core, pointwise]
+
+- func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: add.out
+  dispatch:
+    SparseCPU, SparseCUDA: add_sparse_
+    SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
+    MkldnnCPU: mkldnn_add_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
+  tags: pointwise
+
+- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  ufunc_inner_loop:
+    Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
+    ScalarOnly: add (Bool)
+  dispatch:
+    SparseCPU: add_out_sparse_cpu
+    SparseCUDA: add_out_sparse_cuda
+    SparseCsrCPU: add_out_sparse_compressed_cpu
+    SparseCsrCUDA: add_out_sparse_compressed_cuda
+    MkldnnCPU: mkldnn_add_out
+    MPS: add_out_mps
+  tags: pointwise
+
+- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+  variants: function
+  dispatch:
+    CPU: add_relu
+
+- func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: add_relu_
+
+- func: _add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: add_relu_out
+
+- func: _add_relu.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+  variants: function
+  dispatch:
+    CPU: add_relu
+
+- func: _add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: add_relu_
+  autogen: _add_relu.Scalar_out
+
+# For C++ only, until we have conversion from C++ numbers to Tensor
+- func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: add
+  tags: [core, pointwise]
+
+- func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: add_
+  autogen: add.Scalar_out
+  tags: pointwise
+
+- func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  structured_delegate: addmv.out
+  variants: function, method
+
+- func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+  structured_delegate: addmv.out
+  variants: function, method
+
+- func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: addmv_out_cpu
+    CUDA: addmv_out_cuda
+    MPS: addmv_out_mps
+    SparseCsrCPU: addmv_out_sparse_compressed
+    SparseCsrCUDA: addmv_out_sparse_compressed_cuda
+
+- func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU, CUDA: addr
+    MPS: addr_mps
+    CompositeExplicitAutograd: math_addr
+
+- func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: addr_
+
+- func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: addr_out
+    MPS: addr_out_mps
+    CompositeExplicitAutograd: math_addr_out
+
+- func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: affine_grid_generator
+  autogen: affine_grid_generator.out
+
+- func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor
+  variants: function
+
+- func: _is_all_true(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _is_all_true
+
+- func: _is_any_true(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _is_any_true
+
+# Note: this function is only for testing.
+- func: _test_check_tensor(Tensor self) -> Tensor
+  variants: function
+
+# Note; this function is only for testing
+- func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _test_functorch_fallback
+  autogen: _test_functorch_fallback.out
+
+- func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: all.out
+  variants: function, method
+
+- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: all.dims_out
+  variants: function, method
+  cpp_no_default_args: ['dim']
+  dispatch:
+    CompositeExplicitAutograd: all_dims_default
+
+- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: all_out
+    MPS: all_out_mps
+
+- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: all_dims_out
+    CompositeExplicitAutograd: all_dims_out_default
+  cpp_no_default_args: ['dim']
+
+- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
+  variants: function, method
+  tags: data_dependent_output
+  dispatch:
+    CompositeExplicitAutograd: allclose
+
+- func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: any.out
+  variants: function, method
+  tags: core
+
+- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: any.dims_out
+  variants: function, method
+  cpp_no_default_args: ['dim']
+  tags: core
+  dispatch:
+    CompositeExplicitAutograd: any_dims_default
+
+- func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: any_out
+    MPS: any_out_mps
+
+- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: any_dims_out
+    CompositeExplicitAutograd: any_dims_out_default
+  cpp_no_default_args: ['dim']
+
+- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: arange
+
+- func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: arange
+
+# This operator should be named `arange.start_out` if following the naming convention. However that
+# name is already taken. Disabled because of CI job failures.
+# FIXME: enable this
+#- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+#  dispatch:
+#    CompositeExplicitAutograd: arange_start_out
+
+- func: arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: arange
+  cpp_no_default_args: ['step']
+  tags: core
+
+- func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: arange_out
+
+- func: arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, Meta: arange_out
+    CUDA: arange_cuda_out
+    MPS: arange_mps_out
+  cpp_no_default_args: ['step']
+
+# This function is a temporary hack to allow tracing of arange like constructs with dynamic
+# bounds on arange.  Normal arange is not traceable because it does not take any tensor inputs;
+# if the range you need is based on another tensor, calling this function directly will
+# preserve tracing.  Get rid of this when arange can directly take tensors for bounds
+# (so that it can be traced directly).
+- func: _dim_arange(Tensor like, int dim) -> Tensor
+
+- func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+  structured_delegate: argmax.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: core
+
+- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU, CUDA: argmax_out
+    MPS: argmax_out_mps
+
+- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+  structured_delegate: argmin.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: core
+
+- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU, CUDA: argmin_out
+    MPS: argmin_out_mps
+
+- func: acosh(Tensor self) -> Tensor
+  variants: function, method
+  structured_delegate: acosh.out
+  tags: [core, pointwise]
+
+- func: acosh_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  structured_delegate: acosh.out
+  tags: pointwise
+
+- func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: acosh_out
+    MPS: acosh_out_mps
+  tags: pointwise
+# arccosh, alias for acosh
+
+- func: arccosh(Tensor self) -> Tensor
+  variants: function, method
+
+- func: arccosh_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: asinh(Tensor self) -> Tensor
+  variants: function, method
+  structured_delegate: asinh.out
+  dispatch:
+    SparseCPU, SparseCUDA: asinh_sparse
+    SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr
+  tags: [core, pointwise]
+
+- func: asinh_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  structured_delegate: asinh.out
+  dispatch:
+    SparseCPU, SparseCUDA: asinh_sparse_
+    SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_
+  tags: pointwise
+
+- func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: asinh_out
+    MPS: asinh_out_mps
+    SparseCPU, SparseCUDA: asinh_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out
+  tags: pointwise
+
+# arcsinh, alias for asinh
+- func: arcsinh(Tensor self) -> Tensor
+  variants: function, method
+
+- func: arcsinh_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: atanh(Tensor self) -> Tensor
+  structured_delegate: atanh.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: atanh_sparse
+    SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr
+  tags: [core, pointwise]
+
+- func: atanh_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: atanh.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: atanh_sparse_
+    SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_
+  tags: pointwise
+
+- func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: atanh_out
+    MPS: atanh_out_mps
+    SparseCPU, SparseCUDA: atanh_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out
+  tags: pointwise
+# arctanh, alias for atanh
+
+- func: arctanh(Tensor self) -> Tensor
+  variants: function, method
+
+- func: arctanh_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    ZeroTensor, CPU, CUDA: as_strided_tensorimpl
+    Meta: as_strided_tensorimpl_meta_symint
+    MPS: as_strided_tensorimpl_mps
+    QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
+  device_check: NoCheck
+  device_guard: False
+  tags: core
+
+- func: as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: as_strided__symint
+
+- func: asin(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: asin.out
+  dispatch:
+    SparseCPU, SparseCUDA: asin_sparse
+    SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr
+  tags: [core, pointwise]
+
+- func: asin_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: asin.out
+  dispatch:
+    SparseCPU, SparseCUDA: asin_sparse_
+    SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_
+  tags: pointwise
+
+- func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: asin_out
+    MPS: asin_out_mps
+    SparseCPU, SparseCUDA: asin_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out
+  tags: pointwise
+
+# arcsin, alias of asin
+- func: arcsin(Tensor self) -> Tensor
+  variants: function, method
+
+- func: arcsin_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: atan(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: atan.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: atan_sparse
+    SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr
+  tags: [core, pointwise]
+
+- func: atan_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: atan.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: atan_sparse_
+    SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_
+  tags: pointwise
+
+- func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: atan_out
+    MPS: atan_out_mps
+    SparseCPU, SparseCUDA: atan_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out
+  tags: pointwise
+
+# arctan, alias of atan
+- func: arctan(Tensor self) -> Tensor
+  variants: function, method
+
+- func: arctan_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: atleast_1d(Tensor self) -> Tensor
+  variants: function
+
+- func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[]
+
+- func: atleast_2d(Tensor self) -> Tensor
+  variants: function
+
+- func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[]
+  variants: function
+
+- func: atleast_3d(Tensor self) -> Tensor
+  variants: function
+
+- func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[]
+  variants: function
+
+- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  variants: function, method
+  structured_delegate: baddbmm.out
+
+- func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+  variants: method
+  structured_delegate: baddbmm.out
+
+- func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU: baddbmm_out_cpu
+    CUDA: baddbmm_out_cuda
+    MPS: baddbmm_out_mps
+    SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
+
+- func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: bartlett_window
+  autogen: bartlett_window.out
+
+- func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: bartlett_window
+  autogen: bartlett_window.periodic_out
+
+- func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor
+
+- func: quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor
+  dispatch:
+    QuantizedCPU: quantized_batch_norm
+  autogen: quantized_batch_norm.out
+
+- func: _batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)
+
+- func: _batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)
+
+# Sample bernoulli with values in `self` as probability.
+- func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: bernoulli
+  tags: nondeterministic_seeded
+
+- func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: bernoulli_out
+    MPS: bernoulli_out_mps
+
+- func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: bernoulli_
+    MPS: bernoulli_mps_
+  autogen: bernoulli.Tensor, bernoulli.Tensor_out
+
+- func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: bernoulli_
+    MPS: bernoulli_mps_
+  autogen: bernoulli.float_out
+
+# Note [bernoulli.p schema]
+# We should probably just fix the overload ambiguity by appending a _functional to the C++ API name (BC breaking)
+# This out-of-place version isn't used explicitly, but needed by jit.
+# There is no default valid on `p` here because it would introduce ambiguity
+# with `bernoulli(Tensor self, *, Generator? generator=None)` declaration.
+- func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: bernoulli
+
+- func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor
+
+- func: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  variants: function
+  dispatch:
+    CPU: binary_cross_entropy_cpu
+    CUDA: binary_cross_entropy_cuda
+    MPS: binary_cross_entropy_mps
+
+- func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  variants: function
+  dispatch:
+    CPU: binary_cross_entropy_out_cpu
+    CUDA: binary_cross_entropy_out_cuda
+    MPS: binary_cross_entropy_out_mps
+
+- func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
+  python_module: nn
+  variants: function
+  dispatch:
+    CPU: binary_cross_entropy_backward_cpu
+    CUDA: binary_cross_entropy_backward_cuda
+    MPS: binary_cross_entropy_backward_mps
+
+- func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  variants: function
+  dispatch:
+    CPU: binary_cross_entropy_backward_out_cpu
+    CUDA: binary_cross_entropy_backward_out_cuda
+    MPS: binary_cross_entropy_backward_out_mps
+
+- func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: binary_cross_entropy_with_logits
+  autogen: binary_cross_entropy_with_logits.out
+
+- func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: _bincount_cpu
+    CUDA: _bincount_cuda
+    MPS: _bincount_mps
+  tags: dynamic_output_shape
+  autogen: bincount.out
+
+- func: bitwise_not(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: bitwise_not.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: bitwise_not_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: bitwise_not.out
+  variants: method
+  tags: pointwise
+
+- func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: bitwise_not_out
+    MPS: bitwise_not_out_mps
+  tags: pointwise
+
+- func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA, MPS: copysign_out
+  tags: pointwise
+
+- func: copysign.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: copysign.out
+  tags: pointwise
+
+- func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: copysign.out
+
+- func: copysign.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: copysign
+  tags: pointwise
+
+- func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: copysign_
+
+- func: copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: copysign_out
+  tags: pointwise
+
+- func: _lazy_clone(Tensor self) -> Tensor
+  # Like clone, but the copy takes place lazily, only if either the
+  # input or the output are written.
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: _lazy_clone
+
+- func: logical_not(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: logical_not
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
+  tags: [core, pointwise]
+
+- func: logical_not_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: logical_not_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
+  tags: pointwise
+
+- func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: logical_not_out
+    MPS: logical_not_out_mps
+  tags: pointwise
+
+- func: logical_xor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: logical_xor
+  tags: [core, pointwise]
+
+- func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: logical_xor_
+  tags: pointwise
+
+- func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: logical_xor_out
+    MPS: logical_xor_out_mps
+  tags: pointwise
+
+- func: logical_and(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: logical_and
+  tags: [core, pointwise]
+
+- func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: logical_and_
+  tags: pointwise
+
+- func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: logical_and_out
+    MPS: logical_and_out_mps
+  tags: pointwise
+
+- func: logical_or(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: logical_or
+  tags: [core, pointwise]
+
+- func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: logical_or_
+  tags: pointwise
+
+- func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: logical_or_out
+    MPS: logical_or_out_mps
+  tags: pointwise
+
+- func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: blackman_window
+  autogen: blackman_window.out
+
+- func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: blackman_window
+  autogen: blackman_window.periodic_out
+
+- func: bmm(Tensor self, Tensor mat2) -> Tensor
+  structured_delegate: bmm.out
+  variants: function, method
+  dispatch:
+    SparseCPU: bmm_sparse_cpu
+    SparseCUDA: bmm_sparse_cuda
+    NestedTensorCPU: bmm_nested
+    NestedTensorCUDA: bmm_nested_cuda
+  tags: core
+
+- func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU: bmm_out_cpu
+    CUDA: bmm_out_cuda
+    MPS: bmm_out_mps
+    SparseCPU: bmm_out_sparse_cpu
+    SparseCUDA: bmm_out_sparse_cuda
+    SparseCsrCUDA: bmm_out_sparse_csr_cuda
+
+- func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
+  device_check: NoCheck
+  device_guard: False
+
+- func: broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: broadcast_to_symint
+
+- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
+  variants: function
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_broadcast_to
+
+- func: cat(Tensor[] tensors, int dim=0) -> Tensor
+  structured_delegate: cat.out
+  dispatch:
+    SparseCPU, SparseCUDA: cat_sparse
+    QuantizedCPU: cat_quantized_cpu
+    NestedTensorCPU, NestedTensorCUDA: cat_nested
+  tags: core
+
+- func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  precomputed:
+  - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format
+  dispatch:
+    CPU: cat_out_cpu
+    CUDA: cat_out_cuda
+    MPS: cat_out_mps
+    QuantizedCPU: cat_out_quantized_cpu
+
+- func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor
+
+- func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
+# alias for torch.cat
+- func: concat(Tensor[] tensors, int dim=0) -> Tensor
+
+- func: concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: concat.names(Tensor[] tensors, Dimname dim) -> Tensor
+
+- func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
+# alias for torch.cat
+- func: concatenate(Tensor[] tensors, int dim=0) -> Tensor
+
+- func: concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor
+
+- func: concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: block_diag(Tensor[] tensors) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: block_diag
+  autogen: block_diag.out
+
+- func: ceil(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: ceil.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: ceil_sparse
+    SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr
+  tags: [core, pointwise]
+
+- func: ceil_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: ceil.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: ceil_sparse_
+    SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_
+  tags: pointwise
+
+- func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: ceil_out
+    MPS: ceil_out_mps
+    SparseCPU, SparseCUDA: ceil_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out
+  tags: pointwise
+
+# alias for torch.linalg.multi_dot
+- func: chain_matmul(Tensor[] matrices) -> Tensor
+  variants: function
+
+# alias for torch.linalg.multi_dot
+- func: chain_matmul.out(Tensor[] matrices, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[]
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: chunk
+    NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
+
+- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tensor_split_sections_symint
+
+- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tensor_split_indices_symint
+
+- func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]
+  variants: function, method
+
+- func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ['min']
+  structured_delegate: clamp.out
+  dispatch:
+    QuantizedCPU: clamp_quantized_cpu
+  tags: [core, pointwise]
+
+- func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
+  variants: function, method
+  structured_delegate: clamp.Tensor_out
+  tags: [core, pointwise]
+
+- func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ['min']
+  structured_delegate: clamp.out
+  tags: pointwise
+
+- func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
+  variants: function, method
+  structured_delegate: clamp.Tensor_out
+  tags: pointwise
+
+- func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ['min']
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: clamp_out
+    MPS: clamp_out_mps
+  tags: pointwise
+
+- func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: clamp_Tensor_out
+    MPS: clamp_Tensor_out_mps
+  tags: pointwise
+
+- func: clamp_max(Tensor self, Scalar max) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: clamp_max.out
+  tags: pointwise
+
+- func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
+  variants: function, method
+  structured_delegate: clamp_max.Tensor_out
+  tags: pointwise
+
+- func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: clamp_max.out
+  tags: pointwise
+
+- func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)
+  variants: function, method
+  structured_delegate: clamp_max.Tensor_out
+  tags: pointwise
+
+- func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: clamp_max_out
+    MPS: clamp_max_out_mps
+  tags: pointwise
+
+- func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: clamp_max_Tensor_out
+    MPS: clamp_max_Tensor_out_mps
+  tags: pointwise
+
+- func: clamp_min(Tensor self, Scalar min) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: clamp_min.out
+  tags: pointwise
+
+- func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
+  variants: function, method
+  structured_delegate: clamp_min.Tensor_out
+  tags: pointwise
+
+- func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: clamp_min.out
+  tags: pointwise
+
+- func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)
+  variants: function, method
+  structured_delegate: clamp_min.Tensor_out
+  tags: pointwise
+
+- func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: clamp_min_out
+    MPS: clamp_min_out_mps
+  tags: pointwise
+
+- func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: clamp_min_Tensor_out
+    MPS: clamp_min_Tensor_out_mps
+  tags: pointwise
+
+# clip is an alias for clamp
+- func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+  cpp_no_default_args: ['min']
+  variants: function, method
+  tags: pointwise
+
+- func: clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
+  variants: function, method
+  tags: pointwise
+
+- func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
+  cpp_no_default_args: ['min']
+  variants: function, method
+  tags: pointwise
+
+- func: clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
+  variants: function, method
+  tags: pointwise
+
+- func: clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
+  cpp_no_default_args: ['min']
+  tags: pointwise
+
+- func: clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: cudnn_is_acceptable(Tensor self) -> bool
+  device_check: NoCheck
+  device_guard: False
+
+- func: complex(Tensor real, Tensor imag) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: complex
+
+- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: complex_out
+    MPS: complex_out_mps
+
+- func: polar(Tensor abs, Tensor angle) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: polar
+
+- func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: polar_out
+    MPS: polar_out_mps
+
+- func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: constant_pad_nd
+    MPS: constant_pad_nd_mps
+  autogen: constant_pad_nd.out
+  tags: core
+
+- func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)
+  variants: method
+  manual_cpp_binding: True
+
+- func: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: convolution
+  autogen: convolution.out
+  tags: core
+
+- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CompositeExplicitAutograd, CUDA: convolution_backward
+  autogen: convolution_backward.out
+  tags: core
+
+- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: convolution_overrideable
+  autogen: convolution_overrideable.out
+
+- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
+  dispatch:
+    CompositeExplicitAutograd: convolution_backward_overrideable
+  autogen: convolution_backward_overrideable.out
+
+- func: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _convolution
+  autogen: _convolution.out
+
+- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor
+
+- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _convolution_mode_symint
+
+- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+
+- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: conv1d_symint
+
+- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: conv2d_symint
+
+- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: conv3d_symint
+
+- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor
+  cpp_no_default_args: ['bias', 'stride', 'padding']
+  dispatch:
+    CompositeImplicitAutograd: conv1d_padding_symint
+
+- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+  cpp_no_default_args: ['bias', 'stride', 'padding']
+  dispatch:
+    CompositeImplicitAutograd: conv2d_padding_symint
+
+- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+  cpp_no_default_args: ['bias', 'stride', 'padding']
+  dispatch:
+    CompositeImplicitAutograd: conv3d_padding_symint
+
+- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: conv_tbc
+  autogen: conv_tbc.out
+
+- func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)
+
+# NB: we inherit the goofy argument order from PyTorch torch.nn.functional
+- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: conv_transpose1d_symint
+
+- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: conv_transpose2d_symint
+
+- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: conv_transpose3d_symint
+
+- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: copy
+  tags: core
+
+- func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    MkldnnCPU: copy_mkldnn_
+    SparseCPU, SparseCUDA: copy_sparse_wrapper_
+    CompositeExplicitAutograd: copy_
+    SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_
+    NestedTensorCPU, NestedTensorCUDA: copy_nested_
+  autogen: copy.out
+
+- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
+  dispatch:
+    MPS: _copy_from_mps
+  autogen: _copy_from.out
+
+# We need this to be able to properly copy from a CPU to an XLA tensor with different sizes.
+# See https://github.com/pytorch/xla/issues/2881
+- func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor
+  dispatch:
+    MPS: _copy_from_and_resize_mps
+  autogen: _copy_from_and_resize.out
+
+- func: cos(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: cos.out
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: cos_nested
+  tags: [core, pointwise]
+
+- func: cos_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: cos.out
+  tags: pointwise
+
+- func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: cos_out
+    MPS: cos_out_mps
+  tags: pointwise
+
+- func: cosh(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: cosh.out
+  tags: [core, pointwise]
+
+- func: cosh_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: cosh.out
+  tags: pointwise
+
+- func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: cosh_out
+    MPS: cosh_out_mps
+  tags: pointwise
+
+- func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
+
+- func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: count_nonzero_cpu
+    CUDA: count_nonzero_cuda
+    MPS: count_nonzero_mps
+  autogen: count_nonzero.dim_IntList_out
+
+- func: count_nonzero(Tensor self, int? dim=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: count_nonzero
+  autogen: count_nonzero.out
+
+- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
+  variants: function, method
+
+- func: corrcoef(Tensor self) -> Tensor
+  variants: function, method
+
+- func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid
+  dispatch:
+    CUDA: cudnn_affine_grid_generator_forward
+  autogen: cudnn_affine_grid_generator.out
+
+# TODO: Why do I have to call this grad?!
+- func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta
+  dispatch:
+    CUDA: cudnn_affine_grid_generator_backward
+  autogen: cudnn_affine_grid_generator_backward.out
+
+- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: cudnn_batch_norm
+  autogen: cudnn_batch_norm.out
+
+# NB: You can only use this if you used cudnn_batch_norm training=True
+- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: cudnn_batch_norm_backward
+  autogen: cudnn_batch_norm_backward.out
+
+- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
+  dispatch:
+    CUDA: cudnn_convolution
+
+- func: cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CUDA: cudnn_convolution_out
+
+- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
+  dispatch:
+    CUDA: cudnn_convolution_transpose
+  autogen: cudnn_convolution_transpose.out
+
+- func: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    MPS: _mps_convolution_transpose
+  autogen: _mps_convolution_transpose.out
+
+- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor)
+  dispatch:
+    MPS: mps_convolution_transpose_backward
+  autogen: mps_convolution_transpose_backward.out
+
+- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    CUDA: cudnn_convolution_relu
+  autogen: cudnn_convolution_relu.out
+
+- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    CUDA: cudnn_convolution_add_relu
+  autogen: cudnn_convolution_add_relu.out
+
+# NB: input is special cased in a way I don't quite understand
+- func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output
+  dispatch:
+    CUDA: cudnn_grid_sampler_forward
+  autogen: cudnn_grid_sampler.out
+
+- func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid)
+  dispatch:
+    CUDA: cudnn_grid_sampler_backward
+  autogen: cudnn_grid_sampler_backward.out
+
+- func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: cummax
+
+- func: cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CompositeExplicitAutograd: cummax_out
+
+- func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+
+- func: _cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
+  variants: function
+  dispatch:
+    CPU: cummax_helper_cpu
+    CUDA: cummax_helper_cuda
+
+- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: cummin
+
+- func: cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CompositeExplicitAutograd: cummin_out
+
+- func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+
+- func: _cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
+  variants: function
+  dispatch:
+    CPU: cummin_helper_cpu
+    CUDA: cummin_helper_cuda
+
+- func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+
+- func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+  structured_delegate: cumprod.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
+  structured_delegate: cumprod.out
+  variants: method
+
+- func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: cumprod_out
+    MPS: cumprod_out_mps
+
+- func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)
+  variants: method
+
+- func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: cumprod_backward(Tensor grad, Tensor input, int dim, Tensor output) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+
+- func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+  structured_delegate: cumsum.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: core
+
+- func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
+  structured_delegate: cumsum.out
+  variants: method
+
+- func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: cumsum_out
+    MPS: cumsum_out_mps
+
+- func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)
+  variants: method
+
+- func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: cumulative_trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
+
+- func: cumulative_trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor
+
+- func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
+
+# convenience function that converts to intlists for you
+- func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
+
+- func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
+  dispatch:
+    CPU: ctc_loss_cpu
+    CUDA: ctc_loss_gpu
+    Meta: ctc_loss_meta
+  autogen: _ctc_loss.out
+  tags: dynamic_output_shape  # the shape of second output is data dependent
+
+- func: _ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
+  dispatch:
+    CPU, CUDA: ctc_loss_tensor
+  autogen: _ctc_loss.Tensor_out
+  tags: dynamic_output_shape  # the shape of second output is data dependent
+
+- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor
+  dispatch:
+    CPU: ctc_loss_backward_cpu
+    CUDA: ctc_loss_backward_gpu
+  autogen: _ctc_loss_backward.out
+
+- func: _ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor
+  dispatch:
+    CPU, CUDA: ctc_loss_backward_tensor
+
+- func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: diag_embed
+  autogen: diag_embed.out
+
+- func: diagflat(Tensor self, int offset=0) -> Tensor
+  variants: function, method
+
+- func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: diagonal
+  tags: core
+
+- func: linalg_diagonal(Tensor(a) A, *, int offset=0, int dim1=-2, int dim2=-1) -> Tensor(a)
+  python_module: linalg
+  variants: function
+
+- func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a)
+  variants: function, method
+
+- func: diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: diagonal_backward_symint
+  autogen: diagonal_backward.out
+
+- func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)
+  variants: method
+
+- func: diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor
+  variants: function, method
+
+- func: diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+
+- func: gradient.scalarint(Tensor self, *, Scalar? spacing=None, int? dim=None, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: gradient.scalararray(Tensor self, *, Scalar spacing, int[] dim, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: gradient.array(Tensor self, *, int[] dim, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: gradient.scalarrayarray(Tensor self, *, Scalar[] spacing, int[] dim, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: gradient.tensorarrayint(Tensor self, *, Tensor[] spacing, int? dim=None, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: gradient.tensorarray(Tensor self, *, Tensor[] spacing, int[] dim, int edge_order=1) -> Tensor[]
+  variants: function
+
+- func: div.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: div.out
+  dispatch:
+    SparseCPU, SparseCUDA: div_sparse
+    ZeroTensor: div_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
+  tags: [core, pointwise]
+
+- func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: div.out
+  dispatch:
+    SparseCPU, SparseCUDA: div_sparse_
+  tags: pointwise
+
+- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: div_out
+    MPS: div_out_mps
+    SparseCPU, SparseCUDA: div_out_sparse_zerodim
+  tags: pointwise
+
+- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: div.out_mode
+  dispatch:
+    SparseCPU, SparseCUDA: div_sparse
+  tags: [core, pointwise]
+
+- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: div.out_mode
+  dispatch:
+    SparseCPU, SparseCUDA: div_sparse_
+  tags: pointwise
+
+- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: div_out_mode
+    MPS: div_out_mode_mps
+    SparseCPU, SparseCUDA: div_out_sparse_zerodim
+  tags: pointwise
+
+# For C++ only, until we have conversion from C++ numbers to Tensor
+- func: div.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: div
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
+  tags: [core, pointwise]
+
+- func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: div_
+  autogen: div.Scalar_out
+  tags: pointwise
+
+- func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: div
+  tags: [core, pointwise]
+
+- func: div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: div_
+  autogen: div.Scalar_mode_out
+  tags: pointwise
+
+# divide, alias for div
+- func: divide.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+
+- func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: divide.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: function, method
+
+- func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: divide.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+  variants: function, method
+
+- func: divide_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
+  variants: method
+
+- func: divide.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+
+- func: divide.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+  variants: function, method
+
+- func: divide_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
+  variants: method
+
+  # true_divide, an alias for div
+- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: pointwise
+
+- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: dot(Tensor self, Tensor tensor) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: dot
+    CUDA: dot_cuda
+    MPS: dot_mps
+
+- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: dot_out
+
+- func: vdot(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: vdot
+    CUDA: vdot_cuda
+
+- func: vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: vdot_out
+
+- func: einsum(str equation, Tensor[] tensors, *, int[]? path=None) -> Tensor
+
+- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: embedding_symint
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
+  autogen: embedding.out
+  tags: core
+
+- func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: embedding_backward_symint
+
+- func: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor
+  dispatch:
+    CPU: embedding_dense_backward_cpu
+    CUDA: embedding_dense_backward_cuda
+    MPS: embedding_dense_backward_mps
+  autogen: embedding_dense_backward.out
+  tags: core
+
+- func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
+  dispatch:
+    CPU: embedding_renorm_cpu_
+    CUDA: embedding_renorm_cuda_
+  autogen: embedding_renorm, embedding_renorm.out
+
+- func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
+
+# NOTE [ embedding_bag Native Functions ]
+# The `_embedding_bag.*` variants assume that input tensors except for `weight`,
+# e.g. `indices` and `offsets` (and `offset2bag`), are contiguous.
+# We really only need to enforce this for `_embedding_bag` (the forward) because
+# the backward inputs are the same as forward ones.
+# The above `embedding_bag` wrapper is created to achieve this, e.g.,
+# applying indices = indices.contiguous().
+# The backward functions apply a check that these input tensors are contiguous.
+
+
+- func: _embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: _embedding_bag_forward_only_cpu
+    CUDA: _embedding_bag_forward_only_cuda
+  autogen: _embedding_bag_forward_only.out
+
+- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
+
+# row_stack is the alias of vstack
+- func: row_stack(Tensor[] tensors) -> Tensor
+
+- func: row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor)
+
+# To keep backward and forward compatibility, and to avoid ambiguity with the
+# original signature above, scale_grad_by_freq, mode, sparse,
+# per_sample_weights, and include_last_offset parameters do not have default
+# values. Once the original signature is removed, default values can be added.
+- func: embedding_bag.padding_idx(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, bool include_last_offset, int? padding_idx) -> (Tensor, Tensor, Tensor, Tensor)
+
+- func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: _embedding_bag_cpu
+    CUDA: _embedding_bag_cuda
+  autogen: _embedding_bag.out
+  tags: core
+
+- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _embedding_bag_backward_symint
+
+- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _embedding_bag_sparse_backward_symint
+
+- func: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+  dispatch:
+    CPU: _embedding_bag_dense_backward_cpu
+    CUDA: _embedding_bag_dense_backward_cuda
+  autogen: _embedding_bag_dense_backward.out
+
+- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor
+  dispatch:
+    CPU: _embedding_bag_per_sample_weights_backward_cpu
+    CUDA: _embedding_bag_per_sample_weights_backward_cuda
+  autogen: _embedding_bag_per_sample_weights_backward.out
+
+- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: empty_names
+  autogen: empty.names_out
+
+- func: empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  dispatch:
+    CPU: empty_cpu
+    CUDA: empty_cuda
+    MPS: empty_mps
+    Meta: empty_meta_symint
+    MkldnnCPU: empty_mkldnn
+    SparseCPU, SparseCUDA, SparseMeta: empty_sparse
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed
+    QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
+  tags: core
+
+- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: empty_permuted_symint
+  autogen: empty_permuted.out
+
+# We do not make new_empty a composite that calls into new_empty_strided, as the strided version
+# is significantly more difficult to implement by different backends
+- func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: new_empty_symint
+  autogen: new_empty.out
+
+- func: new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  variants: method
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: new_empty_strided_symint
+  autogen: new_empty_strided.out
+
+- func: new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  variants: method
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: new_full
+  autogen: new_full.out
+
+- func: new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  variants: method
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: new_zeros
+  autogen: new_zeros.out
+
+- func: new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  variants: method
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: new_ones
+  autogen: new_ones.out
+
+# other overrides are to provide a more helpful error message that dtype is required
+- func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+  dispatch:
+    CPU: empty_affine_quantized_other_backends_stub
+    QuantizedCPU, QuantizedCUDA: empty_affine_quantized
+  autogen: _empty_affine_quantized.out
+
+# it's a factory function receiving a tensor argument, thus overriding explicitly
+# other overrides are to provide a more helpful error message that dtype is required
+- func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+  category_override: factory
+  dispatch:
+    CPU: empty_per_channel_affine_quantized_other_backends_stub
+    QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized
+  autogen: _empty_per_channel_affine_quantized.out
+
+- func: resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: [core, inplace_view]
+  dispatch:
+    Meta: resize__symint
+    CPU: resize_
+    CUDA: resize_cuda_
+    MPS: resize_mps_
+    QuantizedCPU: quantized_resize_cpu_
+    SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_
+  autogen: resize, resize.out
+
+# This is a utility function to enable users to resize out tensor while registering kernels for out variants.
+# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration
+# to make it easy to register out variants for ops.
+- func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: function
+  dispatch:
+    Meta: _resize_output_
+  autogen: _resize_output, _resize_output.out
+
+- func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  category_override: factory
+  variants: function
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: empty_quantized
+  autogen: empty_quantized.out
+
+- func: empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  device_guard: False
+
+- func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: empty_like
+    QuantizedCPU, QuantizedCUDA: empty_like_quantized
+    SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: empty_like_nested
+  autogen: empty_like.out
+
+- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CPU: empty_strided_cpu
+    CUDA: empty_strided_cuda
+    MPS: empty_strided_mps
+    Meta: empty_strided_meta_symint
+    QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized
+  autogen: empty_strided.out
+  tags: core
+
+- func: erf(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: erf.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: erf_sparse
+    SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr
+  tags: [core, pointwise]
+
+- func: erf_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: erf.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: erf_sparse_
+    SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_
+  tags: pointwise
+
+- func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: erf_out
+    MPS: erf_out_mps
+    SparseCPU, SparseCUDA: erf_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out
+  tags: pointwise
+
+- func: erfc(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: erfc.out
+  variants: function, method
+  tags: pointwise
+
+- func: erfc_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: erfc.out
+  variants: function, method
+  tags: pointwise
+
+- func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: erfc_out
+  tags: pointwise
+
+- func: exp(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: exp.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: exp_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: exp.out
+  variants: function, method
+  tags: pointwise
+
+- func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: exp_out
+    MPS: exp_out_mps
+  tags: pointwise
+
+- func: exp2(Tensor self) -> Tensor
+  structured_delegate: exp2.out
+  variants: function, method
+  tags: pointwise
+
+- func: exp2_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: exp2.out
+  variants: function, method
+  tags: pointwise
+
+- func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: exp2_out
+    MPS: exp2_out_mps
+  tags: pointwise
+
+- func: expm1(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: expm1.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: expm1_sparse
+    SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr
+  tags: [core, pointwise]
+
+- func: expm1_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: expm1.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: expm1_sparse_
+    SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_
+  tags: pointwise
+
+- func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: expm1_out
+    MPS: expm1_out_mps
+    SparseCPU, SparseCUDA: expm1_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out
+  tags: pointwise
+
+- func: expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
+  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: expand
+  tags: core
+
+- func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a)
+  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
+  device_check: NoCheck
+  device_guard: False
+
+# decomposes to eye.m
+- func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: eye
+
+- func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: eye
+
+- func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, Meta: eye_out_cpu
+    CUDA: eye_out_cuda
+    MPS: eye_out_mps
+
+- func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, Meta: eye_out_cpu
+    CUDA: eye_out_cuda
+    MPS: eye_out_mps
+
+- func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
+  variants: function, method
+
+- func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a)
+  variants: function, method
+
+- func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a)
+  variants: function, method
+
+- func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
+  variants: function, method
+
+- func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: unflatten_symint
+
+- func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: unflatten_dimname_symint
+
+- func: fill.Scalar(Tensor self, Scalar value) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fill
+  tags: core
+
+- func: fill.Tensor(Tensor self, Tensor value) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fill
+
+- func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: fill_
+    MPS: fill_scalar_mps
+    QuantizedCPU, QuantizedCUDA: fill_quantized_
+    Meta: fill_meta_
+    SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: fill_nested_
+  autogen: fill.Scalar_out
+
+- func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: fill_
+    MPS: fill_tensor_mps_
+    QuantizedCPU, QuantizedCUDA: fill_quantized_
+    Meta: fill_meta_
+    NestedTensorCPU, NestedTensorCUDA: fill_nested_
+  autogen: fill.Tensor_out
+
+- func: floor(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: floor.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: floor_sparse
+    SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr
+  tags: [core, pointwise]
+
+- func: floor_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: floor.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: floor_sparse_
+    SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_
+  tags: pointwise
+
+- func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: floor_out
+    MPS: floor_out_mps
+    SparseCPU, SparseCUDA: floor_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out
+  tags: pointwise
+
+- func: floor_divide(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: floor_divide
+    MPS: floor_divide_mps
+    SparseCPU, SparseCUDA: floor_divide_sparse
+
+- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU, CUDA: floor_divide_
+    MPS: floor_divide_mps_
+    SparseCPU, SparseCUDA: floor_divide_sparse_
+
+- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: floor_divide_out
+    MPS: floor_divide_out_mps
+    SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
+
+- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: floor_divide
+
+- func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: floor_divide_
+  autogen: floor_divide.Scalar_out
+
+- func: frac(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: frac.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: frac_sparse
+    SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr
+  tags: pointwise
+
+- func: frac_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: frac.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: frac_sparse_
+    SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_
+  tags: pointwise
+
+- func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: frac_out
+    MPS: frac_out_mps
+    SparseCPU, SparseCUDA: frac_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_out
+  tags: pointwise
+
+- func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: full
+  autogen: full.names_out
+
+- func: full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: full
+  tags: core
+
+- func: full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: full_out
+
+- func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: full_like
+  autogen: full_like.out
+
+- func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CPU: from_file
+  autogen: from_file.out
+
+- func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: gcd_out
+  tags: pointwise
+
+- func: gcd(Tensor self, Tensor other) -> Tensor
+  structured_delegate: gcd.out
+  variants: function, method
+  tags: pointwise
+
+- func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: gcd.out
+  variants: function, method
+
+- func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: lcm_out
+  tags: pointwise
+
+- func: lcm(Tensor self, Tensor other) -> Tensor
+  structured_delegate: lcm.out
+  variants: function, method
+  tags: pointwise
+
+- func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: lcm.out
+  variants: function, method
+
+# NOTE [ grid_sampler Native Functions ]
+# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to
+# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of
+# which has the corresponding backward defined as native functions as well.
+# However, we do shape checking everywhere for now since each of the mentioned
+# functions can be called directly, which will lead to crashes otherwise.
+# See https://github.com/pytorch/pytorch/issues/73187 for more information.
+#
+# There is also _grid_sampler_2d_backward_cpu_fallback which is an
+# implementation detail of grid_sampler_2d and is only exposed here for testing
+# purposes.
+#
+# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to
+# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in
+# `interpolation_mode` because it only supports Bilinear interpolation mode.
+# Nor does it take in `align_corners` because it only supports the mode
+# `align_corners = True`.
+- func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+
+- func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+  dispatch:
+    CPU, QuantizedCPU: grid_sampler_2d_cpu
+    CUDA: grid_sampler_2d_cuda
+    MPS: grid_sampler_2d_mps
+  autogen: grid_sampler_2d.out
+  tags: core
+
+# `grid_sampler_2d_backward` takes in `output_mask` to optimize performance for
+# the case where `input` doesn't require gradient. Gradient for `grid` is always
+# computed (only `output_mask[0]` is checked by the implementations).
+- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)
+  dispatch:
+    CPU: grid_sampler_2d_backward_cpu
+    CUDA: grid_sampler_2d_backward_cuda
+  autogen: grid_sampler_2d_backward.out
+
+# See NOTE [ grid_sample CPU fallback ]
+- func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _grid_sampler_2d_cpu_fallback
+  autogen: _grid_sampler_2d_cpu_fallback.out
+
+- func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
+
+- func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+  dispatch:
+    CPU: grid_sampler_3d_cpu
+    CUDA: grid_sampler_3d_cuda
+  autogen: grid_sampler_3d.out
+
+# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for
+# the case where `input` doesn't require gradient. Gradient for `grid` is always
+# computed (only `output_mask[0]` is checked by the implementations).
+- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)
+  dispatch:
+    CPU: grid_sampler_3d_backward_cpu
+    CUDA: grid_sampler_3d_backward_cuda
+  autogen: grid_sampler_3d_backward.out
+
+- func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: hann_window
+  autogen: hann_window.out
+
+- func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: hann_window
+  autogen: hann_window.periodic_out
+
+- func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: hamming_window
+  autogen: hamming_window.out
+
+- func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: hamming_window
+  autogen: hamming_window.periodic_out
+
+- func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: hamming_window
+  autogen: hamming_window.periodic_alpha_out
+
+- func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: hamming_window
+  autogen: hamming_window.periodic_alpha_beta_out
+
+- func: kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: kaiser_window
+  autogen: kaiser_window.out
+
+- func: kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: kaiser_window
+  autogen: kaiser_window.periodic_out
+
+- func: kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: kaiser_window
+  autogen: kaiser_window.beta_out
+
+- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
+
+- func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+
+- func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU, CUDA: native_group_norm
+    CompositeExplicitAutograd: math_group_norm
+  autogen: native_group_norm.out
+  tags: core
+
+- func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU, CUDA: native_group_norm_backward
+  autogen: native_group_norm_backward.out
+  tags: core
+
+# Real to complex forward FFT
+- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _fft_r2c_mkl
+    CUDA: _fft_r2c_cufft
+    MPS: _fft_r2c_mps
+
+- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: _fft_r2c_mkl_out
+    CUDA: _fft_r2c_cufft_out
+    MPS: _fft_r2c_mps_out
+
+# Complex to real inverse FFT
+- func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _fft_c2r_mkl
+    CUDA: _fft_c2r_cufft
+    MPS: _fft_c2r_mps
+
+- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: _fft_c2r_mkl_out
+    CUDA: _fft_c2r_cufft_out
+    MPS: _fft_c2r_mps_out
+
+# Standard complex to complex FFT (forward or backward)
+- func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _fft_c2c_mkl
+    CUDA: _fft_c2c_cufft
+    MPS: _fft_c2c_mps
+
+- func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CPU: _fft_c2c_mkl_out
+    CUDA: _fft_c2c_cufft_out
+    MPS: _fft_c2c_mps_out
+
+- func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: _validate_compressed_sparse_indices_cpu
+    CUDA: _validate_compressed_sparse_indices_cuda
+
+- func: _cufft_get_plan_cache_size(DeviceIndex device_index) -> int
+
+- func: _cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int
+
+- func: _cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> ()
+
+- func: _cufft_clear_plan_cache(DeviceIndex device_index) -> ()
+
+- func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: index.Tensor_out
+  variants: function, method
+  dispatch:
+    QuantizedCPU: quantized_index
+  tags: [core, dynamic_output_shape]
+  # NB: This function is special-cased in tools/autograd/gen_variable_type.py
+  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
+  # - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
+  # - Tensor Tensor::index(std::initializer_list<TensorIndex> indices)
+
+- func: index.Tensor_out(Tensor self, Tensor?[] indices, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  structured: True
+  structured_inherits: TensorIteratorBase
+  precomputed:
+  - indices -> DimVector sizes, DimVector strides
+  dispatch:
+    CPU, CUDA, MPS: index_out
+
+# Used by inductor to signal indexing without bounds checks
+# Note that we don't support boolean indexing, to avoid dynamic output shapes
+- func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _unsafe_index
+
+- func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  precomputed:
+  - dim -> int dim
+  dispatch:
+    CPU, CUDA: index_copy_out
+
+- func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
+  variants: method
+  structured_delegate: index_copy.out
+
+- func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
+  variants: function, method
+  structured_delegate: index_copy.out
+
+- func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
+  variants: method
+
+- func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
+  variants: function, method
+
+- func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)
+  device_check: NoCheck   # delegate to _index_put_impl_, which leverages TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: index_put_
+  autogen: index_put.out
+  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
+  # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Tensor const & rhs)
+  # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Scalar v)
+  # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Tensor const & rhs)
+  # - Tensor & Tensor::index_put_(std::initializer_list<TensorIndex> indices, Scalar v)
+
+- func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+  device_check: NoCheck   # delegate to _index_put_impl_ after clone, which leverages TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: index_put
+  tags: core
+
+- func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+  device_check: NoCheck   # delegate to _index_put_impl_ after clone, which leverages TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _unsafe_index_put
+
+- func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CPU, CUDA, MPS: _index_put_impl_
+    QuantizedCPU: _index_put_impl_quantized_cpu_
+    QuantizedCUDA: _index_put_impl_quantized_cuda_
+  autogen: _index_put_impl, _index_put_impl.out
+
+- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
+  variants: function
+
+- func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor
+  variants: function, method
+
+- func: isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: isin_Tensor_Tensor_out
+
+- func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
+  variants: function
+  structured_delegate: isin.Tensor_Tensor_out
+
+- func: isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: isin_Tensor_Scalar_out
+
+- func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor
+  variants: function
+  structured_delegate: isin.Tensor_Scalar_out
+
+- func: isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: isin_Scalar_Tensor_out
+
+- func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
+  variants: function
+  structured_delegate: isin.Scalar_Tensor_out
+
+- func: isnan(Tensor self) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU, CUDA, MPS: isnan
+    SparseCPU, SparseCUDA: isnan_sparse
+    SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
+  autogen: isnan.out
+  tags: [core, pointwise]
+
+- func: is_distributed(Tensor self) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: is_floating_point(Tensor self) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: is_complex(Tensor self) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: is_conj(Tensor self) -> bool
+  variants: function, method
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: _is_zerotensor(Tensor self) -> bool
+  variants: function, method
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: is_neg(Tensor self) -> bool
+  variants: function, method
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: isreal(Tensor self) -> Tensor
+  variants: function, method
+
+- func: is_nonzero(Tensor self) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: is_same_size(Tensor self, Tensor other) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: nested_is_same_size
+    CompositeExplicitAutograd: is_same_size
+
+- func: is_signed(Tensor self) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: is_inference(Tensor self) -> bool
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor
+
+- func: kron(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+
+- func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: kthvalue
+
+- func: kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  dispatch:
+    CPU: kthvalue_out_cpu
+    CUDA: kthvalue_out_cuda
+
+- func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+
+- func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+
+- func: layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: layer_norm_symint
+
+- func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: layer_norm_cpu
+    CUDA: layer_norm_cuda
+    MPS: layer_norm_mps
+    CompositeExplicitAutograd: math_native_layer_norm
+    NestedTensorCPU, NestedTensorCUDA: nested_layer_norm
+  autogen: native_layer_norm.out
+  tags: core
+
+- func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: layer_norm_backward_cpu
+    CUDA: layer_norm_backward_cuda
+    MPS: layer_norm_backward_mps
+    NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
+  autogen: native_layer_norm_backward.out
+  tags: core
+
+- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: nan_to_num
+    SparseCPU, SparseCUDA: nan_to_num_sparse
+  tags: pointwise
+
+- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: nan_to_num_
+    SparseCPU, SparseCUDA: nan_to_num_sparse_
+  tags: pointwise
+
+- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: nan_to_num_out
+    MPS: nan_to_num_out_mps
+    SparseCPU, SparseCUDA: nan_to_num_sparse_out
+  tags: pointwise
+
+- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: linear
+    NestedTensorCPU, NestedTensorCUDA: nested_linear
+    MPS: _mps_linear
+
+- func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: nested_linear_backward
+    MPS: mps_linear_backward
+  autogen: linear_backward.out
+
+- func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: linear_out
+
+- func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
+  python_module: nn
+  dispatch:
+    MkldnnCPU: mkldnn_linear
+  autogen: mkldnn_linear.out
+
+- func: mkldnn_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_linear_backward_input
+  autogen: mkldnn_linear_backward_input.out
+
+- func: mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor)
+  dispatch:
+    MkldnnCPU: mkldnn_linear_backward_weights
+  autogen: mkldnn_linear_backward_weights.out
+
+- func: mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    MkldnnCPU: mkldnn_linear_backward
+  autogen: mkldnn_linear_backward.out
+
+- func: _cslt_compress(Tensor input) -> Tensor
+  dispatch:
+    CUDA: _cslt_compress
+
+- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
+  dispatch:
+    CUDA: _cslt_sparse_mm
+
+- func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
+  dispatch:
+    CUDA: _cslt_sparse_mm_search
+
+- func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
+  dispatch:
+    CUDA: _sparse_semi_structured_linear
+
+- func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
+  dispatch:
+    CUDA: _mixed_dtypes_linear
+
+- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
+
+- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
+
+- func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int)
+
+- func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
+
+- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
+
+- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
+
+- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
+
+- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor
+
+- func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+
+- func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: function, method
+  tags: pointwise
+
+- func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
+
+- func: linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: linspace
+
+- func: linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: linspace
+
+- func: linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: linspace
+
+- func: linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: linspace
+
+- func: linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, Meta: linspace_out
+    CUDA: linspace_cuda_out
+    MPS: linspace_out_mps
+
+- func: linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: linspace_out
+
+- func: linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: linspace_out
+
+- func: linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: linspace_out
+
+- func: log(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: log_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log.out
+  variants: function, method
+  tags: pointwise
+
+- func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: log_out
+    MPS: log_out_mps
+  tags: pointwise
+
+- func: log10(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log10.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: log10_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log10.out
+  variants: function, method
+  tags: pointwise
+
+- func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: log10_out
+    MPS: log10_out_mps
+  tags: pointwise
+
+- func: log1p(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log1p.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: log1p_sparse
+    SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr
+  tags: [core, pointwise]
+
+- func: log1p_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log1p.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: log1p_sparse_
+    SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_
+  tags: pointwise
+
+- func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: log1p_out
+    MPS: log1p_out_mps
+    SparseCPU, SparseCUDA: log1p_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out
+  tags: pointwise
+
+- func: log2(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log2.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: log2_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: log2.out
+  variants: function, method
+  tags: pointwise
+
+- func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: log2_out
+    MPS: log2_out_mps
+  tags: pointwise
+
+- func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: logaddexp_out
+    MPS: logaddexp_out_mps
+  tags: pointwise
+
+- func: logaddexp(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+  structured_delegate: logaddexp.out
+  tags: pointwise
+
+- func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: logaddexp2_out
+    MPS: logaddexp2_out_mps
+  tags: pointwise
+
+- func: logaddexp2(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+  structured_delegate: logaddexp2.out
+  tags: pointwise
+
+- func: xlogy.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: xlogy.OutTensor
+  variants: function, method
+  tags: pointwise
+
+- func: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: xlogy
+  tags: pointwise
+
+- func: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: xlogy
+  tags: pointwise
+
+# xlogy: inplace variant
+- func: xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: xlogy.OutTensor
+  tags: pointwise
+
+- func: xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: xlogy_
+
+# xlogy: out variant
+- func: xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  variants: function
+  dispatch:
+    CPU, CUDA: xlogy_out
+    MPS: xlogy_out_mps
+  tags: pointwise
+
+- func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: xlogy_out
+  tags: pointwise
+
+- func: xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: xlogy_out
+  tags: pointwise
+
+- func: logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: logspace
+
+- func: logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: logspace
+
+- func: logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: logspace
+
+- func: logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: logspace
+
+- func: logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, Meta: logspace_out
+    CUDA: logspace_cuda_out
+
+- func: logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: logspace_out
+
+- func: logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: logspace_out
+
+- func: logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+  category_override: factory
+  dispatch:
+    CompositeExplicitAutograd: logspace_out
+
+# log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models.
+- func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  variants: function, method
+
+- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: log_softmax_out
+
+- func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+  variants: function, method
+
+- func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+  structured_delegate: _log_softmax.out
+  tags: core
+
+- func: _log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: log_softmax_cpu_out
+    CUDA: log_softmax_cuda_out
+    MPS: log_softmax_mps_out
+
+- func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
+  structured_delegate: _log_softmax_backward_data.out
+
+- func: _log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: log_softmax_backward_cpu_out
+    CUDA: log_softmax_backward_cuda_out
+    MPS: log_softmax_backward_mps_out
+
+- func: _logcumsumexp(Tensor self, int dim) -> Tensor
+  dispatch:
+    CPU: _logcumsumexp_cpu
+    CUDA: _logcumsumexp_cuda
+
+- func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: _logcumsumexp_out_cpu
+    CUDA: _logcumsumexp_out_cuda
+
+- func: logcumsumexp(Tensor self, int dim) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: logcumsumexp
+
+- func: logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: logcumsumexp_out
+
+- func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor
+  variants: function, method
+
+- func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: logsumexp
+
+- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    # calls squeeze
+    CompositeExplicitAutogradNonFunctional: logsumexp_out
+
+- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
+
+- func: matmul(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: matmul
+    NestedTensorCPU, NestedTensorCUDA: matmul_nested
+
+- func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested
+  autogen: matmul_backward.out
+
+- func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeImplicitAutograd: matmul_out
+    NestedTensorCPU, NestedTensorCUDA: matmul_out_nested
+
+# Alias to linalg.matrix_power
+- func: matrix_power(Tensor self, int n) -> Tensor
+  variants: function, method
+
+# Alias to linalg.matrix_power
+- func: matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)
+
+# Alias to linalg.matrix_exp
+- func: matrix_exp(Tensor self) -> Tensor
+  variants: function, method
+
+# This function should be deprecated in favor of differential_analytic_matrix_function in FunctionsManual.cpp
+- func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor
+
+# DEPRECATED: Use torch.aminmax instead
+- func: _aminmax(Tensor self) -> (Tensor, Tensor)
+  dispatch:
+    CPU, CUDA: _aminmax_all
+  autogen: _aminmax.out
+
+# DEPRECATED: Use torch.aminmax instead
+- func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)
+  dispatch:
+    CPU, CUDA: _aminmax
+  autogen: _aminmax.dim_out
+
+- func: aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: aminmax.out
+  variants: function, method
+
+- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: aminmax_out
+    MPS: aminmax_out_mps
+
+- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
+  dispatch:
+    CPU, CUDA: _compute_linear_combination
+
+- func: _compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: _compute_linear_combination_out
+
+- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: max.dim_max
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: qmax
+  tags: core
+
+- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  precomputed:
+  - dim -> int dim
+  dispatch:
+    CPU, CUDA: max_out
+    MPS: max_out_mps
+
+- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+
+- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: value_selecting_reduction_backward_symint
+
+- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+  variants: function, method
+  structured_delegate: amax.out
+  tags: core
+
+- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU, CUDA: amax_out
+    MPS: amax_out_mps
+
+# Return: (Tensor output, Tensor indices)
+- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+
+- func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
+
+- func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: max_pool2d
+    MPS: mps_max_pool2d
+
+- func: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MPS: mps_max_pool2d_backward
+  autogen: max_pool2d_backward.out
+
+- func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_max_pool2d
+  autogen: mkldnn_max_pool2d.out
+
+- func: mkldnn_max_pool2d_backward(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_max_pool2d_backward
+  autogen: mkldnn_max_pool2d_backward.out
+
+- func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_max_pool3d
+  autogen: mkldnn_max_pool3d.out
+
+- func: mkldnn_max_pool3d_backward(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_max_pool3d_backward
+  autogen: mkldnn_max_pool3d_backward.out
+
+- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    QuantizedCPU: quantized_max_pool1d
+  autogen: quantized_max_pool1d.out
+
+- func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    QuantizedCPU: quantized_max_pool2d
+    QuantizedCUDA: quantized_max_pool2d_cudnn
+  autogen: quantized_max_pool2d.out
+
+- func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    QuantizedCPU: quantized_max_pool3d
+  autogen: quantized_max_pool3d.out
+
+- func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+
+# The CPU and GPU dispatch variants are named weirdly here because otherwise there
+# are namespacing issues in C++
+- func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: mean
+  tags: core
+
+# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
+# FIXME: fix CI jobs and re-enable this
+#- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+#  device_check: NoCheck   # TensorIterator
+#  dispatch:
+#    CompositeExplicitAutograd: mean_dtype_out
+
+- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  structured_delegate: mean.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    QuantizedCPU: mean_quantized_cpu
+  tags: core
+
+- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: mean_out
+    MPS: mean_out_mps
+    QuantizedCPU: mean_out_quantized_cpu
+
+- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # Composite
+  variants: function, method
+
+- func: nanmean.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # Composite
+
+- func: median(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: median_cpu
+    CUDA: median_cuda
+    MPS: median_mps
+  autogen: median.out
+
+- func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: median
+
+- func: median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  dispatch:
+    CPU: median_out_cpu
+    CUDA: median_out_cuda
+    MPS: median_out_mps
+
+- func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+
+- func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+
+- func: nanmedian(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: nanmedian_cpu
+    CUDA: nanmedian_cuda
+  autogen: nanmedian.out
+
+- func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: nanmedian
+
+- func: nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  dispatch:
+    CPU: nanmedian_out_cpu
+    CUDA: nanmedian_out_cuda
+
+- func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+
+- func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+
+- func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: min.dim_min
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: qmin
+  tags: core
+
+- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  precomputed:
+  - dim -> int dim
+  dispatch:
+    CPU, CUDA: min_out
+    MPS: min_out_mps
+
+- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+
+- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+  variants: function, method
+  structured_delegate: amin.out
+  tags: core
+
+- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU, CUDA: amin_out
+    MPS: amin_out_mps
+
+# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
+# native_functions.yaml
+# https://github.com/pytorch/pytorch/issues/77394
+- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    MPS: _mps_convolution
+  autogen: _mps_convolution.out
+
+- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    MPS: mps_convolution_backward
+  autogen: mps_convolution_backward.out
+
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: mkldnn_convolution
+  autogen: mkldnn_convolution.out
+
+- func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: mkldnn_rnn_layer
+    MkldnnCPU: mkldnn_rnn_layer
+  autogen: mkldnn_rnn_layer.out
+
+- func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: mkldnn_rnn_layer_backward
+  autogen: mkldnn_rnn_layer_backward.out
+
+- func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: miopen_batch_norm
+  autogen: miopen_batch_norm.out
+
+- func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: miopen_batch_norm_backward
+  autogen: miopen_batch_norm_backward.out
+
+- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+  dispatch:
+    CUDA: miopen_convolution
+  autogen: miopen_convolution.out
+
+- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+  dispatch:
+    CUDA: miopen_convolution_transpose
+  autogen: miopen_convolution_transpose.out
+
+- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+  dispatch:
+    CUDA: miopen_depthwise_convolution
+  autogen: miopen_depthwise_convolution.out
+
+- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    CUDA: miopen_convolution_relu
+
+- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+  dispatch:
+    CUDA: miopen_convolution_add_relu
+
+- func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: miopen_rnn
+  autogen: miopen_rnn.out
+  tags: nondeterministic_seeded
+
+
+- func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
+  dispatch:
+    CUDA: miopen_rnn_backward
+  autogen: miopen_rnn_backward.out
+
+- func: mm(Tensor self, Tensor mat2) -> Tensor
+  structured_delegate: mm.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: _sparse_mm
+    SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm
+  tags: core
+
+- func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: mm_out_cpu
+    CUDA: mm_out_cuda
+    MPS: mm_out_mps
+    SparseCPU, SparseCUDA: _sparse_mm_out
+    SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
+
+- func: _int_mm(Tensor self, Tensor mat2) -> Tensor
+  dispatch:
+    CUDA: _int_mm_cuda
+
+- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CUDA: _int_mm_out_cuda
+
+- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
+  dispatch:
+    CPU: _convert_weight_to_int4pack_cpu
+    CUDA: _convert_weight_to_int4pack_cuda
+
+- func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
+  dispatch:
+    CPU: _weight_int4pack_mm_cpu
+    CUDA: _weight_int4pack_mm_cuda
+
+- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
+  dispatch:
+    CPU: _weight_int8pack_mm_cpu
+
+- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
+  python_module: sparse
+
+- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
+  python_module: sparse
+
+- func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
+  dispatch:
+    SparseCPU: sparse_sparse_matmul_cpu
+    SparseCUDA: sparse_sparse_matmul_cuda
+  autogen: _sparse_sparse_matmul.out
+
+- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+  dispatch:
+    CPU, CUDA: mode
+
+- func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  dispatch:
+    CompositeExplicitAutograd: mode_out
+
+- func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+  variants: function, method
+
+- func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+
+- func: mul.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: mul.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: mul_sparse
+    SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr
+    MkldnnCPU: mkldnn_mul
+    ZeroTensor: mul_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
+  tags: [core, pointwise]
+
+- func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: mul.out
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: mul_sparse_
+    SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_
+    MkldnnCPU: mkldnn_mul_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
+  tags: pointwise
+
+- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: mul_out
+    MPS: mul_out_mps
+    SparseCPU: mul_out_sparse_cpu
+    SparseCUDA: mul_out_sparse_cuda
+    SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr
+    MkldnnCPU: mkldnn_mul_out
+  tags: pointwise
+  # For C++ only, until we have conversion from C++ numbers to Tensor
+
+- func: mul.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: mul
+    SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
+  tags: [core, pointwise]
+
+- func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: mul_
+    SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar
+  autogen: mul.Scalar_out
+  tags: pointwise
+# multiply, alias for mul
+
+- func: multiply.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+
+- func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: multiply.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: function, method
+
+- func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: mv(Tensor self, Tensor vec) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: mv
+    SparseCPU, SparseCUDA: mv_sparse
+
+- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: mv_out
+
+- func: mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: mvlgamma_out
+  tags: pointwise
+
+- func: mvlgamma(Tensor self, int p) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: mvlgamma
+  tags: pointwise
+
+- func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: mvlgamma_
+  tags: pointwise
+
+- func: narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU: narrow_copy_dense_cpu
+    SparseCPU, SparseCUDA: narrow_copy_sparse
+    CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
+  tags: view_copy
+
+- func: narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: narrow_copy_dense_cpu_out
+
+- func: narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: narrow_symint
+    NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint
+
+- func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: narrow_tensor_symint
+
+- func: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: batch_norm_cpu
+    CUDA: batch_norm_cuda
+    MPS: batch_norm_mps
+    MkldnnCPU: mkldnn_batch_norm
+
+- func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+  dispatch:
+    CUDA: batch_norm_cuda_out
+    MPS: batch_norm_mps_out
+    CPU: batch_norm_cpu_out
+
+# TODO: In 2 weeks, we should make native_batch_norm composite implicit so that this correct schema percolates correctly through our dispatching
+- func: _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: _batch_norm_legit_cpu
+    CUDA: _batch_norm_legit_cuda
+    MPS: _batch_norm_legit_mps
+    MkldnnCPU: _mkldnn_batch_norm_legit
+  autogen: _native_batch_norm_legit_functional
+  tags: core
+
+# HACK: identical to _native_batch_norm_legit, but training is known to be False,
+# So we known that running stats will not be mutated.
+# The real fix here is batch norm consolidation.
+- func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CompositeExplicitAutograd: _batch_norm_legit_no_training
+  autogen: _native_batch_norm_legit_no_training.out
+  tags: core
+
+- func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
+  dispatch:
+    CPU: _batch_norm_legit_cpu_out
+    CUDA: _batch_norm_legit_cuda_out
+    MPS: _batch_norm_legit_mps_out
+
+- func: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: _batch_norm_legit_no_stats_cpu
+    CUDA: _batch_norm_legit_no_stats_cuda
+    MPS: _batch_norm_legit_no_stats_mps
+    MkldnnCPU: _mkldnn_batch_norm_legit_no_stats
+  tags: core
+
+- func: _native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+  dispatch:
+    CPU: _batch_norm_legit_no_stats_cpu_out
+    CUDA: _batch_norm_legit_no_stats_cuda_out
+    MPS: _batch_norm_legit_no_stats_mps_out
+
+- func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: batch_norm_stats_cuda
+  autogen: batch_norm_stats.out
+
+- func: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor
+  dispatch:
+    CUDA: batch_norm_elemt_cuda
+
+- func: batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CUDA: batch_norm_elemt_cuda_out
+
+# for backward compatibility
+- func: batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: batch_norm_gather_stats_cuda
+  autogen: batch_norm_gather_stats.out
+
+- func: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: batch_norm_gather_stats_with_counts_cuda
+  autogen: batch_norm_gather_stats_with_counts.out
+
+- func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: batch_norm_backward_cpu
+    CUDA: batch_norm_backward_cuda
+    MPS: batch_norm_backward_mps
+    MkldnnCPU: mkldnn_batch_norm_backward
+  autogen: native_batch_norm_backward.out
+
+- func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: batch_norm_backward_reduce_cuda
+  autogen: batch_norm_backward_reduce.out
+
+- func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor
+  dispatch:
+    CUDA: batch_norm_backward_elemt_cuda
+  autogen: batch_norm_backward_elemt.out
+
+- func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor)
+  dispatch:
+    CPU: batch_norm_update_stats_cpu
+    CUDA: batch_norm_update_stats_cuda
+  autogen: batch_norm_update_stats.out
+
+- func: is_vulkan_available() -> bool
+
+- func: _nnpack_available() -> bool
+
+- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _nnpack_spatial_convolution
+  autogen: _nnpack_spatial_convolution.out
+
+- func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: ones
+  autogen: ones.names_out
+
+- func: ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: ones
+
+- func: ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: ones_out
+
+- func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: ones_like
+    NestedTensorCPU, NestedTensorCUDA: ones_like
+  autogen: ones_like.out
+
+- func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
+
+- func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor
+
+- func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _euclidean_dist
+  autogen: _euclidean_dist.out
+
+- func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
+  dispatch:
+    CPU, CUDA: _cdist_forward
+    MPS: _cdist_forward_mps
+  autogen: _cdist_forward.out
+  tags: core
+
+- func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
+  dispatch:
+    CPU, CUDA: _cdist_backward
+  autogen: _cdist_backward.out
+
+- func: pdist(Tensor self, float p=2) -> Tensor
+
+- func: _pdist_forward(Tensor self, float p=2) -> Tensor
+  dispatch:
+    CPU, CUDA: _pdist_forward
+  autogen: _pdist_forward.out
+  tags: core
+
+- func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
+  dispatch:
+    CPU, CUDA: _pdist_backward
+  autogen: _pdist_backward.out
+
+- func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor
+  variants: function
+
+- func: permute(Tensor(a) self, int[] dims) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: permute
+    MPS: permute_mps
+    SparseCPU, SparseCUDA: permute_sparse_coo
+  tags: core
+
+- func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
+  variants: function, method
+
+- func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a)
+  variants: function, method
+
+# moveaxis, alias for movedim
+- func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
+  variants: function, method
+
+- func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a)
+  variants: function, method
+
+# Only exposed from C++ -- in Python,
+# we expose it as an attribute `T`, not a function.
+#
+# I'd like to name this "T" in C++ too, but
+# calling a native function "T" causes undefined
+# behavior on Windows, for reasons I don't understand
+# (maybe related to capital letter collation somehow...)
+- func: numpy_T(Tensor(a) self) -> Tensor(a)
+  variants: method
+
+# Exposed on Python as an attribute 'H'
+- func: matrix_H(Tensor(a) self) -> Tensor(a)
+  variants: method
+
+# Exposed on Python as an attribute 'mT'
+- func: mT(Tensor(a) self) -> Tensor(a)
+  variants: method
+
+# Exposed on Python as an attribute 'mH'
+- func: mH(Tensor(a) self) -> Tensor(a)
+  variants: method
+
+- func: adjoint(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+
+- func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
+  dispatch:
+    CPU: pixel_shuffle_cpu
+    MPS: pixel_shuffle_mps
+    CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
+  autogen: pixel_shuffle.out
+
+- func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
+  dispatch:
+    CPU: pixel_unshuffle_cpu
+    MPS: pixel_unshuffle_mps
+    CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle
+  autogen: pixel_unshuffle.out
+
+- func: channel_shuffle(Tensor self, SymInt groups) -> Tensor
+  dispatch:
+    CPU, CUDA: channel_shuffle
+    QuantizedCPU: channel_shuffle_quantized_cpu
+  autogen: channel_shuffle.out
+
+- func: native_channel_shuffle(Tensor self, SymInt groups) -> Tensor
+  dispatch:
+    CPU: channel_shuffle_cpu
+    CompositeImplicitAutograd: math_channel_shuffle
+
+- func: is_pinned(Tensor self, Device? device=None) -> bool
+  variants: method
+  dispatch:
+    NestedTensorCUDA, CUDA: is_pinned_cuda
+    MPS: is_pinned_mps
+    CompositeExplicitAutograd: is_pinned_default
+
+# TODO: add a copy kwarg that guarantees that the tensor is put into fresh
+# pinned memory
+- func: pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a)
+  variants: method
+
+# Unlike pin_memory, this is guaranteed to give a new non-aliasing tensor
+- func: _pin_memory(Tensor self, Device? device=None) -> Tensor
+  dispatch:
+    CUDA: _pin_memory_cuda
+    MPS: _pin_memory_mps
+    NestedTensorCUDA, NestedTensorCPU: _pin_memory_nested
+  autogen: _pin_memory.out
+
+- func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
+  variants: function, method
+
+- func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor
+  variants: function
+
+- func: rad2deg(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: rad2deg
+    SparseCPU, SparseCUDA: rad2deg_sparse
+    SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr
+
+- func: rad2deg_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: rad2deg_
+    SparseCPU, SparseCUDA: rad2deg_sparse_
+    SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_
+
+- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: rad2deg_out
+    SparseCPU, SparseCUDA: rad2deg_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_out
+
+- func: deg2rad(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: deg2rad
+    SparseCPU, SparseCUDA: deg2rad_sparse
+    SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr
+  tags: pointwise
+
+- func: deg2rad_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: deg2rad_
+    SparseCPU, SparseCUDA: deg2rad_sparse_
+    SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_
+  tags: pointwise
+
+- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: deg2rad_out
+    SparseCPU, SparseCUDA: deg2rad_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_out
+  tags: pointwise
+
+- func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: scalar_tensor
+  autogen: scalar_tensor.out
+  tags: core
+
+- func: rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: rand
+  autogen: rand.names_out
+  tags: nondeterministic_seeded
+
+- func: rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: rand
+  autogen: rand.generator_with_names_out
+
+- func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: [core, nondeterministic_seeded]
+  dispatch:
+    CompositeExplicitAutograd: rand
+
+- func: rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: rand
+
+- func: rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: rand_out
+
+- func: rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: rand_like
+  autogen: rand_like.out
+
+- func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint
+
+- func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint
+
+- func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint
+
+- func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint
+
+- func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint_out
+
+- func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint_out
+
+- func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint_out
+
+- func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randint_out
+
+- func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: randint_like
+  autogen: randint_like.out
+
+- func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd: randint_like
+  autogen: randint_like.low_dtype_out
+
+- func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: [core, nondeterministic_seeded]
+  dispatch:
+    CompositeExplicitAutograd: randn
+
+- func: randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randn
+
+- func: randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: randn
+  autogen: randn.names_out
+
+- func: randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: randn
+  autogen: randn.generator_with_names_out
+
+- func: randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+
+- func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like
+  autogen: randn_like.out
+
+- func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: [core, nondeterministic_seeded]
+  dispatch:
+    CompositeExplicitAutograd: randperm
+
+- func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randperm
+
+- func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: randperm_out
+
+- func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU: randperm_out_cpu
+    CUDA: randperm_out_cuda
+    MPS: randperm_out_mps
+
+- func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: range
+
+- func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: range
+
+- func: range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: range_out_no_step
+
+- func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, Meta: range_out
+    CUDA: range_cuda_out
+    MPS: range_mps_out
+  cpp_no_default_args: ['step']
+
+- func: ravel(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+
+- func: reciprocal(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: reciprocal.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: reciprocal_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: reciprocal.out
+  variants: function, method
+  tags: pointwise
+
+- func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: reciprocal_out
+    MPS: reciprocal_out_mps
+  tags: pointwise
+
+- func: neg(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: neg.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: neg_sparse
+    SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
+  tags: [core, pointwise]
+
+- func: neg_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: neg.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: neg_sparse_
+    SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
+  tags: pointwise
+
+- func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: neg_out
+    MPS: neg_out_mps
+    SparseCPU, SparseCUDA: neg_out_sparse
+    SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out
+  tags: pointwise
+# Alias for neg
+
+- func: negative(Tensor self) -> Tensor
+  variants: function, method
+
+- func: negative_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: repeat(Tensor self, SymInt[] repeats) -> Tensor
+  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
+  dispatch:
+    CompositeExplicitAutograd: repeat
+    MPS: repeat_mps
+  autogen: repeat.out
+  tags: core
+
+- func: repeat_interleave.Tensor(Tensor repeats, *, SymInt? output_size=None) -> Tensor
+  variants: function
+  dispatch:
+    CPU: repeat_interleave_cpu
+    CUDA: repeat_interleave_cuda
+    MPS: repeat_interleave_mps
+  tags: dynamic_output_shape
+  autogen: repeat_interleave.Tensor_out
+
+- func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: repeat_interleave_symint
+
+- func: repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: repeat_interleave_symint
+
+- func: reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: reshape_symint
+    CompositeImplicitAutogradNestedTensor: reshape_nested_symint
+
+- func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _reshape_copy_symint
+
+# NOTE [ _reshape_alias ] is meant to be used in the implementation of reshape.
+# They are not user-facing, hence the leading underscore. Please don't use it
+# anywhere else.
+- func: _reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias
+    # We don't need to support mkldnn since this is handled explicitly by the reshape operator.
+
+- func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    MkldnnCPU: mkldnn_reshape
+  autogen: _mkldnn_reshape.out
+
+- func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: reshape_as
+    CompositeImplicitAutogradNestedTensor: reshape_as_nested
+
+- func: round(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: round.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: round_sparse
+    SparseCsrCPU, SparseCsrCUDA: round_sparse_csr
+  tags: [core, pointwise]
+
+- func: round_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: round.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: round_sparse_
+    SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_
+  tags: pointwise
+
+- func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU: round_out
+    CUDA: round_out
+    MPS: round_out_mps
+    SparseCPU, SparseCUDA: round_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out
+  tags: pointwise
+
+- func: round.decimals(Tensor self, *, int decimals) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: round.decimals_out
+  variants: function, method
+  tags: pointwise
+
+- func: round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: round.decimals_out
+  variants: function, method
+  tags: pointwise
+
+- func: round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU: round_decimals_out
+    CUDA: round_decimals_out
+  tags: pointwise
+
+- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+
+- func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  device_check: NoCheck   # TensorIterator
+
+- func: relu(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: relu
+    MPS: relu_mps
+    MkldnnCPU: mkldnn_relu
+    QuantizedCPU: relu_quantized_cpu
+    QuantizedCUDA: relu_quantized_cuda
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
+    SparseCPU, SparseCUDA: relu_sparse
+    SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr
+  tags: [core, pointwise]
+
+- func: relu_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: relu_
+    MPS: relu_mps_
+    MkldnnCPU: mkldnn_relu_
+    QuantizedCPU: relu_quantized_cpu_
+    QuantizedCUDA: relu_quantized_cuda_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
+    SparseCPU, SparseCUDA: relu_sparse_
+    SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr_
+  autogen: relu.out
+  tags: pointwise
+
+- func: relu6(Tensor self) -> Tensor
+  python_module: nn
+
+- func: relu6_(Tensor(a!) self) -> Tensor(a!)
+  python_module: nn
+
+- func: prelu(Tensor self, Tensor weight) -> Tensor
+  variants: function, method
+  autogen: prelu.out
+
+- func: _prelu_kernel(Tensor self, Tensor weight) -> Tensor
+  dispatch:
+    CPU, CUDA: _prelu_kernel
+    QuantizedCPU: _prelu_kernel_quantized_cpu
+    MkldnnCPU: mkldnn_prelu
+    MPS: prelu_mps
+
+- func: _prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
+  dispatch:
+    CPU, CUDA: _prelu_kernel_backward
+    MkldnnCPU: mkldnn_prelu_backward
+    MPS: prelu_backward_mps
+
+- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU: gelu_out_cpu
+    CUDA: gelu_out_cuda
+    MPS: gelu_out_mps
+
+- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
+  structured_delegate: gelu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    QuantizedCPU: gelu_quantized_cpu_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
+
+- func: gelu(Tensor self, *, str approximate='none') -> Tensor
+  structured_delegate: gelu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    MkldnnCPU: mkldnn_gelu
+    QuantizedCPU: gelu_quantized_cpu
+    QuantizedCUDA: gelu_quantized_cuda
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
+  tags: [core, pointwise]
+
+- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU: gelu_backward_out_cpu
+    CUDA: gelu_backward_out_cuda
+    MPS: gelu_backward_out_mps
+
+- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
+  structured_delegate: gelu_backward.grad_input
+  python_module: nn
+  dispatch:
+    MkldnnCPU: mkldnn_gelu_backward
+    NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested
+  tags: pointwise
+
+- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
+  variants: function
+  python_module: nn
+  device_check: NoCheck
+  device_guard: False
+
+- func: hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: hardshrink_out
+
+- func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
+  structured_delegate: hardshrink.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: hardshrink_backward_out
+
+- func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
+  structured_delegate: hardshrink_backward.grad_input
+  variants: function, method
+
+- func: rsqrt(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: rsqrt.out
+  variants: function, method
+  tags: [core, pointwise]
+
+- func: rsqrt_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: rsqrt.out
+  variants: function, method
+  tags: pointwise
+
+- func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: rsqrt_out
+    MPS: rsqrt_out_mps
+  tags: pointwise
+
+- func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: select_symint
+    SparseCsrCPU, SparseCsrCUDA: select_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: select_nested
+  tags: core
+
+- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: select_backward_symint
+  autogen: select_backward.out
+
+- func: _nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint
+
+- func: selu(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+
+- func: selu_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CompositeExplicitAutograd: celu
+
+- func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CompositeExplicitAutograd: celu_
+  autogen: celu.out
+
+- func: silu(Tensor self) -> Tensor
+  structured_delegate: silu.out
+  python_module: nn
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
+  tags: pointwise
+
+- func: silu_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: silu.out
+  python_module: nn
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
+  tags: pointwise
+
+- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: silu_out
+    MPS: silu_out_mps
+  tags: pointwise
+
+- func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: silu_backward_out
+    MPS: silu_backward_out_mps
+  tags: pointwise
+
+- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
+  structured_delegate: silu_backward.grad_input
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: math_silu_backward
+    NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
+  tags: pointwise
+
+- func: mish(Tensor self) -> Tensor
+  structured_delegate: mish.out
+  python_module: nn
+
+- func: mish_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: mish.out
+  python_module: nn
+
+- func: mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: mish_out
+    MPS: mish_out_mps
+
+- func: mish_backward(Tensor grad_output, Tensor self) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: mish_backward
+    MPS: mish_backward_mps
+    CompositeImplicitAutograd: math_mish_backward
+
+- func: sigmoid(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sigmoid.out
+  variants: function, method
+  dispatch:
+    QuantizedCPU: sigmoid_quantized_cpu
+    MkldnnCPU: mkldnn_sigmoid
+  tags: [core, pointwise]
+
+- func: sigmoid_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sigmoid.out
+  variants: function, method
+  dispatch:
+    MkldnnCPU: mkldnn_sigmoid_
+  tags: pointwise
+
+- func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sigmoid_out
+    MPS: sigmoid_out_mps
+  tags: pointwise
+
+- func: logit(Tensor self, float? eps=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU, CUDA: logit
+    MPS: logit_mps
+  tags: pointwise
+
+- func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CPU, CUDA: logit_
+  tags: pointwise
+
+- func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: logit_out
+    MPS: logit_out_mps
+  tags: pointwise
+
+- func: sin(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sin.out
+  variants: function, method
+  dispatch:
+    SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr
+    SparseCPU, SparseCUDA: sin_sparse
+    NestedTensorCPU, NestedTensorCUDA: sin_nested
+  tags: [core, pointwise]
+
+- func: sin_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sin.out
+  variants: function, method
+  dispatch:
+    SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_
+    SparseCPU, SparseCUDA: sin_sparse_
+  tags: pointwise
+
+- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sin_out
+    MPS: sin_out_mps
+    SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out
+    SparseCPU, SparseCUDA: sin_sparse_out
+  tags: pointwise
+
+- func: sinc(Tensor self) -> Tensor
+  structured_delegate: sinc.out
+  variants: function, method
+  tags: pointwise
+
+- func: sinc_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: sinc.out
+  variants: function, method
+  tags: pointwise
+
+- func: sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sinc_out
+  tags: pointwise
+
+- func: sinh(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sinh.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sinh_sparse
+    SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr
+  tags: [core, pointwise]
+
+- func: sinh_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sinh.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sinh_sparse_
+    SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_
+  tags: pointwise
+
+- func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sinh_out
+    MPS: sinh_out_mps
+    SparseCPU, SparseCUDA: sinh_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out
+
+# Returns a copy of this `Variable` that is detached from its autograd graph.
+# This method is OK to call if the `Variable` is a view.
+#
+# NOTE: Previously, if we change the tensor metadata (e.g. sizes / strides /
+# storage / storage_offset) of a tensor created from `detach()`, those metadata
+# in the original tensor will also be updated. However, the new behavior is that
+# those metadata changes to the detached tensor will not update the original tensor
+# anymore, and in the `detach()` function we need to set `allow_tensor_metadata_change_`
+# to false to make such changes explicitly illegal, in order to prevent users from
+# changing metadata of the detached tensor and expecting the original tensor to also
+# be updated.
+  tags: pointwise
+- func: detach(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: detach
+    NestedTensorCPU, NestedTensorCUDA: detach
+
+# Like `detach()`, but modifies this `Variable` in-place. This method may
+# only be called on non-view `Variable`s. You can use `is_view()` to check
+# this. If this `Variable` is a view, throws an `std::runtime_error()`.
+- func: detach_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: detach_
+
+- func: size.int(Tensor self, int dim) -> int
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: size.Dimname(Tensor self, Dimname dim) -> int
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: sym_size.int(Tensor self, int dim) -> SymInt
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  tags: core
+  manual_cpp_binding: True
+
+- func: sym_numel(Tensor self) -> SymInt
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  tags: core
+  manual_cpp_binding: True
+
+- func: sym_storage_offset(Tensor self) -> SymInt
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  tags: core
+  manual_cpp_binding: True
+
+- func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: slice
+  tags: core
+
+# NOTE: The implementation of split_with_sizes bypasses the dispatcher to call this; undo
+# that if adding specific implementations here!
+
+- func: slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: slice_backward
+  autogen: slice_backward.out
+
+# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
+# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification
+# of PT2 graph input subclass instances that are views. This means:
+# * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
+# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
+# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph
+#   input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is
+#   easier to implement for a subclass than as_strided()
+- func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: slice_inverse_symint
+
+- func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: slice_scatter
+  autogen: slice_scatter.out
+  tags: [core, view_copy]
+
+- func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: select_scatter_symint
+  autogen: select_scatter.out
+  tags: core
+
+- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: diagonal_scatter
+  autogen: diagonal_scatter.out
+
+- func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint
+  autogen: as_strided_scatter.out
+
+- func: smm(Tensor self, Tensor mat2) -> Tensor
+  variants: function, method
+
+# softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models.
+- func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  variants: function, method
+
+- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: softmax_out
+
+- func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+  variants: function, method
+
+- func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+  structured_delegate: _softmax.out
+  dispatch:
+    MkldnnCPU: mkldnn_softmax
+    NestedTensorCPU, NestedTensorCUDA: softmax_nested
+  tags: core
+
+- func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: softmax_cpu_out
+    CUDA: softmax_cuda_out
+    MPS: softmax_mps_out
+
+- func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
+  structured_delegate: _softmax_backward_data.out
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward
+
+- func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: softmax_backward_cpu_out
+    CUDA: softmax_backward_cuda_out
+    MPS: softmax_backward_mps_out
+
+- func: unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: unsafe_split
+  autogen: unsafe_split.Tensor_out
+
+- func: split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: split
+
+- func: split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: split_symint
+
+- func: unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: unsafe_split_with_sizes
+  autogen: unsafe_split_with_sizes.out
+
+- func: split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: split_with_sizes
+    NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
+  tags: core
+
+- func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+  variants: function, method
+
+- func: hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+  variants: function, method
+
+- func: vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+  variants: function, method
+
+- func: vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+  variants: function, method
+
+- func: dsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+  variants: function, method
+
+- func: dsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+  variants: function, method
+
+- func: squeeze(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: squeeze
+    QuantizedCPU, QuantizedCUDA: squeeze_quantized
+    NestedTensorCPU, NestedTensorCUDA: squeeze_nested
+
+- func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: squeeze
+    QuantizedCPU, QuantizedCUDA: squeeze_quantized
+    NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
+  tags: core
+
+- func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+
+- func: squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: squeeze
+    QuantizedCPU, QuantizedCUDA: squeeze_quantized
+    NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
+  tags: core
+
+- func: squeeze_(Tensor(a!) self) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: squeeze_
+
+- func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: squeeze_
+
+- func: squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: squeeze_
+
+- func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+
+- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  variants: function, method
+
+- func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: _sspaddmm_out_only_sparse
+    CUDA: _sspaddmm_out_only_sparse_cuda
+    SparseCPU: _sspaddmm_out_cpu
+    SparseCUDA: _sspaddmm_out_cuda
+
+- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _chunk_cat
+
+- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: _chunk_cat_out
+
+- func: stack(Tensor[] tensors, int dim=0) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: stack
+
+- func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: stack_out
+
+- func: _stack(Tensor[] tensors, int dim=0) -> Tensor
+  dispatch: # match the backends supported by _cat
+    CPU: _stack_cpu
+    CompositeExplicitAutograd: _stack
+
+- func: _stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch: # match the backends supported by _cat_out
+    CPU: _stack_out_cpu
+    CompositeExplicitAutograd: _stack_out
+
+- func: hstack(Tensor[] tensors) -> Tensor
+
+- func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: vstack(Tensor[] tensors) -> Tensor
+
+- func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: dstack(Tensor[] tensors) -> Tensor
+
+- func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+
+# Overload without center & pad mode, needed for forward-compatibility
+- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+  variants: function, method
+  cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized']
+
+- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+  variants: function, method
+
+- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
+  variants: function, method
+
+- func: stride.int(Tensor self, int dim) -> int
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: stride.Dimname(Tensor self, Dimname dim) -> int
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: sym_stride.int(Tensor self, int dim) -> SymInt
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  tags: core
+  manual_cpp_binding: True
+
+- func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: sum
+    SparseCPU, SparseCUDA: sum_coo
+    SparseCsrCPU, SparseCsrCUDA: sum_csr
+  autogen: sum.out
+
+- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
+  structured_delegate: sum.IntList_out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    NestedTensorCPU: NestedTensor_sum_dim_CPU
+    SparseCPU, SparseCUDA: sum_sparse_coo
+    SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed
+  tags: core
+
+- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: sum_out
+    MPS: sum_out_mps
+
+- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+# TODO: this function will be replaced once nested expand semantics have been settled on
+- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
+  dispatch:
+    NestedTensorCPU: _nested_sum_backward_cpu
+
+- func: nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU, CUDA: nansum
+    MPS: nansum_mps
+
+- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: nansum_out
+    MPS: nansum_out_mps
+
+- func: sum_to_size(Tensor self, SymInt[] size) -> Tensor
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: sum_to_size_symint
+
+- func: sqrt(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sqrt.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sqrt_sparse
+    SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr
+  tags: [core, pointwise]
+
+- func: sqrt_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sqrt.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sqrt_sparse_
+    SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_
+  tags: pointwise
+
+- func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sqrt_out
+    MPS: sqrt_out_mps
+    SparseCPU, SparseCUDA: sqrt_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out
+  tags: pointwise
+
+- func: square(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: pointwise
+
+- func: square_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: pointwise
+
+- func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
+
+- func: std(Tensor self, bool unbiased=True) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ["unbiased"]
+
+- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ["unbiased"]
+
+- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: std
+    MPS: std_mps
+    QuantizedCPU: std_quantized_cpu
+
+- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  cpp_no_default_args: ["unbiased"]
+
+- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  cpp_no_default_args: ["unbiased"]
+
+- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CPU, CUDA: std_mean
+    MPS: std_mean_mps
+  autogen: std_mean.correction_out
+
+- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  cpp_no_default_args: ["unbiased"]
+
+- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
+
+- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: std_out
+    QuantizedCPU: std_out_quantized_cpu
+
+- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ["unbiased"]
+
+- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
+
+- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: prod
+    MPS: prod_mps
+  autogen: prod.out
+  tags: core
+
+- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  structured_delegate: prod.int_out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: core
+
+- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: prod_out
+    MPS: prod_out_mps
+
+- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: t(Tensor(a) self) -> Tensor(a)
+  device_check: NoCheck
+  device_guard: False
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: t
+
+- func: t_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck
+  device_guard: False
+  variants: method
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: t_
+
+- func: tan(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: tan.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: tan_sparse
+    SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr
+  tags: [core, pointwise]
+
+- func: tan_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: tan.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: tan_sparse_
+    SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_
+  tags: pointwise
+
+- func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: tan_out
+    MPS: tan_out_mps
+    SparseCPU, SparseCUDA: tan_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out
+  tags: pointwise
+
+- func: tanh(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: tanh.out
+  variants: function, method
+  dispatch:
+    QuantizedCPU: tanh_quantized_cpu
+    MkldnnCPU: mkldnn_tanh
+    SparseCPU, SparseCUDA: tanh_sparse
+    SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
+  tags: [core, pointwise]
+
+- func: tanh_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: tanh.out
+  variants: function, method
+  dispatch:
+    MkldnnCPU: mkldnn_tanh_
+    SparseCPU, SparseCUDA: tanh_sparse_
+    SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_
+  tags: pointwise
+
+- func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: tanh_out
+    MPS: tanh_out_mps
+    SparseCPU, SparseCUDA: tanh_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out
+  tags: pointwise
+
+- func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor
+  variants: function
+
+- func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+
+# TODO: namespace threshold in 'nn'
+- func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  structured_delegate: threshold.out
+  dispatch:
+    QuantizedCPU: threshold_quantized_cpu
+
+- func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  structured_delegate: threshold.out
+
+- func: threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: threshold_out
+    MPS: threshold_out_mps
+
+- func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: threshold_backward_out
+    MPS: threshold_backward_out_mps
+    SparseCPU, SparseCUDA: threshold_backward_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed_out
+
+- func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
+  variants: function
+  structured_delegate: threshold_backward.grad_input
+  dispatch:
+    MkldnnCPU: mkldnn_relu_backward
+    SparseCPU, SparseCUDA: threshold_backward_sparse
+    SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed
+    NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
+  tags: pointwise
+
+- func: tile(Tensor self, SymInt[] dims) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tile_symint
+
+- func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: transpose
+    NestedTensorCPU, NestedTensorCUDA: transpose_nested
+
+- func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    MkldnnCPU: mkldnn_transpose
+
+- func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: transpose_
+
+- func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    MkldnnCPU: mkldnn_transpose_
+  autogen: _mkldnn_transpose.out
+
+- func: one_hot(Tensor self, int num_classes=-1) -> Tensor
+  python_module: nn
+  variants: function
+  tags: dynamic_output_shape
+
+- func: flip(Tensor self, int[] dims) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU, QuantizedCPU, CUDA, QuantizedCUDA: flip
+    MPS: flip_mps
+  autogen: flip.out
+  tags: core
+
+- func: fliplr(Tensor self) -> Tensor
+  variants: function, method
+
+- func: flipud(Tensor self) -> Tensor
+  variants: function, method
+
+- func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU, MPS: roll
+    CUDA: roll_cuda
+  autogen: roll.out
+
+# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args
+
+- func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: rot90
+  autogen: rot90.out
+
+- func: trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
+
+- func: trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor
+
+- func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
+
+- func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor
+
+# Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads).
+- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu
+    CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda
+  autogen: _transform_bias_rescale_qkv.out
+
+- func: _nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor
+  dispatch:
+    CPU, CUDA: NestedTensor_nested_tensor_from_mask
+  autogen: _nested_tensor_from_mask.out
+
+- func: _nested_tensor_from_mask_left_aligned(Tensor t, Tensor mask) -> bool
+  dispatch:
+    CPU, CUDA: NestedTensor_nested_tensor_from_mask_left_aligned
+
+- func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor
+  device_check: NoCheck # cpu_nested_shape_example will always be on CPU
+  dispatch:
+    CPU: nested_from_padded_generic
+    CUDA: nested_from_padded_cuda
+  autogen: _nested_from_padded.out
+
+# These private functions are temporary. They will be updated/deleted when nested tensors switch to using SymInts for their metadata representation
+- func: _nested_tensor_size(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size
+  autogen: _nested_tensor_size.out
+
+- func: _nested_tensor_strides(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
+  autogen: _nested_tensor_strides.out
+
+- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets
+  autogen: _nested_tensor_storage_offsets.out
+
+# _nested_from_padded is not usable from Python, so
+# _nested_from_padded_and_nested_example is available for testing.
+- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
+  autogen: _nested_from_padded_and_nested_example.out
+
+# The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation
+# this will need to be updated
+- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)
+  variants: function
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: _nested_view_from_buffer
+
+- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor
+  variants: function
+  device_check: NoCheck
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
+  autogen: _nested_view_from_buffer_copy.out
+
+- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor
+  variants: function
+  device_check: NoCheck
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy
+  autogen: _nested_view_from_jagged_copy.out
+
+- func: _nested_get_values(Tensor(a) self) -> Tensor(a)
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_get_values_copy(Tensor self) -> Tensor
+  variants: function
+  device_check: NoCheck
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _nested_get_values_copy
+  autogen: _nested_get_values_copy.out
+
+- func: _nested_get_offsets(Tensor self) -> Tensor
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+# returns undefined Tensor if no lengths present
+- func: _nested_get_lengths(Tensor self) -> Tensor
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_get_ragged_idx(Tensor self) -> int
+  variants: function
+  device_check: NoCheck
+  dispatch: {}
+
+- func: _nested_get_jagged_dummy(Tensor any) -> Tensor
+  category_override: dummy
+  dispatch: {}
+
+- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
+  dispatch:
+    # calls unsqueeze
+    CompositeExplicitAutogradNonFunctional: _trilinear
+  autogen: _trilinear.out
+
+- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor
+
+- func: trunc(Tensor self) -> Tensor
+  structured_delegate: trunc.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: trunc_sparse
+    SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr
+  tags: [core, pointwise]
+
+- func: trunc_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: trunc.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: trunc_sparse_
+    SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_
+  tags: pointwise
+
+- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: trunc_out
+    MPS: trunc_out_mps
+    SparseCPU, SparseCUDA: trunc_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out
+  tags: pointwise
+# Alias for trunc
+
+- func: fix(Tensor self) -> Tensor
+  variants: function, method
+
+- func: fix_(Tensor(a!) self) -> Tensor(a!)
+  variants: function, method
+
+- func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: type_as(Tensor self, Tensor other) -> Tensor
+  variants: method
+
+- func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool
+  variants: function
+
+- func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: _unique_cpu
+    CUDA: _unique_cuda
+  autogen: _unique.out
+
+- func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: unique_dim_cpu
+    CUDA: unique_dim_cuda
+  tags: dynamic_output_shape
+  autogen: unique_dim.out
+
+- func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: unique_consecutive_cpu
+    CUDA: unique_consecutive_cuda
+    MPS: unique_consecutive_mps
+  tags: dynamic_output_shape
+  autogen: unique_consecutive.out
+
+- func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: unique_dim_consecutive_cpu
+    CUDA: unique_dim_consecutive_cuda
+    MPS: unique_dim_consecutive_mps
+  tags: dynamic_output_shape
+  autogen: unique_dim_consecutive.out
+
+# _unique and _unique_dim are fragile and modifying them easily cause internal break
+# the below operator is a temporary hack for adding return_counts support
+# Please don't rely on these two operators, they will be removed soon
+
+- func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: _unique2_cpu
+    CUDA: _unique2_cuda
+    MPS: _unique2_mps
+  tags: dynamic_output_shape
+  autogen: _unique2.out
+
+- func: _unsafe_view(Tensor self, SymInt[] size) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _unsafe_view
+  autogen: _unsafe_view.out
+
+- func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: unsqueeze
+    SparseCPU, SparseCUDA: unsqueeze_sparse
+    QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
+    NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested
+  tags: core
+
+- func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+  dispatch:
+    CompositeExplicitAutograd: unsqueeze_
+
+- func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor
+
+- func: var(Tensor self, bool unbiased=True) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ["unbiased"]
+
+- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  tags: core
+  cpp_no_default_args: ["unbiased"]
+
+- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA: var
+    MPS: var_mps
+  tags: core
+
+- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
+
+- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: var_out
+
+- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  cpp_no_default_args: ["unbiased"]
+
+- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
+
+- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  cpp_no_default_args: ["unbiased"]
+
+- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  cpp_no_default_args: ["unbiased"]
+
+- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CPU, CUDA: var_mean
+    MPS: var_mean_mps
+  autogen: var_mean.correction_out
+
+- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  cpp_no_default_args: ["unbiased"]
+
+- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+
+- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CPU, CUDA, MPS: where
+  tags: [core, pointwise]
+
+- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA, MPS: where_self_out
+
+- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
+  variants: function
+
+- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor
+  variants: function, method
+
+- func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor
+  variants: function
+
+- func: where(Tensor condition) -> Tensor[]
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
+  variants: function
+
+# VariableType::_weight_norm does not want to be given a gap in the autograd graph,
+# so we don't define "dispatch" variants for it.
+- func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
+  variants: function
+
+- func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: weight_norm_cpu
+    CUDA: weight_norm_cuda
+    MPS: weight_norm_mps
+  autogen: _weight_norm_interface.out
+
+- func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: weight_norm_backward_cpu
+    CUDA: weight_norm_backward_cuda
+    MPS: weight_norm_backward_mps
+  autogen: _weight_norm_interface_backward.out
+
+- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
+  variants: function
+
+- func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: zeros
+  autogen: zeros.names_out
+
+- func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CPU: _efficientzerotensor
+    CUDA: _efficientzerotensor_cuda
+    MPS: _efficientzerotensor_mps
+    Meta: _efficientzerotensor_meta
+  autogen: _efficientzerotensor.out
+
+- func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: zeros_symint
+
+- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: zeros_out
+    SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out
+
+- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+  dispatch:
+    # NB: Although this composite mutates on the inside, it is
+    # non-differentiable so NonFunctional doesn't apply
+    CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like
+  autogen: zeros_like.out
+
+- func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _standard_gamma_grad_cpu
+    CUDA: _standard_gamma_grad_cuda
+  autogen: _standard_gamma_grad.out
+
+- func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _s_gamma_cpu
+    CUDA: _s_gamma_cuda
+  tags: nondeterministic_seeded
+  autogen: _standard_gamma.out
+
+- func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor
+  dispatch:
+    CPU: _dirichlet_grad_cpu
+    CUDA: _dirichlet_grad_cuda
+  autogen: _dirichlet_grad.out
+
+- func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor
+  tags: nondeterministic_seeded
+  variants: function
+  dispatch:
+    CPU: _s_dirichlet_cpu
+    CUDA: _s_dirichlet_cuda
+  autogen: _sample_dirichlet.out
+
+- func: poisson(Tensor self, Generator? generator=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU: _s_poisson_cpu
+    CUDA: _s_poisson_cuda
+  tags: nondeterministic_seeded
+  autogen: poisson.out
+
+- func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU: _s_binomial_cpu
+    CUDA: _s_binomial_cuda
+  tags: nondeterministic_seeded
+  autogen: binomial.out
+
+# When more variants get ported to native, this dispatch will get more
+# complicated
+
+- func: native_norm(Tensor self, Scalar p=2) -> Tensor
+  dispatch:
+    SparseCPU, SparseCUDA: norm_sparse
+  autogen: native_norm.out
+
+- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor
+  dispatch:
+    SparseCPU, SparseCUDA: norm_sparse
+  autogen: native_norm.ScalarOpt_dim_dtype_out
+
+# TODO: reduce signatures down to one when optional args is available
+- func: _sparse_sum(Tensor self) -> Tensor
+
+- func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor
+
+- func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: _sparse_sum
+  autogen: _sparse_sum.dim_out
+
+- func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor
+
+- func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor
+  dispatch:
+    SparseCPU: _sparse_sum_backward_cpu
+    SparseCUDA: _sparse_sum_backward_cuda
+  autogen: _sparse_sum_backward.out
+
+- func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  dispatch:
+    SparseCsrCPU: _sparse_csr_sum_cpu
+    SparseCsrCUDA: _sparse_csr_sum_cuda
+  autogen: _sparse_csr_sum.dim_dtype_out
+
+- func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  dispatch:
+    SparseCsrCPU: _sparse_csr_prod_cpu
+    SparseCsrCUDA: _sparse_csr_prod_cuda
+  autogen: _sparse_csr_prod.dim_dtype_out
+
+- func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  python_module: sparse
+  variants: function
+
+- func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+  python_module: sparse
+  variants: function
+
+- func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+  python_module: sparse
+  dispatch:
+    SparseCPU: softmax_sparse_cpu
+    SparseCUDA: softmax_sparse_cuda
+  autogen: _sparse_softmax.out
+
+- func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
+  dispatch:
+    SparseCPU: softmax_backward_sparse_cpu
+    SparseCUDA: softmax_backward_sparse_cuda
+  autogen: _sparse_softmax_backward_data.out
+
+- func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  python_module: sparse
+  variants: function
+
+- func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+  python_module: sparse
+  variants: function
+
+- func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+  python_module: sparse
+  dispatch:
+    SparseCPU: log_softmax_sparse_cpu
+    SparseCUDA: log_softmax_sparse_cuda
+  autogen: _sparse_log_softmax.out
+
+- func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
+  dispatch:
+    SparseCPU: log_softmax_backward_sparse_cpu
+    SparseCUDA: log_softmax_backward_sparse_cuda
+  autogen: _sparse_log_softmax_backward_data.out
+
+- func: _spdiags(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None) -> Tensor
+  python_module: sparse
+  dispatch:
+    CPU: spdiags
+  autogen: _spdiags.out
+
+- func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: norm
+  autogen: norm.ScalarOpt_dtype_out
+
+- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: norm
+  autogen: norm.Scalar_out
+
+- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
+  structured_delegate: norm.dtype_out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_dtype_norm
+
+- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
+  structured_delegate: norm.out
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_norm
+
+- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: norm_dtype_out
+    MPS: norm_dtype_out_mps
+
+- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: norm_out
+    MPS: norm_out_mps
+
+# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
+- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: frexp
+  tags: pointwise
+
+- func: frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)
+  dispatch:
+    CPU, CUDA: frexp_out
+  tags: pointwise
+
+# Deprecated (v.1.12)
+- func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+  variants: function
+
+# Deprecated (v.1.12)
+- func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+
+# Deprecated (v.1.12)
+- func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor
+  variants: function
+
+# Deprecated (v.1.12)
+- func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+
+# Deprecated (v.1.12)
+- func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor
+  variants: function
+
+# Deprecated (v.1.12)
+- func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+
+- func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: clone
+    SparseCPU, SparseCUDA: clone_sparse
+    SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed
+    MkldnnCPU: mkldnn_clone
+    QuantizedCPU, QuantizedCUDA: quantized_clone
+    NestedTensorCPU, NestedTensorCUDA: clone_nested
+  autogen: clone.out
+  tags: [core, pointwise]
+
+- func: positive(Tensor(a) self) -> Tensor(a)
+  variants: function, method
+  tags: pointwise
+
+- func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: resize_as_
+  autogen: resize_as, resize_as.out
+  tags: inplace_view
+
+- func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: resize_as_sparse_
+    SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_compressed_
+  autogen: resize_as_sparse, resize_as_sparse.out
+
+- func: zero_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: zero_
+    MPS: zero_mps_
+    Meta: zero_meta_
+    SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
+    MkldnnCPU: mkldnn_zero_
+    NestedTensorCPU, NestedTensorCUDA: zero_nested_
+  autogen: zero, zero.out
+
+- func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sub_out
+    MPS: sub_out_mps
+    SparseCPU, SparseCUDA: sub_out_sparse
+  tags: pointwise
+
+- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: sub.out
+  dispatch:
+    SparseCPU, SparseCUDA: sub_sparse
+    ZeroTensor: sub_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
+  tags: [core, pointwise]
+
+- func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: sub.out
+  dispatch:
+    SparseCPU, SparseCUDA: sub_sparse_
+  tags: pointwise
+# For C++ only, until we have conversion from C++ numbers to Tensor
+
+- func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: sub
+  tags: [core, pointwise]
+
+- func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: sub_
+  autogen: sub.Scalar_out
+  tags: pointwise
+# subtract, alias for sub
+
+- func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+
+- func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+  variants: function, method
+
+- func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+  variants: method
+
+# For C++ only, until we have conversion from C++ numbers to Tensor
+- func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+  variants: function, method
+
+- func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+  variants: method
+
+- func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CPU, CUDA: rsub
+  autogen: rsub.Tensor_out
+
+- func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: heaviside_out
+  tags: pointwise
+
+- func: heaviside(Tensor self, Tensor values) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: heaviside.out
+  tags: pointwise
+
+- func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: heaviside.out
+
+# For C++ only, until we have conversion from C++ numbers to Tensor
+- func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: rsub
+  autogen: rsub.Scalar_out
+
+# Functionally the same as addmm, but we give it a different derivative formula
+# that doesn't propagate gradients to non-present entries on sparse.
+  tags: pointwise
+- func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  python_module: sparse
+  dispatch:
+    CompositeExplicitAutograd: _sparse_addmm
+  autogen: _sparse_addmm.out
+
+- func: sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  python_module: sparse
+  dispatch:
+    SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda
+    SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu
+
+- func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  python_module: sparse
+  dispatch:
+    SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda
+    SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu
+
+- func: _sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)
+  python_module: sparse
+  dispatch:
+    SparseCsrCPU: _sparse_mm_reduce_impl_sparse_csr_cpu
+
+- func: _sparse_mm_reduce_impl_backward(Tensor self, Tensor grad_out, Tensor weight, str reduce, Tensor arg_out, bool[2] output_mask) -> (Tensor, Tensor)
+  python_module: sparse
+  dispatch:
+    SparseCsrCPU: _sparse_mm_reduce_impl_backward_sparse_csr_cpu
+
+- func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: addmm_out_cpu
+    CUDA: addmm_out_cuda
+    MPS: addmm_out_mps
+    SparseCPU: addmm_out_sparse_dense_cpu
+    SparseCUDA: addmm_out_sparse_dense_cuda
+    SparseCsrCPU: addmm_out_sparse_compressed_cpu
+    SparseCsrCUDA: addmm_out_sparse_compressed_cuda
+
+- func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  structured_delegate: addmm.out
+  variants: function, method
+  dispatch:
+    SparseCPU: addmm_sparse_dense_cpu
+    SparseCUDA: addmm_sparse_dense_cuda
+    SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense
+  tags: core
+
+- func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+  structured_delegate: addmm.out
+  variants: method
+  dispatch:
+    # Warning!  For whatever reason, the inplace sparse addmm is NON
+    # broadcasting
+    SparseCPU: s_addmm_sparse_dense_cpu_
+    SparseCUDA: s_addmm_sparse_dense_cuda_
+
+- func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: addmm_activation_out_cpu
+    CUDA: addmm_activation_out_cuda
+
+- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
+  structured_delegate: _addmm_activation.out
+  variants: function, method
+
+- func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _scaled_mm_cuda
+
+- func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
+  variants: function
+  dispatch:
+    CUDA: _scaled_mm_out_cuda
+
+# NOTE [ Sparse: autograd and API ]
+#
+#
+# Sparse Tensor Constructors
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The API entry points to sparse tensor construction should be
+# `sparse_coo tensor` and `_sparse_coo_tensor_unsafe`. Depending on whether the
+# indices and values tensors are given, they eventually dispatch to either
+# `sparse_coo_tensor_with_dims` or `sparse_coo_tensor_with_dims_and_tensors`.
+#
+# The autograd support for ctor is implement on `sparse_coo_tensor_with_dims_and_tensors`.
+#
+# The API methods `sparse_coo tensor` and `_sparse_coo_tensor_unsafe`
+# **must not** have specific type dispatches because otherwise codegen will
+# consider them as abstract methods (see Note [Abstract ATen methods]), dispatch
+# using **Tensor** type, and thus lose autograd tracking on the actual method
+# they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`.
+#
+#
+# Sparse Methods API Design
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Goals: 1. Flexible API for users to write custom sparse ops
+#        2. ctor and member accessor with autograd support
+#
+# To achieve 1, we need to provide a set of *dangerous* APIs (dangerous in the
+# sense that misusing them will break sparse tensor invariant and may out in
+# unexpected behavior, e.g., crash). These methods are all prefixed with
+# underscore "_" to indicate that they should be used with care. We provide:
+#
+#   + `_indices()`: returns the *raw* indices within the sparse tensor (not just
+#                   sharing storage). Any inplace operation will change the
+#                   actual indices, including t_, set_, as_strided_, resize_,
+#                   etc.
+#   + `_values()`: returns the *raw* values within the sparse tensor. Similar
+#                  semantics as `_indices()`
+#   + `_nnz()`: returns the number of non-zero entries. This will always be
+#               determined by the shapes of indices and values.
+#   + `_coalesced_(bool)`: inplace sets whether the tensor is coalesced, and
+#                          returns itself.
+#
+# These methods are very useful in writing new operations, e.g., a custom
+# autograd Function.
+#
+# We also provide other public *safe* APIs:
+#   + `indices()`: returns a **view** of the indices tensor if the sparse tensor
+#                  is **coalesced**.
+#   + `values()`: returns a **view** of the values tensor if the containing
+#                 sparse tensor is **coalesced**.
+#   + `sparse_dim()`: number of sparse dimensions
+#   + `dense_dim()`: number of dense dimensions
+#   + `is_coalesced()`: whether the sparse tensor is coalesced
+#
+# `_indices()` and `_values()` should returns the raw indices and values dense
+# tensors within a sparse tensor. They can be quite unsafe with inplace
+# operations like `t_()`, and exposes uncoalesced indices and values. The public
+# recommended API is `indices()` and `values()`, both of which first check that
+# the tensor is coalesced and return views on those tensors.
+#
+#
+# Autograd Support
+# ~~~~~~~~~~~~~~~~
+#
+# Autograd is supported on `values()` and sparse tensor ctor with indices and
+# values tensors. E.g., `torch.sparse_coo_tensor(i, v).values().sum()` is
+# differentiable w.r.t. `v`.
+#
+# NB: The `values()` and `_values()` operators are special in that they are
+# layout-aware, i.e., the output depends not just on the data it represents, but
+# also on the input layout details (in this case, the `indices` tensor). See
+# NOTE [ as_strided Backward and layout-aware/agnostic autograd ] in Functions.cpp
+# for discussion on layout-aware vs layout-agnostic autograd. Since PyTorch ops
+# operate in the layout-agnostic mode, similar to `as_strided`, backward of
+# these two operators need to consider them in a layout-agnostic way:
+#   + `values()`:
+#     Input is coalesced.
+#     We just pretend having `input.indices()` as an additional argument
+#     `input_indices`, then forward is similar to
+#     `input.to(kStrided).index_select(input_indices)` regardless of the layout.
+#     Note that `values()` normally is layout-aware even if we constrain
+#     ourselves on sparse inputs since it may include all zeros values entries
+#     as "present" entries.
+#   + `_values()`:
+#     Input may be uncoalesced.
+#     It is not straightforward to construct a layout-agnostic version because
+#     duplicate indices entries may exist and additional parameterization is
+#     needed to distribute the value into different values entries. Furthermore,
+#     this op is intended to provide ways to write custom sparse ops, rather
+#     than being used in autograd graph, so it is marked as *non-differentiable*
+#     in derivatives.yaml.
+#
+# Before reading the following, see NOTE [ Autograd Variable Views ] in
+# variable.h for details on views that are tracked by autograd, and views that
+# are not.
+#
+# Moreover, these methods return tensors that share storage with inputs, so we
+# mark these methods as view ops to support autograd history tracking.
+# The sparse tensor ctor output should technically be view of both input indices
+# and values tensors, but currently we only support setting as view of a single
+# Variable, so it is only view of the values tensor.
+# TODO: clone indices in sparse tensor ctor.
+#
+# For other methods that return outputs that share storage with inputs, i.e.,
+# `indices()` and `_indices()`. We mark their outputs as non-differentiable, so
+# the view relation is not tracked by autograd, but the version counter is still
+# shared. In other words, their outputs are non-differentiable views of the
+# sparse tensor.
+# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
+# the default would never make sense.
+
+- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: sparse_compressed_tensor
+
+- func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+
+- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: sparse_compressed_tensor
+- func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+
+- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _sparse_compressed_tensor_unsafe_symint
+
+- func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+
+- func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: sparse_coo_tensor
+  autogen: sparse_coo_tensor.size_out
+
+- func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+
+- func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+
+- func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint
+
+- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
+
+- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
+- func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
+- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
+- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
+- func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
+
+- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse
+  autogen: _sparse_coo_tensor_with_dims.out
+
+- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint
+  autogen: _sparse_coo_tensor_with_dims_and_tensors.out
+
+- func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: sparse_resize_
+  autogen: sparse_resize, sparse_resize.out
+
+- func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: sparse_resize_and_clear_
+  autogen: sparse_resize_and_clear, sparse_resize_and_clear.out
+
+- func: sparse_mask(Tensor self, Tensor mask) -> Tensor
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_mask
+    SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed
+  autogen: sparse_mask.out
+
+- func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_mask_projection
+  autogen: _sparse_mask_projection.out
+
+- func: _to_cpu(Tensor[] tensors) -> Tensor[]
+  variants: function
+
+- func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
+  variants: method
+
+# Special case of to_dense with custom derivative
+- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_to_dense
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense
+    MkldnnCPU: mkldnn_to_dense
+  autogen: _to_dense.out
+
+- func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor
+
+- func: sparse_dim(Tensor self) -> int
+  variants: method
+  dispatch:
+    CPU, CUDA: sparse_dim_strided
+    SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
+  device_check: NoCheck
+  device_guard: False
+
+# legacy method
+- func: _dimI(Tensor self) -> int
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: sparse_dim_sparse
+  device_check: NoCheck
+  device_guard: False
+
+- func: dense_dim(Tensor self) -> int
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_dim_strided
+    SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
+  device_check: NoCheck
+  device_guard: False
+
+# legacy method
+- func: _dimV(Tensor self) -> int
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
+  device_check: NoCheck
+  device_guard: False
+
+- func: _nnz(Tensor self) -> int
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr
+  device_check: NoCheck
+  device_guard: False
+
+# NOTE: [ coalesce autograd ]
+# coalesce returns self directly for already coalesced sparse tensors.
+# This means coalesce cannot have a derivative registered, otherwise it creates
+# circular references in the autograd graph (see gh-52874).
+# Instead, the derivative is registered on the slow-path "_coalesce"
+- func: coalesce(Tensor(a) self) -> Tensor(a)
+  variants: method
+
+- func: _coalesce(Tensor self) -> Tensor
+  dispatch:
+    SparseCPU: _coalesce_sparse_cpu
+    SparseCUDA: _coalesce_sparse_cuda
+  autogen: _coalesce.out
+
+- func: is_coalesced(Tensor self) -> bool
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: is_coalesced_sparse
+    CompositeExplicitAutograd: is_coalesced_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: _indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: _indices_sparse
+  device_check: NoCheck
+  device_guard: False
+
+- func: _values(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: _values_sparse
+  device_check: NoCheck
+  device_guard: False
+
+# This method doesn't do any check but only directly sets the flag. So it can be
+# a bit unsafe. Similar to _indices and _values, this is useful for implementing
+# custom sparse operations in Python/C++ extension.
+- func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!)
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: _coalesced_sparse_
+  device_check: NoCheck
+  device_guard: False
+  autogen: _coalesced, _coalesced.out
+
+- func: indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: indices_sparse
+    CompositeExplicitAutograd: indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: values(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMeta: values_sparse
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: values_nested
+    CompositeExplicitAutograd: values_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: crow_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr
+    CompositeExplicitAutograd: crow_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: col_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr
+    CompositeExplicitAutograd: col_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: ccol_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr
+    CompositeExplicitAutograd: ccol_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: row_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr
+    CompositeExplicitAutograd: row_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    SparseCPU: hspmm_out_sparse_cpu
+    SparseCUDA: hspmm_out_sparse_cuda
+
+- func: hspmm(Tensor mat1, Tensor mat2) -> Tensor
+  dispatch:
+    SparseCPU: hspmm_sparse_cpu
+    SparseCUDA: hspmm_sparse_cuda
+
+- func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+  device_check: NoCheck  # Allows copy into different device
+  variants: function
+  dispatch:
+    SparseCPU, SparseCUDA: copy_sparse_
+  autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
+
+# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
+- func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: unbind
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
+
+- func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
+  variants: function, method
+
+- func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+  variants: method
+
+# Special case of to_sparse.sparse_dim with custom derivative
+- func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_to_sparse
+    SparseCPU, SparseCUDA: sparse_coo_to_sparse
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
+  autogen: _to_sparse.sparse_dim_out
+
+- func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse with custom derivative
+- func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_to_sparse
+    SparseCPU, SparseCUDA: sparse_coo_to_sparse
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
+  autogen: _to_sparse.out
+
+- func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse_csr with custom derivative
+- func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_to_sparse_csr
+    SparseCPU, SparseCUDA: coo_to_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr
+  autogen: _to_sparse_csr.out
+
+- func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse_csc with custom derivative
+- func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_to_sparse_csc
+    SparseCPU, SparseCUDA: coo_to_sparse_csc
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc
+  autogen: _to_sparse_csc.out
+
+- func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse_bsr with custom derivative
+- func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_to_sparse_bsr
+    SparseCPU, SparseCUDA: coo_to_sparse_bsr
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr
+  autogen: _to_sparse_bsr.out
+
+- func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse_bsc with custom derivative
+- func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    CPU, CUDA: dense_to_sparse_bsc
+    SparseCPU, SparseCUDA: coo_to_sparse_bsc
+    SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc
+  autogen: _to_sparse_bsc.out
+
+- func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _to_sparse_semi_structured
+
+- func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
+  variants: method
+  dispatch:
+    CPU: dense_to_mkldnn
+  autogen: to_mkldnn.out
+
+- func: mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
+  variants: function
+  python_module: nn
+  dispatch:
+    MkldnnCPU: mkldnn_reorder_conv2d_weight
+  autogen: mkldnn_reorder_conv2d_weight.out
+
+- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+  variants: function
+  python_module: nn
+  dispatch:
+    MkldnnCPU: mkldnn_reorder_conv3d_weight
+  autogen: mkldnn_reorder_conv3d_weight.out
+
+- func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor
+
+- func: quantize_per_tensor_dynamic(Tensor self, ScalarType dtype, bool reduce_range) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: quantize_per_tensor_dynamic
+  autogen: quantize_per_tensor_dynamic.out
+
+- func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: quantize_per_tensor
+  autogen: quantize_per_tensor.out
+
+- func: quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: quantize_per_tensor_tensor_qparams
+  autogen: quantize_per_tensor.tensor_qparams_out
+
+- func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[]
+  variants: function
+  dispatch:
+    CPU: quantize_per_tensor_list_cpu
+  autogen: quantize_per_tensor.tensors_out
+
+- func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: quantize_per_channel
+  autogen: quantize_per_channel.out
+
+- func: dequantize.self(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    CPU, CUDA: dequantize_cpu_or_cuda
+    QuantizedCPU, QuantizedCUDA: dequantize_quantized
+  autogen: dequantize.self_out
+
+- func: dequantize.tensors(Tensor[] tensors) -> Tensor[]
+  variants: function
+  dispatch:
+    QuantizedCPU: dequantize_tensors_quantized_cpu
+  autogen: dequantize.tensors_out
+
+- func: q_scale(Tensor self) -> float
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: q_scale_quant
+
+- func: q_zero_point(Tensor self) -> int
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: q_zero_point_quant
+
+- func: q_per_channel_scales(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: q_per_channel_scales
+  autogen: q_per_channel_scales.out
+
+- func: q_per_channel_zero_points(Tensor self) -> Tensor
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points
+  autogen: q_per_channel_zero_points.out
+
+- func: q_per_channel_axis(Tensor self) -> int
+  variants: function, method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: q_per_channel_axis
+
+- func: int_repr(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    QuantizedCPU: int_repr_quantized_cpu
+    QuantizedCUDA: int_repr_quantized_cuda
+  autogen: int_repr.out
+
+- func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor
+  dispatch:
+    CPU: make_per_tensor_quantized_tensor_cpu
+    CUDA: make_per_tensor_quantized_tensor_cuda
+  autogen: _make_per_tensor_quantized_tensor.out
+
+- func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor
+  dispatch:
+    CPU: make_per_channel_quantized_tensor_cpu
+    CUDA: make_per_channel_quantized_tensor_cuda
+  autogen: _make_per_channel_quantized_tensor.out
+
+- func: qscheme(Tensor self) -> QScheme
+  variants: method
+  dispatch:
+    QuantizedCPU, QuantizedCUDA: qscheme_quant
+
+- func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+  variants: function
+  dispatch:
+    CPU, CUDA: fake_quantize_per_tensor_affine_cachemask
+  autogen: fake_quantize_per_tensor_affine_cachemask.out
+
+- func: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+  variants: function
+  dispatch:
+    CPU, CUDA: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams
+  autogen: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out
+
+- func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor
+  variants: function
+
+- func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: _fake_quantize_learnable_per_tensor_affine
+  autogen: _fake_quantize_learnable_per_tensor_affine.out
+
+- func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU, CUDA: _fake_quantize_learnable_per_tensor_affine_backward
+
+- func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+
+- func: fake_quantize_per_channel_affine_cachemask(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+  variants: function
+  dispatch:
+    CPU, CUDA: fake_quantize_per_channel_affine_cachemask
+  autogen: fake_quantize_per_channel_affine_cachemask.out
+
+- func: fake_quantize_per_channel_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor
+  variants: function
+
+- func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: _fake_quantize_learnable_per_channel_affine
+  autogen: _fake_quantize_learnable_per_channel_affine.out
+
+- func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU, CUDA: _fake_quantize_learnable_per_channel_affine_backward
+
+- func: fused_moving_avg_obs_fake_quant(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> Tensor
+  variants: function
+
+- func: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask)
+  dispatch:
+    CPU: fused_moving_avg_obs_fake_quant_cpu
+    CUDA: fused_moving_avg_obs_fake_quant_cuda
+  autogen: _fused_moving_avg_obs_fq_helper_functional, _fused_moving_avg_obs_fq_helper.out
+
+- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int)
+  variants: function
+
+- func: _saturate_weight_to_fp16(Tensor weight) -> Tensor
+  variants: function
+
+- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor)
+  variants: function
+
+- func: _autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)
+  variants: method
+  device_guard: False
+
+- func: _autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)
+  variants: method
+  device_guard: False
+
+- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: _to_copy
+    NestedTensorCPU, NestedTensorCUDA: _to_copy_nested
+  autogen: _to_copy.out
+  tags: core
+
+# to(Device) must not exist because all constructors of Device also works for
+# TensorOptions. Otherwise, an ambiguity error is thrown.
+# See NOTE [ TensorOptions Constructors ].
+- func: to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+
+- func: to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+
+- func: to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+
+- func: to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+
+- func: meshgrid(Tensor[] tensors) -> Tensor[]
+
+# TODO: Two weeks after this lands, combine these two overloads,
+#       making "indexing" optional. These are temporarily distinct for
+#       forward-compatibility reasons.
+- func: meshgrid.indexing(Tensor[] tensors, *, str indexing) -> Tensor[]
+
+- func: cartesian_prod(Tensor[] tensors) -> Tensor
+  variants: function
+
+- func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor
+  variants: function
+
+- func: item(Tensor self) -> Scalar
+  tags: data_dependent_output
+  variants: method
+
+- func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType
+  variants: function
+
+- func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType
+  variants: function
+
+- func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType
+  variants: function
+
+- func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType
+
+- func: can_cast(ScalarType from, ScalarType to) -> bool
+  variants: function
+
+- func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
+  variants: function
+
+# NB: Does NOT check precondition that numel == 1
+- func: _local_scalar_dense(Tensor self) -> Scalar
+  tags: [core, data_dependent_output]
+  dispatch:
+    CPU: _local_scalar_dense_cpu
+    CUDA: _local_scalar_dense_cuda
+    MPS: _local_scalar_dense_mps
+  variants: function
+
+# MPS LSTM implementation
+
+- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    MPS: _lstm_mps
+  autogen: _lstm_mps.out
+  tags: nondeterministic_seeded
+
+- func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
+  dispatch:
+    MPS: lstm_mps_backward
+  autogen: lstm_mps_backward.out
+
+
+# Fused RNN kernels
+- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: _thnn_fused_lstm_cell_cuda
+  autogen: _thnn_fused_lstm_cell.out
+
+# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs
+#     It is necessary to avoid triggering TensorImpl use count checks in debug mode
+# NB: this is function is NOT differentiable
+- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: _thnn_fused_lstm_cell_backward_impl_cuda
+  autogen: _thnn_fused_lstm_cell_backward_impl.out
+
+- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+
+- func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+
+- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: _thnn_fused_gru_cell_cuda
+  autogen: _thnn_fused_gru_cell.out
+
+- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: _thnn_fused_gru_cell_backward_cuda
+  autogen: _thnn_fused_gru_cell_backward.out
+
+- func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+
+# RNN cells and layers
+- func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+  tags: nondeterministic_seeded
+
+- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)
+
+- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
+
+- func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
+
+- func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
+
+# Quantized RNN layer registration has been moved to C10 dispatch in `RNN.cpp`
+
+# Quantized RNN layers
+# - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor)
+
+
+# - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor)
+
+
+# Quantized GRU layers
+
+# - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+#
+
+# - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+#
+
+# Quantized RNN cells
+- func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor)
+
+- func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
+
+- func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
+
+- func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
+
+# PackedSequence utilities
+- func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)
+  dispatch:
+    CompositeExplicitAutograd: _pack_padded_sequence
+  autogen: _pack_padded_sequence.out
+
+- func: _pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: _pack_padded_sequence_backward_symint
+
+- func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor)
+
+# wrappers for legacy TH methods
+
+- func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU, CUDA, Meta, MPS: set_
+  autogen: set.source_Storage, set.source_Storage_out
+  tags: inplace_view
+
+- func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU: set_storage_cpu_
+    Meta: set_storage_meta__symint
+    CUDA: set_storage_cuda_
+    MPS: set_storage_mps_
+    QuantizedCPU, QuantizedCUDA: set_storage_quantized_
+  autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out
+  tags: inplace_view
+
+- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: set__symint
+  tags: inplace_view
+
+- func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU, CUDA, Meta, MPS: set_tensor_
+  autogen: set.source_Tensor, set.source_Tensor_out
+  tags: inplace_view
+
+- func: set_(Tensor(a!) self) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU: set_cpu_
+    CUDA: set_cuda_
+    Meta: set_meta_
+    MPS: set_mps_
+  autogen: set, set.out
+  tags: inplace_view
+
+# Not making it CompositeImplicitAutograd because lift
+# should be a primitive w.r.t. functorch
+
+# TODO: this should have a view annotation
+# TODO: shouldn't be a method
+- func: lift(Tensor self) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: lift
+  autogen: lift.out
+
+# lift_fresh is called with an argument that is guaranteed to be
+# fresh (i.e., newly allocated).  This is ONLY called from a
+# torch.tensor call; if you FX trace a lift_fresh, you are obligated
+# to convert this into a lift_fresh_copy (because FX will violate the
+# freshness invariant when tracing).
+- func: lift_fresh(Tensor(a) self) -> Tensor(a)
+  dispatch:
+    CompositeExplicitAutograd: lift_fresh
+
+# Like lift, but it clones the input.
+- func: lift_fresh_copy(Tensor self) -> Tensor
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: lift_fresh_copy
+  autogen: lift_fresh_copy.out
+
+- func: is_set_to(Tensor self, Tensor tensor) -> bool
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU, CUDA, MPS: is_set_to
+
+- func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU: masked_fill__cpu
+    CUDA: masked_fill__cuda
+    QuantizedCPU: masked_fill__quantized_cpu
+    QuantizedCUDA: masked_fill__quantized_cuda
+    MPS: masked_fill__mps
+  autogen: masked_fill.Scalar_out
+
+- func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: masked_fill
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
+  tags: pointwise
+
+- func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU: masked_fill__cpu
+    CUDA: masked_fill__cuda
+    QuantizedCPU: masked_fill__quantized_cpu
+    QuantizedCUDA: masked_fill__quantized_cuda
+    MPS: masked_fill__mps
+  autogen: masked_fill.Tensor_out
+
+- func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: masked_fill
+
+- func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU: masked_scatter__cpu
+    CUDA: masked_scatter__cuda
+    MPS: masked_scatter__mps
+  autogen: masked_scatter.out
+
+- func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: masked_scatter
+
+- func: masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: masked_scatter_backward_symint
+
+- func: _masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor
+  dispatch:
+    CUDA: masked_softmax_cuda
+    CPU: masked_softmax_cpu
+  autogen: _masked_softmax.out
+
+- func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor
+  dispatch:
+    CUDA: masked_softmax_backward_cuda
+    CPU: masked_softmax_backward_cpu
+  autogen: _masked_softmax_backward.out
+
+- func: view(Tensor(a) self, SymInt[] size) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
+    MkldnnCPU: mkldnn_view
+    NestedTensorCPU, NestedTensorCUDA: view_nested
+  tags: core
+
+# Warning: If you want to change the name or overload name of this
+# operator, you might also want to change the `isBlockListedSchema`
+# function in `torch/csrc/jit/frontend/schema_catching.cpp`.
+# The name and overload name of this operator is hardcoded in that
+# function in order to workaround a bug:
+# https://github.com/pytorch/pytorch/issues/47964
+- func: view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: view_dtype
+
+- func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU, CUDA: put_
+  autogen: put.out
+
+- func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: put
+
+- func: index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  precomputed:
+  - dim -> int dim
+  dispatch:
+    CPU: index_add_cpu_out
+    CUDA: index_add_cuda_out
+    MPS: index_add_mps_out
+
+- func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!)
+  structured_delegate: index_add.out
+  variants: method
+
+- func: index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
+  structured_delegate: index_add.out
+  variants: function, method
+
+- func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
+  variants: function, method
+
+- func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  precomputed:
+  - dim -> int dim
+  dispatch:
+    CPU: index_reduce_cpu_out
+    CUDA: index_reduce_cuda_out
+
+- func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)
+  structured_delegate: index_reduce.out
+  variants: method
+
+- func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor
+  structured_delegate: index_reduce.out
+  variants: function, method
+
+- func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU: index_fill_
+    CUDA: index_fill_
+    MPS: index_fill_mps_
+  autogen: index_fill.int_Scalar_out
+
+- func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: index_fill
+
+- func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU, CUDA: index_fill_
+    MPS: index_fill_mps_
+  autogen: index_fill.int_Tensor_out
+
+- func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: index_fill
+
+- func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+
+- func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
+  structured_delegate: scatter.src_out
+  variants: function, method
+  tags: core
+
+- func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
+  structured_delegate: scatter.src_out
+  variants: method
+
+- func: scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU, CUDA: scatter_src_out
+    MPS: scatter_src_out_mps
+
+- func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+  structured_delegate: scatter.value_out
+  variants: function, method
+  tags: core
+
+- func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
+  structured_delegate: scatter.value_out
+  variants: method
+
+- func: scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU, CUDA: scatter_value_out
+    MPS: scatter_value_out_mps
+
+- func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor
+  structured_delegate: scatter.reduce_out
+  variants: function, method
+
+- func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!)
+  structured_delegate: scatter.reduce_out
+  variants: method
+
+- func: scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU, CUDA: scatter_reduce_out
+    MPS: scatter_reduce_out_mps
+
+- func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor
+  structured_delegate: scatter.value_reduce_out
+  variants: function, method
+
+- func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!)
+  structured_delegate: scatter.value_reduce_out
+  variants: method
+
+- func: scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU, CUDA: scatter_value_reduce_out
+    MPS: scatter_value_reduce_out_mps
+
+- func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
+  variants: function, method
+
+- func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+  variants: function, method
+
+- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
+  structured_delegate: scatter_add.out
+  variants: function, method
+  tags: core
+
+- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
+  structured_delegate: scatter_add.out
+  variants: method
+
+- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU, CUDA: scatter_add
+    MPS: scatter_add_mps_out
+
+- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
+  variants: function, method
+
+- func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
+  structured_delegate: scatter_reduce.two_out
+  variants: function, method
+  tags: core
+
+- func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)
+  structured_delegate: scatter_reduce.two_out
+  variants: method
+
+- func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  dispatch:
+    CPU, CUDA: scatter_reduce_two
+
+- func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  structured_delegate: eq.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: eq.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  variants: function
+  dispatch:
+    CPU, CUDA: bitwise_and_out
+    MPS: bitwise_and_out_mps
+  tags: pointwise
+
+- func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_and_out
+  tags: pointwise
+
+- func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_and
+  tags: [core, pointwise]
+
+- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_and
+  autogen: bitwise_and.Scalar_Tensor_out
+  tags: pointwise
+
+- func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  structured_delegate: bitwise_and.Tensor_out
+  tags: [core, pointwise]
+
+- func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: bitwise_and_
+  tags: pointwise
+
+- func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: bitwise_and.Tensor_out
+  tags: pointwise
+
+- func: __and__.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+
+- func: __and__.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+
+- func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  variants: function
+  dispatch:
+    CPU, CUDA: bitwise_or_out
+    MPS: bitwise_or_out_mps
+  tags: pointwise
+
+- func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_or_out
+  tags: pointwise
+
+- func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_or
+  tags: [core, pointwise]
+
+- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_or
+  autogen: bitwise_or.Scalar_Tensor_out
+  tags: pointwise
+
+- func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  structured_delegate: bitwise_or.Tensor_out
+  tags: [core, pointwise]
+
+- func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: bitwise_or_
+  tags: pointwise
+
+- func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: bitwise_or.Tensor_out
+  tags: pointwise
+
+- func: __or__.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+
+- func: __or__.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+
+- func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  variants: function
+  dispatch:
+    CPU, CUDA: bitwise_xor_out
+    MPS: bitwise_xor_out_mps
+  tags: pointwise
+
+- func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_xor_out
+  tags: pointwise
+
+- func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_xor
+  tags: [core, pointwise]
+
+- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_xor
+  autogen: bitwise_xor.Scalar_Tensor_out
+  tags: pointwise
+
+- func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  structured_delegate: bitwise_xor.Tensor_out
+  tags: [core, pointwise]
+
+- func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: bitwise_xor_
+  tags: pointwise
+
+- func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: bitwise_xor.Tensor_out
+  tags: pointwise
+
+- func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: pointwise
+
+- func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: pointwise
+
+- func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: __lshift__
+  tags: pointwise
+
+- func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: __lshift__
+  tags: pointwise
+
+- func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU, CUDA: __ilshift__
+  autogen: __lshift__.Scalar_out
+  tags: pointwise
+
+- func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU, CUDA: __ilshift__
+  autogen: __lshift__.Tensor_out
+  tags: pointwise
+
+- func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: bitwise_left_shift.Tensor_out
+  tags: pointwise
+
+- func: bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: bitwise_left_shift.Tensor_out
+  tags: pointwise
+
+- func: bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: bitwise_left_shift_out
+  tags: pointwise
+
+- func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_left_shift
+  tags: pointwise
+
+- func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: bitwise_left_shift_
+  tags: pointwise
+
+- func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_left_shift_out
+  tags: pointwise
+
+- func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_left_shift
+  autogen: bitwise_left_shift.Scalar_Tensor_out
+  tags: pointwise
+
+- func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: __rshift__
+  tags: pointwise
+
+- func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: __rshift__
+  tags: pointwise
+
+- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU, CUDA: __irshift__
+  autogen: __rshift__.Scalar_out
+
+- func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CPU, CUDA: __irshift__
+  autogen: __rshift__.Tensor_out
+
+- func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function, method
+  structured_delegate: bitwise_right_shift.Tensor_out
+  tags: pointwise
+
+- func: bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: bitwise_right_shift.Tensor_out
+  tags: pointwise
+
+- func: bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: bitwise_right_shift_out
+  tags: pointwise
+
+- func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_right_shift
+  tags: pointwise
+
+- func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: bitwise_right_shift_
+  tags: pointwise
+
+- func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_right_shift_out
+  tags: pointwise
+
+- func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: bitwise_right_shift
+  autogen: bitwise_right_shift.Scalar_Tensor_out
+  tags: pointwise
+
+- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+  structured_delegate: tril.out
+  variants: method
+
+- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+  structured_delegate: triu.out
+  variants: method
+
+- func: digamma_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: digamma.out
+  variants: method
+  tags: pointwise
+
+- func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: lerp.Scalar_out
+  tags: pointwise
+
+- func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: lerp.Tensor_out
+  tags: pointwise
+
+- func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU, CUDA: addbmm_
+    MPS: addbmm_mps_
+
+- func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: addbmm_out
+    MPS: addbmm_out_mps
+
+- func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, CUDA: addbmm
+    MPS: addbmm_mps
+
+- func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: random_
+    Meta: random_meta_
+    MPS: random_mps_
+  autogen: random.from, random.from_out
+
+- func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: random_
+    Meta: random_meta_
+    MPS: random_mps_
+  autogen: random.to, random.to_out
+
+- func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: random_
+    MPS: random_mps_
+    Meta: random_meta_
+  autogen: random, random.out
+
+- func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: uniform_
+    MPS: uniform_mps_
+    Meta: uniform_meta_
+  autogen: uniform, uniform.out
+
+- func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: cauchy_
+  autogen: cauchy, cauchy.out
+
+- func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: log_normal_
+  autogen: log_normal, log_normal.out
+
+- func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: exponential_
+    MPS: exponential_mps_
+  autogen: exponential, exponential.out
+
+- func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: geometric_
+
+  # wrappers for TH functions
+  autogen: geometric, geometric.out
+
+- func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: diag(Tensor self, int diagonal=0) -> Tensor
+  variants: method, function
+
+- func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
+  variants: method, function
+
+- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: triu_cpu
+    CUDA: triu_cuda
+    MPS: triu_mps_out
+
+- func: triu(Tensor self, int diagonal=0) -> Tensor
+  structured_delegate: triu.out
+  variants: method, function
+
+- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: tril_cpu
+    CUDA: tril_cuda
+    MPS: tril_mps_out
+
+- func: tril(Tensor self, int diagonal=0) -> Tensor
+  structured_delegate: tril.out
+  variants: method, function
+
+- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CPU: tril_indices_cpu
+    CUDA: tril_indices_cuda
+  autogen: tril_indices.out
+
+- func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CPU: triu_indices_cpu
+    CUDA: triu_indices_cuda
+  autogen: triu_indices.out
+
+- func: trace(Tensor self) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU: trace_cpu
+    CUDA: trace_cuda
+    MPS: trace_mps
+  autogen: trace.out
+
+- func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: trace_backward_symint
+
+- func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: ne_Scalar_out
+    MPS: ne_scalar_out_mps
+    QuantizedCPU: ne_out_quantized_cpu
+  tags: pointwise
+
+- func: ne.Scalar(Tensor self, Scalar other) -> Tensor
+  structured_delegate: ne.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: ne_quantized_cpu
+  tags: [core, pointwise]
+
+- func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: ne_Tensor_out
+    MPS: ne_tensor_out_mps
+    QuantizedCPU: ne_out_quantized_cpu
+  tags: pointwise
+
+- func: ne.Tensor(Tensor self, Tensor other) -> Tensor
+  structured_delegate: ne.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: ne_quantized_cpu
+  tags: [core, pointwise]
+
+- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  structured_delegate: ne.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: ne.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+# not_equal, alias for torch.ne
+- func: not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: method, function
+
+- func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+
+- func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: eq_Scalar_out
+    MPS: eq_scalar_out_mps
+    QuantizedCPU: eq_out_quantized_cpu
+  tags: pointwise
+
+- func: eq.Scalar(Tensor self, Scalar other) -> Tensor
+  structured_delegate: eq.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: eq_quantized_cpu
+    NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested
+  tags: [core, pointwise]
+
+- func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: eq_Tensor_out
+    MPS: eq_tensor_out_mps
+    QuantizedCPU: eq_out_quantized_cpu
+  tags: pointwise
+
+- func: eq.Tensor(Tensor self, Tensor other) -> Tensor
+  structured_delegate: eq.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: eq_quantized_cpu
+  tags: [core, pointwise]
+
+- func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: ge_Scalar_out
+    MPS: ge_scalar_out_mps
+    QuantizedCPU: ge_out_quantized_cpu
+  tags: pointwise
+
+- func: ge.Scalar(Tensor self, Scalar other) -> Tensor
+  structured_delegate: ge.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: ge_quantized_cpu
+    NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested
+  tags: [core, pointwise]
+
+- func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: ge_Tensor_out
+    MPS: ge_tensor_out_mps
+    QuantizedCPU: ge_out_quantized_cpu
+  tags: pointwise
+
+- func: ge.Tensor(Tensor self, Tensor other) -> Tensor
+  structured_delegate: ge.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: ge_quantized_cpu
+  tags: [core, pointwise]
+
+- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  structured_delegate: ge.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: ge.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+# greater_equal, alias for torch.ge
+- func: greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: method, function
+
+- func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+
+- func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: le_Scalar_out
+    MPS: le_scalar_out_mps
+    QuantizedCPU: le_out_quantized_cpu
+  tags: pointwise
+
+- func: le.Scalar(Tensor self, Scalar other) -> Tensor
+  structured_delegate: le.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: le_quantized_cpu
+  tags: [core, pointwise]
+
+- func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: le_Tensor_out
+    MPS: le_tensor_out_mps
+    QuantizedCPU: le_out_quantized_cpu
+  tags: pointwise
+
+- func: le.Tensor(Tensor self, Tensor other) -> Tensor
+  structured_delegate: le.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: le_quantized_cpu
+  tags: [core, pointwise]
+
+- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  structured_delegate: le.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: le.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+# less_equal, alias for torch.le
+- func: less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: method, function
+
+- func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+
+- func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: gt_Scalar_out
+    MPS: gt_scalar_out_mps
+    QuantizedCPU: gt_out_quantized_cpu
+  tags: pointwise
+
+- func: gt.Scalar(Tensor self, Scalar other) -> Tensor
+  structured_delegate: gt.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: gt_quantized_cpu
+    NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested
+  tags: [core, pointwise]
+
+- func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: gt_Tensor_out
+    MPS: gt_tensor_out_mps
+    QuantizedCPU: gt_out_quantized_cpu
+  tags: pointwise
+
+- func: gt.Tensor(Tensor self, Tensor other) -> Tensor
+  structured_delegate: gt.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: gt_quantized_cpu
+  tags: [core, pointwise]
+
+- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  structured_delegate: gt.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: gt.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+#  greater, alias for torch.gt
+- func: greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: greater.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: method, function
+
+- func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: greater.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+
+- func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: lt_Scalar_out
+    MPS: lt_scalar_out_mps
+    QuantizedCPU: lt_out_quantized_cpu
+  tags: pointwise
+
+- func: lt.Scalar(Tensor self, Scalar other) -> Tensor
+  structured_delegate: lt.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: lt_quantized_cpu
+  tags: [core, pointwise]
+
+- func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: lt_Tensor_out
+    MPS: lt_tensor_out_mps
+    QuantizedCPU: lt_out_quantized_cpu
+  tags: pointwise
+
+- func: lt.Tensor(Tensor self, Tensor other) -> Tensor
+  structured_delegate: lt.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    QuantizedCPU: lt_quantized_cpu
+  tags: [core, pointwise]
+
+- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  structured_delegate: lt.Scalar_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: lt.Tensor_out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+
+#  less, alias for torch.lt
+- func: less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: less.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: method, function
+
+- func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: less.Tensor(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+
+- func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+
+- func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: take_out
+
+- func: take(Tensor self, Tensor index) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, CUDA: take
+
+- func: take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor
+  variants: method, function
+
+- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, QuantizedCPU: index_select_out_cpu_
+    CUDA, QuantizedCUDA: index_select_out_cuda
+    MPS: index_select_out_mps
+
+- func: index_select(Tensor self, int dim, Tensor index) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU: index_select_cpu_
+    QuantizedCPU: index_select_quantized_cpu_
+    CUDA: index_select_cuda
+    QuantizedCUDA: index_select_quantized_cuda
+    SparseCPU: index_select_sparse_cpu
+    SparseCUDA: index_select_sparse_cuda
+    MPS: index_select_mps
+  tags: core
+
+- func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor
+  variants: method, function
+
+- func: index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeImplicitAutograd: index_select_backward_symint
+
+- func: masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: masked_select_out_cpu
+    CUDA: masked_select_out_cuda
+    MPS: masked_select_out_mps
+  tags: dynamic_output_shape
+
+- func: masked_select(Tensor self, Tensor mask) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU: masked_select_cpu
+    CUDA: masked_select_cuda
+    MPS: masked_select_mps
+  tags: dynamic_output_shape
+
+- func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+
+- func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: nonzero_out_cpu
+    CUDA: nonzero_out_cuda
+    MPS: nonzero_out_mps
+  tags: dynamic_output_shape
+
+- func: nonzero(Tensor self) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU: nonzero_cpu
+    CUDA: nonzero_cuda
+    MPS: nonzero_mps
+  tags: [dynamic_output_shape, core]
+
+- func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: nonzero_static_out_cpu
+
+- func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU: nonzero_static_cpu
+
+- func: nonzero_numpy(Tensor self) -> Tensor[]
+  variants: method, function
+
+- func: argwhere(Tensor self) -> Tensor
+  variants: method, function
+  tags: dynamic_output_shape
+
+- func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU, CUDA: gather_out
+    MPS: gather_out_mps
+
+- func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+  variants: method, function
+  structured_delegate: gather.out
+  tags: core
+
+- func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor
+  variants: function
+  device_check: NoCheck
+  device_guard: False
+
+- func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+
+- func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+  variants: method, function
+
+- func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor
+
+- func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: addcmul_out
+    MPS: addcmul_out_mps
+  tags: pointwise
+
+- func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+  structured_delegate: addcmul.out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+  structured_delegate: addcmul.out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: pointwise
+
+- func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: addcdiv_out
+    MPS: addcdiv_out_mps
+  tags: pointwise
+
+- func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+  structured_delegate: addcdiv.out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+  structured_delegate: addcdiv.out
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  tags: pointwise
+
+- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: cross_entropy_loss_symint
+
+- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
+  structured: True
+  dispatch:
+    CPU, CUDA: triangular_solve_out
+    MPS: triangular_solve_mps_out
+    SparseCsrCPU: triangular_solve_out_sparse_csr_cpu
+    SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda
+
+- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
+  structured_delegate: triangular_solve.X
+  variants: method, function
+
+- func: _linalg_check_errors(Tensor info, str api_name, *, bool is_matrix) -> ()
+  dispatch:
+    CompositeExplicitAutograd: _linalg_check_errors
+
+- func: linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  dispatch:
+    CPU, CUDA: linalg_solve_triangular_out
+    MPS: linalg_solve_triangular_mps_out
+
+- func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_solve_triangular
+    MPS: linalg_solve_triangular_mps
+
+- func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor
+  python_module: linalg
+  dispatch:
+    CompositeImplicitAutograd: linalg_vander_symint
+
+- func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
+
+- func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
+  variants: method, function
+
+# swapaxes, alias for transpose
+- func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+
+# swapdims, alias for transpose
+- func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  tags: inplace_view
+
+- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: cholesky_out
+
+- func: cholesky(Tensor self, bool upper=False) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, CUDA: cholesky
+
+- func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: cholesky_solve_out
+
+- func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: cholesky_solve
+
+- func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _cholesky_solve_helper_cpu
+    CUDA: _cholesky_solve_helper_cuda
+  autogen: _cholesky_solve_helper.out
+
+- func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, CUDA: cholesky_inverse
+
+- func: cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: cholesky_inverse_out
+
+- func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+
+- func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)
+  variants: method, function
+
+- func: geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau)
+  dispatch:
+    CPU, CUDA: geqrf_out
+
+- func: geqrf(Tensor self) -> (Tensor a, Tensor tau)
+  variants: method, function
+  dispatch:
+    CPU, CUDA: geqrf
+
+# orgqr, alias for linalg_householder_product
+- func: orgqr(Tensor self, Tensor input2) -> Tensor
+  variants: method, function
+
+- func: orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: ormqr_out
+
+- func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, CUDA: ormqr
+
+- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info)
+  variants: function
+
+- func: lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
+  variants: method, function
+
+# lu_unpack
+- func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)
+  structured_delegate: lu_unpack.out
+  variants: function
+
+- func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: lu_unpack_out
+
+# TODO: remove dispatch section when porting TH CUDA to ATen
+- func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: multinomial_out
+    MPS: multinomial_out_mps
+
+- func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, CUDA: multinomial
+    MPS: multinomial_mps
+  tags: nondeterministic_seeded
+
+- func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: lgamma_out
+    MPS: lgamma_out_mps
+  tags: pointwise
+
+- func: lgamma_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: lgamma.out
+  variants: method
+  tags: pointwise
+
+- func: lgamma(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: lgamma.out
+  variants: method, function
+  tags: pointwise
+
+- func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: digamma_out
+    MPS: digamma_out_mps
+  tags: pointwise
+
+- func: digamma(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: digamma.out
+  variants: method, function
+  tags: pointwise
+
+- func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: polygamma_out
+    MPS: polygamma_out_mps
+  tags: pointwise
+
+- func: polygamma(int n, Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: polygamma.out
+  variants: method, function
+  tags: pointwise
+
+- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: polygamma_
+  tags: pointwise
+
+- func: erfinv(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: erfinv.out
+  variants: method, function
+  dispatch:
+    SparseCPU, SparseCUDA: erfinv_sparse
+    SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr
+  tags: pointwise
+
+- func: erfinv_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: erfinv.out
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: erfinv_sparse_
+    SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_
+  tags: pointwise
+
+- func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: erfinv_out
+    MPS: erfinv_out_mps
+    SparseCPU, SparseCUDA: erfinv_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out
+  tags: pointwise
+
+- func: i0(Tensor self) -> Tensor
+  structured_delegate: i0.out
+  variants: function, method
+  tags: pointwise
+
+- func: i0_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: i0.out
+  variants: function, method
+  tags: pointwise
+
+- func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: i0_out
+  tags: pointwise
+
+- func: sign(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sign.out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: sign_sparse
+    SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr
+  tags: [core, pointwise]
+
+- func: sign_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: sign.out
+  variants: method
+  dispatch:
+    SparseCPU, SparseCUDA: sign_sparse_
+    SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_
+  tags: pointwise
+
+- func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sign_out
+    MPS: sign_out_mps
+    SparseCPU, SparseCUDA: sign_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out
+  tags: pointwise
+
+- func: signbit(Tensor self) -> Tensor
+  variants: function, method
+  structured_delegate: signbit.out
+  dispatch:
+    SparseCPU, SparseCUDA: signbit_sparse
+    SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr
+  tags: pointwise
+
+- func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU: signbit_out
+    CUDA: signbit_out
+    MPS: signbit_out_mps
+    SparseCPU, SparseCUDA: signbit_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out
+  tags: pointwise
+
+- func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: dist
+  autogen: dist.out
+
+- func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: atan2_out
+    MPS: atan2_out_mps
+  tags: [core, pointwise]
+
+- func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: atan2.out
+  variants: method
+  tags: pointwise
+
+- func: atan2(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: atan2.out
+  variants: method, function
+  tags: [core, pointwise]
+# arctan2, alias of atan2
+
+- func: arctan2(Tensor self, Tensor other) -> Tensor
+  variants: method, function
+
+- func: arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+
+- func: arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+
+- func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: lerp_Scalar
+    MPS: lerp_Scalar_mps
+  tags: pointwise
+
+- func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: lerp_Tensor
+    MPS: lerp_Tensor_mps
+  tags: pointwise
+
+- func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  structured_delegate: lerp.Scalar_out
+  tags: pointwise
+
+- func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  structured_delegate: lerp.Tensor_out
+  tags: pointwise
+
+- func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, MPS: histogram_histc_out
+    CUDA: _histc_out_cuda
+
+- func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
+  variants: method, function
+  dispatch:
+    CPU, MPS: histogram_histc
+    CUDA: _histc_cuda
+
+- func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
+  dispatch:
+    CPU, MPS: histogram_out
+
+- func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
+  variants: method, function
+  dispatch:
+    CPU, MPS: histogram
+
+- func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
+  dispatch:
+    CPU, MPS: histogram_out
+
+- func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
+  variants: method, function
+  dispatch:
+    CPU, MPS: histogram
+
+- func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[]
+  dispatch:
+    CPU, MPS: histogramdd_bin_edges
+  autogen: _histogramdd_bin_edges.out
+
+- func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor
+  dispatch:
+    CPU, MPS: _histogramdd
+  autogen: _histogramdd_from_bin_cts.out
+
+- func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor
+  dispatch:
+    CPU, MPS: _histogramdd
+  autogen: _histogramdd_from_bin_tensors.out
+
+- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
+
+- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
+
+- func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
+
+- func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CompositeExplicitAutograd: fmod_out
+  tags: pointwise
+
+- func: fmod.Scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: fmod
+  tags: [core, pointwise]
+
+- func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: fmod_
+  tags: pointwise
+
+- func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: fmod_out
+    MPS: fmod_mps_out
+  tags: pointwise
+
+- func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: fmod.Tensor_out
+  variants: method, function
+  tags: [core, pointwise]
+
+- func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: fmod.Tensor_out
+  tags: pointwise
+
+- func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: hypot_out
+    MPS: hypot_out_mps
+  tags: pointwise
+
+- func: hypot(Tensor self, Tensor other) -> Tensor
+  structured_delegate: hypot.out
+  variants: method, function
+  tags: pointwise
+
+- func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: hypot.out
+  variants: method
+  tags: pointwise
+
+- func: igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: igamma_out
+  tags: pointwise
+
+- func: igamma(Tensor self, Tensor other) -> Tensor
+  structured_delegate: igamma.out
+  variants: method, function
+  tags: pointwise
+
+- func: igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: igamma.out
+  variants: method
+  tags: pointwise
+
+- func: igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: igammac_out
+  tags: pointwise
+
+- func: igammac(Tensor self, Tensor other) -> Tensor
+  structured_delegate: igammac.out
+  variants: method, function
+  tags: pointwise
+
+- func: igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: igammac.out
+  variants: method
+  tags: pointwise
+
+- func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA, MPS: nextafter_out
+  tags: pointwise
+
+- func: nextafter(Tensor self, Tensor other) -> Tensor
+  structured_delegate: nextafter.out
+  variants: method, function
+  tags: pointwise
+
+- func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  structured_delegate: nextafter.out
+  variants: method
+  tags: pointwise
+
+- func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: remainder_out
+  tags: pointwise
+
+- func: remainder.Scalar(Tensor self, Scalar other) -> Tensor
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: remainder
+  tags: [core, pointwise]
+
+- func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CompositeExplicitAutograd: remainder_
+  tags: pointwise
+
+- func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: remainder_out
+    MPS: remainder_out_mps
+  tags: pointwise
+
+- func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: remainder.Tensor_out
+  variants: method, function
+  tags: [core, pointwise]
+
+- func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: remainder.Tensor_out
+  variants: method
+  tags: pointwise
+
+- func: remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: function
+  dispatch:
+    CPU, CUDA, MPS: remainder
+  autogen: remainder.Scalar_Tensor_out
+  tags: pointwise
+
+- func: min(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: min
+    MPS: min_mps
+    QuantizedCPU: min_quantized_cpu
+
+- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: min_unary_out
+    QuantizedCPU: min_quantized_unary_out
+
+- func: fmin(Tensor self, Tensor other) -> Tensor
+  structured_delegate: fmin.out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA, MPS: fmin_out
+  tags: pointwise
+
+- func: max(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA: max
+    MPS: max_mps
+    QuantizedCPU: max_quantized_cpu
+
+- func: fmax(Tensor self, Tensor other) -> Tensor
+  structured_delegate: fmax.out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA, MPS: fmax_out
+  tags: pointwise
+
+- func: maximum(Tensor self, Tensor other) -> Tensor
+  structured_delegate: maximum.out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: [core, pointwise]
+
+- func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: maximum_out
+    MPS: maximum_out_mps
+  tags: pointwise
+
+# binary max, alias of maximum
+# NOTE: max is not an alias for maximum, since there is also unary max
+- func: max.other(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: pointwise
+
+- func: max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: max_unary_out
+    QuantizedCPU: max_quantized_unary_out
+
+- func: minimum(Tensor self, Tensor other) -> Tensor
+  structured_delegate: minimum.out
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: [core, pointwise]
+
+- func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CPU, CUDA: minimum_out
+    MPS: minimum_out_mps
+  tags: pointwise
+
+# binary min, alias for minimum
+# NOTE: min is not an alias for minimum, since there is also unary min
+- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: pointwise
+
+- func: min.other(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  tags: pointwise
+
+- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+  variants: method, function
+
+- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+
+- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+  variants: method, function
+
+- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+
+- func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+  variants: method, function
+
+- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+
+- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+  variants: method, function
+
+- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+
+- func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  device_check: NoCheck   # TensorIterator
+  dispatch:
+    CompositeExplicitAutograd: sort_out
+
+- func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  structured: True
+  dispatch:
+    CPU, CUDA: sort_stable_out
+    MPS: sort_stable_out_mps
+
+- func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: sort
+  tags: core
+
+- func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
+  structured_delegate: sort.values_stable
+  variants: method, function
+  dispatch:
+    QuantizedCPU: sort_quantized_cpu_stable
+
+- func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+
+- func: sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+
+- func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
+  variants: method, function
+
+- func: sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
+  variants: method, function
+
+- func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: msort(Tensor self) -> Tensor
+  variants: method, function
+
+- func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+
+- func: argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  dispatch:
+    CPU, CUDA, MPS: argsort_stable
+  autogen: argsort.stable_out
+
+- func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
+  variants: method, function
+
+- func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+  structured: True
+  dispatch:
+    CPU: topk_out_cpu
+    CUDA: topk_out_cuda
+    MPS: topk_out_mps
+
+- func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+  variants: method, function
+  structured_delegate: topk.values
+  dispatch:
+    QuantizedCPU: topk_quantized_cpu
+  tags: core
+
+- func: all(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: all.all_out
+  variants: method, function
+
+- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  structured: True
+  dispatch:
+    CPU, CUDA: all_all_out
+    MPS: all_all_out_mps
+
+- func: any(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: any.all_out
+  variants: method, function
+  dispatch:
+    SparseCPU, SparseCUDA: any_sparse
+  tags: core
+
+- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  structured: True
+  dispatch:
+    CPU, CUDA: any_all_out
+    MPS: any_all_out_mps
+
+- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: renorm_out
+    MPS: renorm_out_mps
+
+- func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  variants: method, function
+  structured_delegate: renorm.out
+
+- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  variants: method
+  structured_delegate: renorm.out
+
+- func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CPU, CUDA, Meta, MPS: unfold
+    QuantizedCPU, QuantizedCUDA: unfold
+
+- func: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: unfold_backward
+  autogen: unfold_backward.out
+
+- func: equal(Tensor self, Tensor other) -> bool
+  tags: [data_dependent_output, pointwise]
+  variants: method, function
+  dispatch:
+    CPU: cpu_equal
+    CUDA: cuda_equal
+    MPS: mps_equal
+    QuantizedCPU: equal_quantized_cpu
+
+- func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: pow_Tensor_Tensor_out
+    MPS: pow_tensor_tensor_out_mps
+  tags: pointwise
+
+- func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: pow.Tensor_Tensor_out
+  variants: method, function
+  tags: [core, pointwise]
+
+- func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  dispatch:
+    CPU, CUDA: pow_Scalar_out
+    MPS: pow_Scalar_out_mps
+  tags: pointwise
+
+- func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: pow.Scalar_out
+  tags: [core, pointwise]
+
+- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: pow_Tensor_Scalar_out
+    SparseCPU, SparseCUDA: pow_out_sparse_scalar
+    MPS: pow_tensor_scalar_out_mps
+  tags: pointwise
+
+- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: pow.Tensor_Scalar_out
+  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: pow_sparse_scalar
+  tags: [core, pointwise]
+
+- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: pow.Tensor_Scalar_out
+  variants: method
+  tags: pointwise
+
+- func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: pow.Tensor_Tensor_out
+  variants: method
+  tags: pointwise
+
+- func: float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
+
+- func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+  variants: function, method
+  tags: pointwise
+
+- func: float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
+
+- func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor
+  tags: pointwise
+
+- func: float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
+
+- func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+  variants: function, method
+  tags: pointwise
+
+- func: float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
+  variants: method
+  tags: pointwise
+
+- func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
+  variants: method
+  tags: pointwise
+
+- func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  variants: method
+  dispatch:
+    CPU, CUDA: normal_
+    MPS: normal_mps_
+    Meta: normal_meta_
+    SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: normal_nested_
+  autogen: normal.out
+
+# Only used by the functionalization pass.
+# Normally, the codegen would be able to generate a normal() NativeFunction,
+# but we can't due to overload ambiguity with normal.Tensor_float.
+- func: normal_functional(Tensor self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutograd: normal_functional
+
+- func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU, CUDA: normal_out
+    MPS: normal_mps_out
+    Meta: normal_out_meta
+
+- func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
+  dispatch:
+    CPU, CUDA: normal
+    MPS: normal_mps
+    Meta: normal_meta
+  tags: nondeterministic_seeded
+
+- func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: normal_out
+    Meta: normal_out_meta
+    MPS: normal_mps_out
+  tags: nondeterministic_seeded
+
+- func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
+  dispatch:
+    CPU, CUDA: normal
+    MPS: normal_mps
+    Meta: normal_meta
+  tags: nondeterministic_seeded
+
+- func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: normal_out
+    Meta: normal_out_meta
+    MPS: normal_mps_out
+  tags: nondeterministic_seeded
+
+- func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
+  dispatch:
+    CPU, CUDA: normal
+    MPS: normal_mps
+    Meta: normal_meta
+  tags: nondeterministic_seeded
+
+- func: normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: normal
+  tags: nondeterministic_seeded
+
+- func: normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: normal_out
+  tags: nondeterministic_seeded
+
+- func: alias(Tensor(a) self) -> Tensor(a)
+  variants: method, function
+  dispatch:
+    CompositeExplicitAutograd: alias
+  tags: core
+
+- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
+  variants: function
+  dispatch:
+    CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
+    CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
+  autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out
+
+- func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CUDA: _amp_update_scale_cuda_
+    CPU: _amp_update_scale_cpu_
+  autogen: _amp_update_scale, _amp_update_scale.out
+
+    #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
+    #dispatch:
+    #CPU: _cat_cpu
+    #CUDA: cat_cuda
+    #MPS: cat_mps
+    #QuantizedCPU: cat_quantized_cpu
+
+    #- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    #dispatch:
+    #CPU: _cat_out_cpu
+  #CUDA: cat_out_cuda
+  #QuantizedCPU: cat_out_quantized_cpu
+
+- func: _foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_scalar_kernel_slow
+    CUDA: foreach_tensor_add_scalar_kernel_cuda
+
+- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_scalar_kernel_slow_
+    CUDA: foreach_tensor_add_scalar_kernel_cuda_
+  autogen: _foreach_add.Scalar_out
+
+- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_list_kernel_slow
+    CUDA: foreach_tensor_add_list_kernel_cuda
+
+- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_list_kernel_slow_
+    CUDA: foreach_tensor_add_list_kernel_cuda_
+  autogen: _foreach_add.List_out
+
+- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_scalarlist_kernel_slow
+    CUDA: foreach_tensor_add_scalarlist_kernel_cuda
+
+- func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
+  autogen: _foreach_add.ScalarList_out
+
+- func: _foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_tensor_kernel_slow
+    CUDA: foreach_tensor_add_tensor_kernel_cuda
+
+- func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_add_tensor_kernel_slow_
+    CUDA: foreach_tensor_add_tensor_kernel_cuda_
+  autogen: _foreach_add.Tensor_out
+
+- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sub_scalar_kernel_slow
+    CUDA: foreach_tensor_sub_scalar_kernel_cuda
+
+- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sub_scalar_kernel_slow_
+    CUDA: foreach_tensor_sub_scalar_kernel_cuda_
+  autogen: _foreach_sub.Scalar_out
+
+- func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sub_list_kernel_slow
+    CUDA: foreach_tensor_sub_list_kernel_cuda
+
+- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sub_list_kernel_slow_
+    CUDA: foreach_tensor_sub_list_kernel_cuda_
+  autogen: _foreach_sub.List_out
+
+- func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sub_scalarlist_kernel_slow
+    CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
+
+- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sub_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
+  autogen: _foreach_sub.ScalarList_out
+
+- func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_scalar_kernel_slow
+    CUDA: foreach_tensor_mul_scalar_kernel_cuda
+
+- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_scalar_kernel_slow_
+    CUDA: foreach_tensor_mul_scalar_kernel_cuda_
+  autogen: _foreach_mul.Scalar_out
+
+- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_list_kernel_slow
+    CUDA: foreach_tensor_mul_list_kernel_cuda
+
+- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_list_kernel_slow_
+    CUDA: foreach_tensor_mul_list_kernel_cuda_
+  autogen: _foreach_mul.List_out
+
+- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_scalarlist_kernel_slow
+    CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
+
+- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
+  autogen: _foreach_mul.ScalarList_out
+
+- func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_tensor_kernel_slow
+    CUDA: foreach_tensor_mul_tensor_kernel_cuda
+
+- func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_mul_tensor_kernel_slow_
+    CUDA: foreach_tensor_mul_tensor_kernel_cuda_
+  autogen: _foreach_mul.Tensor_out
+
+- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_scalar_kernel_slow
+    CUDA: foreach_tensor_div_scalar_kernel_cuda
+
+- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_scalar_kernel_slow_
+    CUDA: foreach_tensor_div_scalar_kernel_cuda_
+  autogen: _foreach_div.Scalar_out
+
+- func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_list_kernel_slow
+    CUDA: foreach_tensor_div_list_kernel_cuda
+
+- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_list_kernel_slow_
+    CUDA: foreach_tensor_div_list_kernel_cuda_
+  autogen: _foreach_div.List_out
+
+- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_scalarlist_kernel_slow
+    CUDA: foreach_tensor_div_scalarlist_kernel_cuda
+
+- func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
+  autogen: _foreach_div.ScalarList_out
+
+- func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_tensor_kernel_slow
+    CUDA: foreach_tensor_div_tensor_kernel_cuda
+
+- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_div_tensor_kernel_slow_
+    CUDA: foreach_tensor_div_tensor_kernel_cuda_
+  autogen: _foreach_div.Tensor_out
+
+- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+    CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
+
+- func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+    CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
+  autogen: _foreach_clamp_max.Scalar_out
+
+- func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_list_kernel_slow
+    CUDA: foreach_tensor_clamp_max_list_kernel_cuda
+
+- func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_list_kernel_slow_
+    CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
+  autogen: _foreach_clamp_max.List_out
+
+- func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+    CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
+
+- func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
+  autogen: _foreach_clamp_max.ScalarList_out
+
+- func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+    CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
+
+- func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+    CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
+  autogen: _foreach_clamp_min.Scalar_out
+
+- func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_list_kernel_slow
+    CUDA: foreach_tensor_clamp_min_list_kernel_cuda
+
+- func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_list_kernel_slow_
+    CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
+  autogen: _foreach_clamp_min.List_out
+
+- func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+    CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
+
+- func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
+  autogen: _foreach_clamp_min.ScalarList_out
+
+# foreach_minimum/maximum dispatches to clamp_max/min
+- func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+    CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
+
+- func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+    CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
+  autogen: _foreach_maximum.Scalar_out
+
+# foreach_minimum/maximum dispatches to clamp_max/min
+- func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_list_kernel_slow
+    CUDA: foreach_tensor_clamp_min_list_kernel_cuda
+
+- func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_list_kernel_slow_
+    CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
+  autogen: _foreach_maximum.List_out
+
+# foreach_minimum/maximum dispatches to clamp_max/min
+- func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+    CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
+
+- func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
+  autogen: _foreach_maximum.ScalarList_out
+
+- func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+    CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
+
+- func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+    CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
+  autogen: _foreach_minimum.Scalar_out
+
+- func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_list_kernel_slow
+    CUDA: foreach_tensor_clamp_max_list_kernel_cuda
+
+- func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_list_kernel_slow_
+    CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
+  autogen: _foreach_minimum.List_out
+
+- func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+    CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
+
+- func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
+  autogen: _foreach_minimum.ScalarList_out
+
+- func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_scalar_slow
+    CUDA: foreach_tensor_addcdiv_scalar_cuda
+
+- func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_scalarlist_slow
+    CUDA: foreach_tensor_addcdiv_scalarlist_cuda
+
+- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_tensor_slow
+    CUDA: foreach_tensor_addcdiv_tensor_cuda
+
+- func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_scalar_slow_
+    CUDA: foreach_tensor_addcdiv_scalar_cuda_
+  autogen: _foreach_addcdiv.Scalar_out
+
+- func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_scalarlist_slow_
+    CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
+  autogen: _foreach_addcdiv.ScalarList_out
+
+- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_tensor_slow_
+    CUDA: foreach_tensor_addcdiv_tensor_cuda_
+  autogen: _foreach_addcdiv.Tensor_out
+
+- func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_scalar_slow
+    CUDA: foreach_tensor_addcmul_scalar_cuda
+
+- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_scalarlist_slow
+    CUDA: foreach_tensor_addcmul_scalarlist_cuda
+
+- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_tensor_slow
+    CUDA: foreach_tensor_addcmul_tensor_cuda
+
+- func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_scalar_slow_
+    CUDA: foreach_tensor_addcmul_scalar_cuda_
+  autogen: _foreach_addcmul.Scalar_out
+
+- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_scalarlist_slow_
+    CUDA: foreach_tensor_addcmul_scalarlist_cuda_
+  autogen: _foreach_addcmul.ScalarList_out
+
+- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_tensor_slow_
+    CUDA: foreach_tensor_addcmul_tensor_cuda_
+  autogen: _foreach_addcmul.Tensor_out
+
+- func: _foreach_abs(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_abs_slow
+    CUDA: foreach_tensor_abs_cuda
+
+- func: _foreach_abs_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_abs_slow_
+    CUDA: foreach_tensor_abs_cuda_
+  autogen: _foreach_abs.out
+
+- func: _foreach_acos(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_acos_slow
+    CUDA: foreach_tensor_acos_cuda
+
+- func: _foreach_acos_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_acos_slow_
+    CUDA: foreach_tensor_acos_cuda_
+  autogen: _foreach_acos.out
+
+- func: _foreach_asin(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_asin_slow
+    CUDA: foreach_tensor_asin_cuda
+
+- func: _foreach_asin_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_asin_slow_
+    CUDA: foreach_tensor_asin_cuda_
+  autogen: _foreach_asin.out
+
+- func: _foreach_atan(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_atan_slow
+    CUDA: foreach_tensor_atan_cuda
+
+- func: _foreach_atan_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_atan_slow_
+    CUDA: foreach_tensor_atan_cuda_
+  autogen: _foreach_atan.out
+
+- func: _foreach_ceil(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_ceil_slow
+    CUDA: foreach_tensor_ceil_cuda
+
+- func: _foreach_ceil_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_ceil_slow_
+    CUDA: foreach_tensor_ceil_cuda_
+  autogen: _foreach_ceil.out
+
+- func: _foreach_cos(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_cos_slow
+    CUDA: foreach_tensor_cos_cuda
+
+- func: _foreach_cos_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_cos_slow_
+    CUDA: foreach_tensor_cos_cuda_
+  autogen: _foreach_cos.out
+
+- func: _foreach_cosh(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_cosh_slow
+    CUDA: foreach_tensor_cosh_cuda
+
+- func: _foreach_cosh_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_cosh_slow_
+    CUDA: foreach_tensor_cosh_cuda_
+  autogen: _foreach_cosh.out
+
+- func: _foreach_erf(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_erf_slow
+    CUDA: foreach_tensor_erf_cuda
+
+- func: _foreach_erf_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_erf_slow_
+    CUDA: foreach_tensor_erf_cuda_
+  autogen: _foreach_erf.out
+
+- func: _foreach_erfc(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_erfc_slow
+    CUDA: foreach_tensor_erfc_cuda
+
+- func: _foreach_erfc_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_erfc_slow_
+    CUDA: foreach_tensor_erfc_cuda_
+  autogen: _foreach_erfc.out
+
+- func: _foreach_exp(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_exp_slow
+    CUDA: foreach_tensor_exp_cuda
+
+- func: _foreach_exp_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_exp_slow_
+    CUDA: foreach_tensor_exp_cuda_
+  autogen: _foreach_exp.out
+
+- func: _foreach_expm1(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_expm1_slow
+    CUDA: foreach_tensor_expm1_cuda
+
+- func: _foreach_expm1_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_expm1_slow_
+    CUDA: foreach_tensor_expm1_cuda_
+  autogen: _foreach_expm1.out
+
+- func: _foreach_floor(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_floor_slow
+    CUDA: foreach_tensor_floor_cuda
+
+- func: _foreach_floor_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_floor_slow_
+    CUDA: foreach_tensor_floor_cuda_
+  autogen: _foreach_floor.out
+
+- func: _foreach_frac(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_frac_slow
+    CUDA: foreach_tensor_frac_cuda
+
+- func: _foreach_frac_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_frac_slow_
+    CUDA: foreach_tensor_frac_cuda_
+  autogen: _foreach_frac.out
+
+- func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_ternary_lerp_slow
+    CUDA: foreach_tensor_lerp_ternary_cuda
+  autogen: _foreach_lerp.List_out
+
+- func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_ternary_lerp_slow_
+    CUDA: foreach_tensor_lerp_ternary_cuda_
+  autogen: _foreach_lerp.List_out
+
+- func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_lerp_list_kernel_slow
+    CUDA: foreach_tensor_lerp_list_cuda
+  autogen: _foreach_lerp.Scalar_out
+
+- func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_lerp_list_kernel_slow_
+    CUDA: foreach_tensor_lerp_list_cuda_
+  autogen: _foreach_lerp.Scalar_out
+
+- func: _foreach_lgamma(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_lgamma_slow
+    CUDA: foreach_tensor_lgamma_cuda
+
+- func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_lgamma_slow_
+    CUDA: foreach_tensor_lgamma_cuda_
+  autogen: _foreach_lgamma.out
+
+- func: _foreach_log(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log_slow
+    CUDA: foreach_tensor_log_cuda
+
+- func: _foreach_log_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log_slow_
+    CUDA: foreach_tensor_log_cuda_
+  autogen: _foreach_log.out
+
+- func: _foreach_log10(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log10_slow
+    CUDA: foreach_tensor_log10_cuda
+
+- func: _foreach_log10_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log10_slow_
+    CUDA: foreach_tensor_log10_cuda_
+  autogen: _foreach_log10.out
+
+- func: _foreach_log1p(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log1p_slow
+    CUDA: foreach_tensor_log1p_cuda
+
+- func: _foreach_log1p_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log1p_slow_
+    CUDA: foreach_tensor_log1p_cuda_
+  autogen: _foreach_log1p.out
+
+- func: _foreach_log2(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log2_slow
+    CUDA: foreach_tensor_log2_cuda
+
+- func: _foreach_log2_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_log2_slow_
+    CUDA: foreach_tensor_log2_cuda_
+  autogen: _foreach_log2.out
+
+- func: _foreach_neg(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_neg_slow
+    CUDA: foreach_tensor_neg_cuda
+
+- func: _foreach_neg_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_neg_slow_
+    CUDA: foreach_tensor_neg_cuda_
+  autogen: _foreach_neg.out
+
+- func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_norm_slow
+    CUDA: foreach_tensor_norm_cuda
+  autogen: _foreach_norm.Scalar_out
+
+- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_list_kernel_slow
+    CUDA: foreach_tensor_pow_list_kernel_cuda
+
+- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalar_kernel_slow
+    CUDA: foreach_tensor_pow_scalar_kernel_cuda
+
+- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalarlist_kernel_slow
+    CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
+
+- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_scalar_pow_list_kernel_slow
+    CUDA: foreach_scalar_pow_list_kernel_cuda
+
+- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_list_kernel_slow_
+    CUDA: foreach_tensor_pow_list_kernel_cuda_
+  autogen: _foreach_pow.List_out
+
+- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalar_kernel_slow_
+    CUDA: foreach_tensor_pow_scalar_kernel_cuda_
+  autogen: _foreach_pow.Scalar_out
+
+- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
+  autogen: _foreach_pow.ScalarList_out
+
+- func: _foreach_reciprocal(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_reciprocal_slow
+    CUDA: foreach_tensor_reciprocal_cuda
+
+- func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_reciprocal_slow_
+    CUDA: foreach_tensor_reciprocal_cuda_
+  autogen: _foreach_reciprocal.out
+
+- func: _foreach_round(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_round_slow
+    CUDA: foreach_tensor_round_cuda
+
+- func: _foreach_round_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_round_slow_
+    CUDA: foreach_tensor_round_cuda_
+  autogen: _foreach_round.out
+
+- func: _foreach_sigmoid(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sigmoid_slow
+    CUDA: foreach_tensor_sigmoid_cuda
+
+- func: _foreach_sigmoid_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sigmoid_slow_
+    CUDA: foreach_tensor_sigmoid_cuda_
+  autogen: _foreach_sigmoid.out
+
+- func: _foreach_sign(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sign_slow
+    CUDA: foreach_tensor_sign_cuda
+
+- func: _foreach_sign_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sign_slow_
+    CUDA: foreach_tensor_sign_cuda_
+  autogen: _foreach_sign.out
+
+- func: _foreach_sin(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sin_slow
+    CUDA: foreach_tensor_sin_cuda
+
+- func: _foreach_sin_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sin_slow_
+    CUDA: foreach_tensor_sin_cuda_
+  autogen: _foreach_sin.out
+
+- func: _foreach_sinh(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sinh_slow
+    CUDA: foreach_tensor_sinh_cuda
+
+- func: _foreach_sinh_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sinh_slow_
+    CUDA: foreach_tensor_sinh_cuda_
+  autogen: _foreach_sinh.out
+
+- func: _foreach_sqrt(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sqrt_slow
+    CUDA: foreach_tensor_sqrt_cuda
+
+- func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_sqrt_slow_
+    CUDA: foreach_tensor_sqrt_cuda_
+  autogen: _foreach_sqrt.out
+
+- func: _foreach_tan(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_tan_slow
+    CUDA: foreach_tensor_tan_cuda
+
+- func: _foreach_tan_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_tan_slow_
+    CUDA: foreach_tensor_tan_cuda_
+  autogen: _foreach_tan.out
+
+- func: _foreach_tanh(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_tanh_slow
+    CUDA: foreach_tensor_tanh_cuda
+
+- func: _foreach_tanh_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_tanh_slow_
+    CUDA: foreach_tensor_tanh_cuda_
+  autogen: _foreach_tanh.out
+
+- func: _foreach_trunc(Tensor[] self) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_trunc_slow
+    CUDA: foreach_tensor_trunc_cuda
+
+- func: _foreach_trunc_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_trunc_slow_
+    CUDA: foreach_tensor_trunc_cuda_
+  autogen: _foreach_trunc.out
+
+- func: _foreach_zero_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_zero_slow_
+    CUDA: foreach_tensor_zero_cuda_
+  autogen: _foreach_zero, _foreach_zero.out
+
+- func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_copy_list_kernel_slow_
+    CUDA: foreach_tensor_copy_list_kernel_cuda_
+  autogen: _foreach_copy, _foreach_copy.out
+
+- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
+  dispatch:
+    CPU: bucketize_cpu
+    CUDA: bucketize_cuda
+    MPS: bucketize_mps
+
+- func: bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: bucketize_out_cpu
+    CUDA: bucketize_out_cuda
+    MPS: bucketize_out_mps
+
+- func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
+  dispatch:
+    CPU: bucketize_cpu
+    CUDA: bucketize_cuda
+    MPS: bucketize_mps
+  autogen: bucketize.Scalar_out
+
+- func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor
+  dispatch:
+    CPU: searchsorted_cpu
+    CUDA: searchsorted_cuda
+    MPS: searchsorted_mps
+
+- func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: searchsorted_out_cpu
+    CUDA: searchsorted_out_cuda
+    MPS: searchsorted_out_mps
+
+- func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor
+  dispatch:
+    CPU: searchsorted_cpu
+    CUDA: searchsorted_cuda
+    MPS: searchsorted_mps
+
+- func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU: searchsorted_out_cpu
+    CUDA: searchsorted_out_cuda
+    MPS: searchsorted_out_mps
+
+- func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor
+  structured_delegate: _convert_indices_from_coo_to_csr.out
+
+- func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: _convert_indices_from_coo_to_csr_structured_cpu
+    CUDA: _convert_indices_from_coo_to_csr_structured_cuda
+
+- func: _convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor
+  structured_delegate: _convert_indices_from_csr_to_coo.out
+
+- func: _convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    CPU: _convert_indices_from_csr_to_coo_structured_cpu
+    CUDA: _convert_indices_from_csr_to_coo_structured_cuda
+
+## NN wrappers
+
+- func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: mse_loss_out
+    MPS: mse_loss_out_mps
+
+- func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: mse_loss.out
+  python_module: nn
+
+- func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU, CUDA: mse_loss_backward_out
+    MPS: mse_loss_backward_out_mps
+
+- func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: mse_loss_backward
+    MPS: mse_loss_backward_mps
+
+- func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+  python_module: nn
+
+- func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: multi_margin_loss_cpu_out
+    CUDA: multi_margin_loss_cuda_out
+
+- func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: multi_margin_loss_cpu
+    CUDA: multi_margin_loss_cuda
+
+- func: multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: multi_margin_loss_cpu_backward_out
+    CUDA: multi_margin_loss_cuda_backward_out
+
+- func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: multi_margin_loss_cpu_backward
+    CUDA: multi_margin_loss_cuda_backward
+
+- func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+
+- func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+  python_module: nn
+
+- func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  dispatch:
+    CPU: multilabel_margin_loss_forward_out_cpu
+    CUDA: multilabel_margin_loss_forward_out_cuda
+
+- func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target)
+  python_module: nn
+  dispatch:
+    CPU: multilabel_margin_loss_forward_cpu
+    CUDA: multilabel_margin_loss_forward_cuda
+
+- func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: multilabel_margin_loss_backward_cpu_out
+    CUDA: multilabel_margin_loss_backward_cuda_out
+
+- func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: multilabel_margin_loss_backward_cpu
+    CUDA: multilabel_margin_loss_backward_cuda
+
+- func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+
+- func: nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: nll_loss_nd_symint
+
+- func: nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: nll_loss_symint
+
+- func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: nll_loss_forward_out_cpu
+    CUDA: nll_loss_forward_out_cuda
+    MPS: nll_loss_forward_out_mps
+
+- func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
+  python_module: nn
+  structured_delegate: nll_loss_forward.output
+
+- func: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: nll_loss_backward_out_cpu
+    CUDA: nll_loss_backward_out_cuda
+    MPS: nll_loss_backward_out_mps
+
+- func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
+  python_module: nn
+  structured_delegate: nll_loss_backward.grad_input
+
+- func: nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+
+- func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: nll_loss2d_symint
+
+- func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  dispatch:
+    CPU: nll_loss2d_forward_out_cpu
+    CUDA: nll_loss2d_forward_out_cuda
+    MPS: nll_loss2d_forward_out_mps
+
+- func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
+  python_module: nn
+  dispatch:
+    CPU: nll_loss2d_forward_cpu
+    CUDA: nll_loss2d_forward_cuda
+    MPS: nll_loss2d_forward_mps
+
+- func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: nll_loss2d_backward_out_cpu
+    CUDA: nll_loss2d_backward_out_cuda
+    MPS: nll_loss2d_backward_out_mps
+
+- func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: nll_loss2d_backward_cpu
+    CUDA: nll_loss2d_backward_cuda
+    MPS: nll_loss2d_backward_mps
+
+- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: smooth_l1_loss_out
+    MPS: smooth_l1_loss_out_mps
+
+- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  structured_delegate: smooth_l1_loss.out
+  python_module: nn
+
+- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: smooth_l1_loss_backward_out
+    CUDA: smooth_l1_loss_backward_out
+    MPS: smooth_l1_loss_backward_out_mps
+
+- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: smooth_l1_loss_backward
+
+- func: huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU, CUDA: huber_loss_out
+    MPS: huber_loss_out_mps
+
+- func: huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: huber_loss
+    MPS: huber_loss_mps
+
+- func: huber_loss_backward.out(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU, CUDA: huber_loss_backward_out
+    MPS: huber_loss_backward_out_mps
+
+- func: huber_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: huber_loss_backward
+
+- func: soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: soft_margin_loss_out
+
+- func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: soft_margin_loss
+
+- func: soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: soft_margin_loss_backward_out
+
+- func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: soft_margin_loss_backward
+
+- func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: elu_out
+    MPS: elu_out_mps
+
+- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
+  structured_delegate: elu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: elu_backward_out
+    MPS: elu_backward_out_mps
+
+- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
+  structured_delegate: elu_backward.grad_input
+  python_module: nn
+
+- func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)
+  structured_delegate: elu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: glu_out
+    MPS: glu_out_mps
+
+- func: glu(Tensor self, int dim=-1) -> Tensor
+  structured_delegate: glu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: glu_backward_cpu_out
+    CUDA: glu_backward_cuda_out
+    MPS: glu_backward_mps_out
+
+- func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: glu_backward_cpu
+    CUDA: glu_backward_cuda
+    MPS: glu_backward_mps
+
+- func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: glu_jvp
+  autogen: glu_jvp.out
+
+- func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: glu_backward_jvp
+  autogen: glu_backward_jvp.out
+
+- func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardsigmoid_out
+    MPS: hardsigmoid_out_mps
+    QuantizedCPU: hardsigmoid_out_quantized_cpu
+
+- func: hardsigmoid(Tensor self) -> Tensor
+  structured_delegate: hardsigmoid.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    QuantizedCPU: hardsigmoid_quantized_cpu
+
+- func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!)
+  structured_delegate: hardsigmoid.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardsigmoid_backward_out
+    MPS: hardsigmoid_backward_out_mps
+
+- func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
+  structured_delegate: hardsigmoid_backward.grad_input
+  python_module: nn
+
+- func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA, MPS: hardtanh_out
+    QuantizedCPU: hardtanh_out_quantized_cpu
+
+- func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA, MPS: hardtanh
+    QuantizedCPU: hardtanh_quantized_cpu
+  tags: core
+
+- func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardtanh_backward_out
+    MPS: hardtanh_backward_out_mps
+
+- func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardtanh_backward
+    MPS: hardtanh_backward_mps
+
+- func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA, MPS: hardtanh_
+    QuantizedCPU: hardtanh_quantized_cpu_
+
+- func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardswish_out
+    MPS: hardswish_out_mps
+
+- func: hardswish(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardswish
+    MPS: hardswish_mps
+
+- func: hardswish_(Tensor(a!) self) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardswish_
+    MPS: hardswish_mps_
+
+- func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU, CUDA: hardswish_backward
+    MPS: hardswish_backward_mps
+  autogen: hardswish_backward.out
+
+- func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: leaky_relu_out
+    MPS: leaky_relu_out_mps
+    QuantizedCPU: leaky_relu_out_quantized_cpu
+
+- func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
+  structured_delegate: leaky_relu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    QuantizedCPU: leaky_relu_quantized_cpu
+  tags: core
+
+- func: leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: leaky_relu_backward_out
+    MPS: leaky_relu_backward_out_mps
+
+- func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
+  structured_delegate: leaky_relu_backward.grad_input
+  python_module: nn
+
+- func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)
+  structured_delegate: leaky_relu.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    QuantizedCPU: leaky_relu_quantized_cpu_
+
+- func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: log_sigmoid(Tensor self) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!))
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU: log_sigmoid_forward_out_cpu
+    CUDA: log_sigmoid_forward_out_cuda
+    MPS: log_sigmoid_forward_out_mps
+
+- func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU: log_sigmoid_forward_cpu
+    CUDA: log_sigmoid_forward_cuda
+    MPS: log_sigmoid_forward_mps
+
+- func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: log_sigmoid_backward_cpu_out
+    CUDA: log_sigmoid_backward_cuda_out
+    MPS: log_sigmoid_backward_mps_out
+
+- func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: log_sigmoid_backward_cpu
+    CUDA: log_sigmoid_backward_cuda
+    MPS: log_sigmoid_backward_mps
+
+- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU: rrelu_with_noise_out_cpu
+    CUDA: rrelu_with_noise_out_cuda
+
+- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: rrelu_with_noise_cpu
+    CUDA: rrelu_with_noise_cuda
+  tags: nondeterministic_seeded
+
+- func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: rrelu_with_noise_backward
+  autogen: rrelu_with_noise_backward.out
+
+- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+  python_module: nn
+  tags: nondeterministic_seeded
+  dispatch:
+    CPU: rrelu_with_noise_cpu_
+    CUDA: rrelu_with_noise_cuda_
+
+- func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: softplus_out
+    MPS: softplus_out_mps
+
+- func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor
+  structured_delegate: softplus.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: softplus_backward_out
+    MPS: softplus_backward_out_mps
+
+- func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor
+  structured_delegate: softplus_backward.grad_input
+  python_module: nn
+
+- func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+  dispatch:
+    CPU, CUDA: softshrink_out
+    MPS: softshrink_out_mps
+
+- func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor
+  structured_delegate: softshrink.out
+  device_check: NoCheck   # TensorIterator
+  python_module: nn
+
+- func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: nn
+  dispatch:
+    CPU, CUDA: softshrink_backward_out
+    MPS: softshrink_backward_out_mps
+
+- func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor
+  structured_delegate: softshrink_backward.grad_input
+  python_module: nn
+
+- func: adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: adaptive_avg_pool2d_out_cpu
+    CUDA: adaptive_avg_pool2d_out_cuda
+    MPS: adaptive_avg_pool2d_out_mps
+    MkldnnCPU: mkldnn_adaptive_avg_pool2d_out_stub
+
+- func: adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: adaptive_avg_pool2d_symint
+
+- func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_adaptive_avg_pool2d
+
+- func: mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    MkldnnCPU: mkldnn_adaptive_avg_pool2d_out
+
+- func: mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
+  dispatch:
+    MkldnnCPU: mkldnn_adaptive_avg_pool2d_backward
+  autogen: mkldnn_adaptive_avg_pool2d_backward.out
+
+- func: _adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
+  dispatch:
+    CPU: adaptive_avg_pool2d_cpu
+    CUDA: adaptive_avg_pool2d_cuda
+    MPS: adaptive_avg_pool2d_mps
+    QuantizedCPU: adaptive_avg_pool2d_quantized_cpu
+    QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda
+  autogen: _adaptive_avg_pool2d.out
+  tags: core
+
+- func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: adaptive_avg_pool2d_backward_cpu
+    CUDA: adaptive_avg_pool2d_backward_cuda
+    MPS: adaptive_avg_pool2d_backward_mps
+  autogen: _adaptive_avg_pool2d_backward.out
+  tags: core
+
+- func: adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: adaptive_avg_pool3d_out_cpu
+    CUDA: adaptive_avg_pool3d_out_cuda
+    QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu
+
+- func: adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: adaptive_avg_pool3d_symint
+
+- func: _adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
+  dispatch:
+    CPU: adaptive_avg_pool3d_cpu
+    CUDA: adaptive_avg_pool3d_cuda
+    QuantizedCPU: adaptive_avg_pool3d_quantized_cpu
+  autogen: _adaptive_avg_pool3d.out
+  tags: core
+
+- func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: adaptive_avg_pool3d_backward_out_cpu
+    CUDA: adaptive_avg_pool3d_backward_out_cuda
+
+- func: _adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: adaptive_avg_pool3d_backward_cpu
+    CUDA: adaptive_avg_pool3d_backward_cuda
+  autogen: _adaptive_avg_pool3d_backward.out
+
+# Return: (Tensor output, Tensor indices)
+- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: adaptive_max_pool2d_out_cpu
+    CUDA: adaptive_max_pool2d_out_cuda
+    MPS: adaptive_max_pool2d_out_mps
+
+# Return: (Tensor output, Tensor indices)
+- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
+  python_module: nn
+  structured_delegate: adaptive_max_pool2d.out
+
+- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: adaptive_max_pool2d_backward_out_cpu
+    CUDA: adaptive_max_pool2d_backward_out_cuda
+    MPS: adaptive_max_pool2d_backward_out_mps
+
+- func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
+  python_module: nn
+  structured_delegate: adaptive_max_pool2d_backward.grad_input
+
+# Return: (Tensor output, Tensor indices)
+- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: adaptive_max_pool3d_out_cpu
+    CUDA: adaptive_max_pool3d_out_cuda
+
+# Return: (Tensor output, Tensor indices)
+- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
+  python_module: nn
+  structured_delegate: adaptive_max_pool3d.out
+
+- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: adaptive_max_pool3d_backward_out_cpu
+    CUDA: adaptive_max_pool3d_backward_out_cuda
+
+- func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
+  python_module: nn
+  structured_delegate: adaptive_max_pool3d_backward.grad_input
+
+- func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  precomputed:
+  - kernel_size -> int kH, int kW
+  - stride -> int dH, int dW
+  - padding -> int padH, int padW
+  dispatch:
+    CPU: avg_pool2d_out_cpu
+    CUDA: avg_pool2d_out_cuda
+    MPS: avg_pool2d_out_mps
+    MkldnnCPU: mkldnn_avg_pool2d_out
+
+- func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
+  python_module: nn
+  structured_delegate: avg_pool2d.out
+  dispatch:
+    MkldnnCPU: mkldnn_avg_pool2d
+    QuantizedCPU: avg_pool2d_quantized_cpu
+  tags: core
+
+- func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: avg_pool2d_backward_out_cpu
+    CUDA: avg_pool2d_backward_out_cuda
+    MPS: avg_pool2d_backward_out_mps
+    MkldnnCPU: mkldnn_avg_pool2d_backward_out
+
+- func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
+  python_module: nn
+  structured_delegate: avg_pool2d_backward.grad_input
+  dispatch:
+    MkldnnCPU: mkldnn_avg_pool2d_backward
+  tags: core
+
+- func: avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: avg_pool3d_out_cpu
+    CUDA: avg_pool3d_out_cuda
+    MkldnnCPU: mkldnn_avg_pool3d_out
+
+- func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
+  python_module: nn
+  structured_delegate: avg_pool3d.out
+  dispatch:
+    MkldnnCPU: mkldnn_avg_pool3d
+    QuantizedCPU: avg_pool3d_quantized_cpu
+  tags: core
+
+- func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: avg_pool3d_backward_out_cpu
+    CUDA: avg_pool3d_backward_out_cuda
+    MkldnnCPU: mkldnn_avg_pool3d_backward_out
+
+- func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
+  python_module: nn
+  structured_delegate: avg_pool3d_backward.grad_input
+  dispatch:
+    MkldnnCPU: mkldnn_avg_pool3d_backward
+
+# Return: (Tensor output, Tensor indices)
+- func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: fractional_max_pool2d_out_cpu
+    CUDA: fractional_max_pool2d_out_cuda
+
+# Return: (Tensor output, Tensor indices)
+- func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor)
+  python_module: nn
+  structured_delegate: fractional_max_pool2d.output
+
+- func: fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: fractional_max_pool2d_backward_cpu
+    CUDA: fractional_max_pool2d_backward_cuda
+
+- func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor
+  python_module: nn
+  structured_delegate: fractional_max_pool2d_backward.grad_input
+
+# Return: (Tensor output, Tensor indices)
+- func: fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  structured: True
+  precomputed:
+  - kernel_size -> int poolSizeT, int poolSizeH, int poolSizeW
+  - output_size -> int outputT, int outputH, int outputW
+  - int numBatch, int numPlanes, int inputT, int inputH, int inputW
+  dispatch:
+    CPU: fractional_max_pool3d_out_cpu
+    CUDA: fractional_max_pool3d_out_cuda
+
+# Return: (Tensor output, Tensor indices)
+- func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor)
+  python_module: nn
+  structured_delegate: fractional_max_pool3d.output
+
+- func: fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: fractional_max_pool3d_backward_out_cpu
+    CUDA: fractional_max_pool3d_backward_out_cuda
+
+- func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: fractional_max_pool3d_backward_cpu
+    CUDA: fractional_max_pool3d_backward_cuda
+
+# Return: (Tensor output, Tensor indices)
+- func: max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: max_pool2d_with_indices_out_cpu
+    CUDA: max_pool2d_with_indices_out_cuda
+    MPS: max_pool2d_with_indices_out_mps
+
+# Return: (Tensor output, Tensor indices)
+- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+  python_module: nn
+  structured_delegate: max_pool2d_with_indices.out
+  tags: core
+
+- func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: max_pool2d_with_indices_backward_out_cpu
+    CUDA: max_pool2d_with_indices_backward_out_cuda
+    MPS: max_pool2d_with_indices_backward_out_mps
+
+- func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
+  python_module: nn
+  structured_delegate: max_pool2d_with_indices_backward.grad_input
+  tags: core
+
+# Return: (Tensor output, Tensor indices)
+- func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+  python_module: nn
+  dispatch:
+    CPU: max_pool3d_with_indices_out_cpu
+    CUDA: max_pool3d_with_indices_out_cuda
+
+# Return: (Tensor output, Tensor indices)
+- func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+  python_module: nn
+  dispatch:
+    CPU: max_pool3d_with_indices_cpu
+    CUDA: max_pool3d_with_indices_cuda
+  tags: core
+
+- func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: max_pool3d_with_indices_backward_out_cpu
+    CUDA: max_pool3d_with_indices_backward_out_cuda
+
+- func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: max_pool3d_with_indices_backward_cpu
+    CUDA: max_pool3d_with_indices_backward_cuda
+
+- func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: max_unpooling2d_forward_out_cpu
+    CUDA: max_unpooling2d_forward_out_cuda
+
+- func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: max_unpooling2d_forward_cpu
+    CUDA: max_unpooling2d_forward_cuda
+
+- func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: max_unpooling3d_forward_out_cpu
+    CUDA: max_unpooling3d_forward_out_cuda
+
+- func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: max_unpooling3d_forward_cpu
+    CUDA: max_unpooling3d_forward_cuda
+
+- func: reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: reflection_pad1d_out_cpu
+    QuantizedCPU: reflection_pad1d_out_quantized_cpu
+    CUDA: reflection_pad1d_out_cuda
+    MPS: reflection_pad1d_out_mps
+
+- func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+  python_module: nn
+  structured_delegate: reflection_pad1d.out
+  tags: core
+
+- func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: reflection_pad1d_backward_out_cpu
+    CUDA: reflection_pad1d_backward_out_cuda
+    MPS: reflection_pad1d_backward_out_mps
+
+- func: reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
+  python_module: nn
+  structured_delegate: reflection_pad1d_backward.grad_input
+
+- func: reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU, QuantizedCPU: reflection_pad2d_out_cpu
+    CUDA: reflection_pad2d_out_cuda
+    MPS: reflection_pad2d_out_mps
+
+- func: reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: reflection_pad2d_cpu
+    QuantizedCPU: reflection_pad2d_quantized_cpu
+    CUDA: reflection_pad2d_cuda
+    MPS: reflection_pad2d_mps
+  tags: core
+
+- func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: reflection_pad2d_backward_out_cpu
+    CUDA: reflection_pad2d_backward_out_cuda
+    MPS: reflection_pad2d_backward_out_mps
+
+- func: reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: reflection_pad2d_backward_cpu
+    CUDA: reflection_pad2d_backward_cuda
+    MPS: reflection_pad2d_backward_mps
+
+- func: reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: reflection_pad3d_out_cpu
+    CUDA: reflection_pad3d_out_cuda
+    MPS: reflection_pad3d_out_mps
+
+- func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
+  python_module: nn
+  structured_delegate: reflection_pad3d.out
+  tags: core
+
+- func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: reflection_pad3d_backward_out_cpu
+    CUDA: reflection_pad3d_backward_out_cuda
+    MPS: reflection_pad3d_backward_out_mps
+
+- func: reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
+  python_module: nn
+  structured_delegate: reflection_pad3d_backward.grad_input
+
+- func: replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: replication_pad1d_out_cpu
+    CUDA: replication_pad1d_out_cuda
+    MPS: replication_pad1d_out_mps
+
+- func: replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+  python_module: nn
+  structured_delegate: replication_pad1d.out
+
+- func: replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: replication_pad1d_backward_out_cpu
+    CUDA: replication_pad1d_backward_out_cuda
+    MPS: replication_pad1d_backward_out_mps
+
+- func: replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
+  python_module: nn
+  structured_delegate: replication_pad1d_backward.grad_input
+
+- func: replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: replication_pad2d_out_cpu
+    CUDA: replication_pad2d_out_cuda
+    MPS: replication_pad2d_out_mps
+
+- func: replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor
+  python_module: nn
+  structured_delegate: replication_pad2d.out
+  tags: core
+
+- func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: replication_pad2d_backward_out_cpu
+    CUDA: replication_pad2d_backward_out_cuda
+    MPS: replication_pad2d_backward_out_mps
+
+- func: replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: replication_pad2d_backward_cpu
+    CUDA: replication_pad2d_backward_cuda
+    MPS: replication_pad2d_backward_mps
+
+- func: replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: replication_pad3d_out_cpu
+    CUDA: replication_pad3d_out_cuda
+    MPS: replication_pad3d_out_mps
+
+- func: replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor
+  python_module: nn
+  structured_delegate: replication_pad3d.out
+  tags: core
+
+
+- func: replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: replication_pad3d_backward_out_cpu
+    CUDA: replication_pad3d_backward_out_cuda
+    MPS: replication_pad3d_backward_out_mps
+
+- func: replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: replication_pad3d_backward_cpu
+    CUDA: replication_pad3d_backward_cuda
+    MPS: replication_pad3d_backward_mps
+
+- func: _pad_circular(Tensor self, SymInt[] pad) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: _pad_circular_symint
+
+- func: _pad_enum(Tensor self, SymInt[] pad, int mode, float? value=None) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: _pad_enum_symint
+
+- func: pad(Tensor self, SymInt[] pad, str mode="constant", float? value=None) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: pad_symint
+
+- func: upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_linear1d.vec_out
+
+- func: upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_bilinear2d.vec_out
+  tags: core
+
+- func: _upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: _upsample_bilinear2d_aa.vec_out
+
+- func: upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_trilinear3d.vec_out
+
+- func: upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_bicubic2d.vec_out
+
+- func: _upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: _upsample_bicubic2d_aa.vec_out
+
+- func: upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_nearest1d.vec_out
+
+- func: _upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: _upsample_nearest_exact1d.vec_out
+
+- func: upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_nearest2d.vec_out
+  tags: core
+
+- func: _upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: _upsample_nearest_exact2d.vec_out
+
+- func: upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: upsample_nearest3d.vec_out
+
+- func: _upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: _upsample_nearest_exact3d.vec_out
+
+# NOTE: all of the non-"vec" upsample overloads are only kept for backward compatibility.
+- func: upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_linear1d_out_cpu
+    CUDA: upsample_linear1d_out_cuda
+    MPS: upsample_linear1d_out_mps
+
+- func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_linear1d.out
+
+- func: upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_linear1d_backward_out_cpu
+    CUDA: upsample_linear1d_backward_out_cuda
+    MPS: upsample_linear1d_backward_out_mps
+
+- func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_linear1d_backward.grad_input
+
+- func: upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_bilinear2d_out_cpu
+    CUDA: upsample_bilinear2d_out_cuda
+    MPS: upsample_bilinear2d_out_mps
+
+- func: upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_bilinear2d.out
+  dispatch:
+    QuantizedCPU: upsample_bilinear2d_quantized_cpu
+
+- func: upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_bilinear2d_backward_out_cpu
+    CUDA: upsample_bilinear2d_backward_out_cuda
+    MPS: upsample_bilinear2d_backward_out_mps
+
+- func: upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_bilinear2d_backward.grad_input
+
+- func: _upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_bilinear2d_aa_out_cpu
+    CUDA: _upsample_bilinear2d_aa_out_cuda
+
+- func: _upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_bilinear2d_aa.out
+
+- func: _upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_bilinear2d_aa_backward_out_cpu
+    CUDA: _upsample_bilinear2d_aa_backward_out_cuda
+
+- func: _upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_bilinear2d_aa_backward.grad_input
+
+- func: upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_bicubic2d_out_cpu
+    CUDA: upsample_bicubic2d_out_cuda
+
+- func: upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_bicubic2d.out
+
+- func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_bicubic2d_backward_out_cpu
+    CUDA: upsample_bicubic2d_backward_out_cuda
+
+- func: upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_bicubic2d_backward.grad_input
+
+- func: _upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_bicubic2d_aa_out_cpu
+    CUDA: _upsample_bicubic2d_aa_out_cuda
+
+- func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_bicubic2d_aa.out
+
+- func: _upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_bicubic2d_aa_backward_out_cpu
+    CUDA: _upsample_bicubic2d_aa_backward_out_cuda
+
+- func: _upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_bicubic2d_aa_backward.grad_input
+
+- func: upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_trilinear3d_out_cpu
+    CUDA: upsample_trilinear3d_out_cuda
+
+- func: upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_trilinear3d.out
+
+- func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_trilinear3d_backward_out_cpu
+    CUDA: upsample_trilinear3d_backward_out_cuda
+
+- func: upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_trilinear3d_backward.grad_input
+
+- func: upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_nearest1d_out_cpu
+    CUDA: upsample_nearest1d_out_cuda
+    MPS: upsample_nearest1d_out_mps
+
+- func: _upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_nearest_exact1d_out_cpu
+    CUDA: _upsample_nearest_exact1d_out_cuda
+    MPS: _upsample_nearest_exact1d_out_mps
+
+- func: upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_nearest1d.out
+
+- func: _upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_nearest_exact1d.out
+
+- func: upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_nearest1d_backward_out_cpu
+    CUDA: upsample_nearest1d_backward_out_cuda
+    MPS: upsample_nearest1d_backward_out_mps
+
+- func: _upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_nearest_exact1d_backward_out_cpu
+    CUDA: _upsample_nearest_exact1d_backward_out_cuda
+    MPS: _upsample_nearest_exact1d_backward_out_mps
+
+- func: upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_nearest1d_backward.grad_input
+
+- func: _upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_nearest_exact1d_backward.grad_input
+
+- func: upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_nearest2d_out_cpu
+    CUDA: upsample_nearest2d_out_cuda
+    MPS: upsample_nearest2d_out_mps
+
+- func: _upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_nearest_exact2d_out_cpu
+    CUDA: _upsample_nearest_exact2d_out_cuda
+    MPS: _upsample_nearest_exact2d_out_mps
+
+- func: upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_nearest2d.out
+  dispatch:
+    QuantizedCPU: upsample_nearest2d_quantized_cpu
+
+- func: _upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_nearest_exact2d.out
+  dispatch:
+    QuantizedCPU: _upsample_nearest_exact2d_quantized_cpu
+
+- func: upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_nearest2d_backward_out_cpu
+    CUDA: upsample_nearest2d_backward_out_cuda
+    MPS: upsample_nearest2d_backward_out_mps
+
+- func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_nearest_exact2d_backward_out_cpu
+    CUDA: _upsample_nearest_exact2d_backward_out_cuda
+    MPS: _upsample_nearest_exact2d_backward_out_mps
+
+- func: upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_nearest2d_backward.grad_input
+
+- func: _upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_nearest_exact2d_backward.grad_input
+
+- func: upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_nearest3d_out_cpu
+    CUDA: upsample_nearest3d_out_cuda
+
+- func: _upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_nearest_exact3d_out_cpu
+    CUDA: _upsample_nearest_exact3d_out_cuda
+
+- func: upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_nearest3d.out
+  dispatch:
+    QuantizedCPU: upsample_nearest3d_quantized_cpu
+
+- func: _upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_nearest_exact3d.out
+  dispatch:
+    QuantizedCPU: _upsample_nearest_exact3d_quantized_cpu
+
+- func: upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: upsample_nearest3d_backward_out_cpu
+    CUDA: upsample_nearest3d_backward_out_cuda
+
+- func: _upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_nearest_exact3d_backward_out_cpu
+    CUDA: _upsample_nearest_exact3d_backward_out_cuda
+
+- func: upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: upsample_nearest3d_backward.grad_input
+
+- func: _upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_nearest_exact3d_backward.grad_input
+
+- func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: sigmoid_backward_out
+    MPS: sigmoid_backward_out_mps
+  tags: pointwise
+
+- func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
+  python_module: nn
+  structured_delegate: sigmoid_backward.grad_input
+  tags: pointwise
+
+- func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: logit_backward_out
+    MPS: logit_backward_out_mps
+  tags: pointwise
+
+- func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
+  python_module: nn
+  structured_delegate: logit_backward.grad_input
+  tags: pointwise
+
+- func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: tanh_backward_out
+    MPS: tanh_backward_out_mps
+  tags: pointwise
+
+- func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor
+  python_module: nn
+  structured_delegate: tanh_backward.grad_input
+
+# What's a thnn_conv_ versus a slow_conv_?
+#
+# Historically, we have inefficient implementations of convolutions
+# coming from the THNN/THCUNN library.  These convolutions typically
+# operated by computing the Toeplitz matrix and then doing a matrix
+# multiply with the input; this is very memory inefficient!  However,
+# occasionally, we really don't have anything better, so it's helpful
+# to have these fallbacks when there is no more optimized implementation
+# in cudnn or mkldnn, etc.  Both thnn_ and slow_ convolutions fall
+# into this bucket.
+#
+# The difference between these two designations, is that thnn_ refers
+# to a convolution that is still written in the "legacy" style; that is,
+# C code in the THNN/ or THCUNN/ directory.  A slow_ convolution is
+# one that is written in the native style: modern C++.  Algorithmically,
+# these are the same thing, but we give them different prefixes to
+# make the operational distinction clear.
+  tags: pointwise
+
+- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: slow_conv_transpose2d_structured_cpu
+    CUDA: slow_conv_transpose2d_structured_cuda
+
+- func: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor
+  python_module: nn
+  structured_delegate: slow_conv_transpose2d.out
+
+- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: slow_conv_transpose3d_out_cpu
+    CUDA: slow_conv_transpose3d_out_cuda
+
+- func: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: slow_conv_transpose3d_cpu
+    CUDA: slow_conv_transpose3d_cuda
+
+- func: thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+
+- func: thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor
+  python_module: nn
+
+- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: slow_conv2d_forward_out_cpu
+    CUDA: slow_conv2d_forward_out_cuda
+
+- func: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: slow_conv2d_forward_cpu
+    CUDA: slow_conv2d_forward_cuda
+
+- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+  python_module: nn
+  dispatch:
+    CPU: slow_conv2d_backward_out_cpu
+    CUDA: slow_conv2d_backward_out_cuda
+
+- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
+  python_module: nn
+  dispatch:
+    CPU: slow_conv2d_backward_cpu
+    CUDA: slow_conv2d_backward_cuda
+  autogen: _slow_conv2d_backward.output_mask_out
+
+- func: _conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  python_module: nn
+  dispatch:
+    CUDA: conv_depthwise2d_cuda_out
+
+- func: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor
+  python_module: nn
+  dispatch:
+    CUDA: conv_depthwise2d_cuda
+
+- func: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor
+  python_module: nn
+  dispatch:
+    CUDA: conv_depthwise3d_cuda
+  autogen: conv_depthwise3d.out
+
+- func: slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+
+- func: slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor
+  python_module: nn
+
+- func: slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: slow_conv3d_forward_out_cpu
+
+- func: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: slow_conv3d_forward_cpu
+
+- func: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: slow_conv_dilated2d_cpu
+    CUDA: slow_conv_dilated2d_cuda
+  autogen: slow_conv_dilated2d.out
+
+- func: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: slow_conv_dilated3d_cpu
+    CUDA: slow_conv_dilated3d_cuda
+  autogen: slow_conv_dilated3d.out
+
+- func: col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: col2im_out_cpu
+    CUDA: col2im_out_cuda
+
+- func: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: col2im_cpu
+    CUDA: col2im_cuda
+  tags: core
+
+- func: column_stack(Tensor[] tensors) -> Tensor
+
+- func: column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  dispatch:
+    CPU: im2col_out_cpu
+    CUDA: im2col_out_cuda
+
+- func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: im2col_cpu
+    CUDA: im2col_cuda
+
+- func: isfinite(Tensor self) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+
+- func: isinf(Tensor self) -> Tensor
+  variants: function, method
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: isinf
+    SparseCPU, SparseCUDA: isinf_sparse
+    SparseMeta: isinf_sparse_meta
+    SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr
+  autogen: isinf.out
+  tags: [core, pointwise]
+
+- func: record_stream(Tensor(a!) self, Stream s) -> ()
+  variants: method
+  dispatch:
+    CUDA: record_stream_cuda
+
+- func: isposinf(Tensor self) -> Tensor
+  variants: function, method
+  structured_delegate: isposinf.out
+  dispatch:
+    SparseCPU, SparseCUDA: isposinf_sparse
+    SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr
+  tags: pointwise
+
+- func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: isposinf_out
+    SparseCPU, SparseCUDA: isposinf_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out
+  tags: pointwise
+
+- func: isneginf(Tensor self) -> Tensor
+  variants: function, method
+  structured_delegate: isneginf.out
+  dispatch:
+    SparseCPU, SparseCUDA: isneginf_sparse
+    SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr
+  tags: pointwise
+
+- func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: isneginf_out
+    SparseCPU, SparseCUDA: isneginf_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out
+  tags: pointwise
+
+# NOTE [_add_batch_dim and _remove_batch_dim]
+# _add_batch_dim and _remove_batch_dim are meant to be used in the implementation
+# of the vmap frontend API (see torch/_vmap_internals.py). They are not
+# user-facing, hence the leading underscore. Please don't use them them anywhere else.
+- func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor
+  variants: function
+
+# See NOTE [_add_batch_dim and _remove_batch_dim]
+- func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor
+  variants: function
+
+## Functions related to the `torch.special` namespace
+# Note [special namespace binding]
+# Functions in the special python module should have their names start with
+#   "special_" underscore and be bound to the desired Python name in
+#   torch/special/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/special.h.
+#   The "special_" names should be hidden from the user and not documented.
+
+- func: special_entr(Tensor self) -> Tensor
+  structured_delegate: special_entr.out
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: special
+  variants: function
+  dispatch:
+    CPU, CUDA: special_entr_out
+  tags: pointwise
+
+- func: special_ndtri(Tensor self) -> Tensor
+  structured_delegate: special_ndtri.out
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: special
+  variants: function
+  dispatch:
+    CPU, CUDA: special_ndtri_out
+  tags: pointwise
+
+- func: special_log_ndtr(Tensor self) -> Tensor
+  structured_delegate: special_log_ndtr.out
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: special
+  variants: function
+  dispatch:
+    CPU, CUDA: special_log_ndtr_out
+  tags: pointwise
+
+- func: special_expm1(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_exp2(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_psi(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_digamma(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_gammaln(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_erf(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_erfc(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+
+- func: special_erfcx(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+  structured_delegate: special_erfcx.out
+  tags: pointwise
+
+- func: special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: special_erfcx_out
+  tags: pointwise
+
+- func: special_erfinv(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+
+- func: special_ndtr(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_xlog1py(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  structured_delegate: special_xlog1py.out
+  tags: pointwise
+
+- func: special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_xlog1py
+  tags: pointwise
+
+- func: special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_xlog1py
+  tags: pointwise
+
+- func: special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: special
+  variants: function
+  dispatch:
+    CPU, CUDA: special_xlog1py_out
+  tags: pointwise
+
+- func: special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_xlog1py_out
+  tags: pointwise
+
+- func: special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_xlog1py_out
+  tags: pointwise
+
+- func: special_xlogy(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+
+- func: special_xlogy.self_scalar(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+
+- func: special_xlogy.other_scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+
+- func: special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+
+- func: special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+
+- func: special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+
+- func: special_zeta(Tensor self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  structured_delegate: special_zeta.out
+  tags: pointwise
+
+- func: special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_zeta
+  tags: pointwise
+
+- func: special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_zeta
+  tags: pointwise
+
+- func: special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
+  python_module: special
+  variants: function
+  dispatch:
+    CPU, CUDA: special_zeta_out
+  tags: pointwise
+
+- func: special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_zeta_out
+  tags: pointwise
+
+- func: special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck   # TensorIterator
+  python_module: special
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: special_zeta_out
+  tags: pointwise
+
+- func: special_i0(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_i0e(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+  structured_delegate: special_i0e.out
+  tags: pointwise
+
+- func: special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: special_i0e_out
+  tags: pointwise
+
+- func: special_i1(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+  structured_delegate: special_i1.out
+  tags: pointwise
+
+- func: special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: special_i1_out
+  tags: pointwise
+
+- func: special_i1e(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+  structured_delegate: special_i1e.out
+  tags: pointwise
+
+- func: special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  structured: True
+  structured_inherits: TensorIteratorBase
+  dispatch:
+    CPU, CUDA: special_i1e_out
+  tags: pointwise
+
+- func: special_logit(Tensor self, float? eps=None) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+
+- func: special_polygamma(int n, Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+
+- func: special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+
+- func: special_expit(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_sinc(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_round(Tensor self, *, int decimals=0) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_log1p(Tensor self) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_gammainc(Tensor self, Tensor other) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_gammaincc(Tensor self, Tensor other) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_multigammaln(Tensor self, int p) -> Tensor
+  python_module: special
+  variants: function
+
+- func: special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: special
+  variants: function
+
+- func: special_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  python_module: special
+  variants: function
+
+## Functions related to the fast Fourier transform and the torch.fft namespace
+# Note [FFT namespace binding]
+# Functions in the fft python module should have their names start with
+#   "fft_" underscore and be bound to the desired Python name in
+#   torch/fft/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/fft.h.
+#   The "fft_" names should be hidden from the user and not documented.
+#
+# See fft_fft as an example.
+
+# torch.fft.fft
+# NOTE: NOT an alias for torch.fft, which has different semantics
+- func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_fft_symint
+
+- func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_fft_symint_out
+
+- func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ifft_symint
+
+- func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ifft_symint_out
+
+- func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_rfft_symint
+
+- func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_rfft_symint_out
+
+- func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_irfft_symint
+
+- func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_irfft_symint_out
+
+- func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_hfft_symint
+
+- func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_hfft_symint_out
+
+- func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ihfft_symint
+
+- func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ihfft_symint_out
+
+- func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_fft2_symint
+
+- func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_fft2_symint_out
+
+- func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ifft2_symint
+
+- func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ifft2_symint_out
+
+- func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_rfft2_symint
+
+- func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_rfft2_symint_out
+
+- func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_irfft2_symint
+
+- func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_irfft2_symint_out
+
+- func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_hfft2_symint
+
+- func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_hfft2_symint_out
+
+- func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ihfft2_symint
+
+- func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ihfft2_symint_out
+
+- func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_fftn_symint
+
+- func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_fftn_symint_out
+
+- func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ifftn_symint
+
+- func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ifftn_symint_out
+
+- func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_rfftn_symint
+
+- func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_rfftn_symint_out
+
+- func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_irfftn_symint
+
+- func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_irfftn_symint_out
+
+- func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_hfftn_symint
+
+- func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_hfftn_symint_out
+
+- func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ihfftn_symint
+
+- func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeImplicitAutograd: fft_ihfftn_symint_out
+
+- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fft_fftfreq
+
+- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fft_fftfreq_out
+
+- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fft_rfftfreq
+
+- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: fft
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fft_rfftfreq_out
+
+- func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor
+  python_module: fft
+  variants: function
+
+- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor
+  python_module: fft
+  variants: function
+
+## Functions for linear algebra and the torch.linalg namespace
+# Note [linalg namespace binding]
+# Functions in the linalg python module should have their names start with
+#   "linalg_" and be bound to the desired Python name in
+#   torch/linalg/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/linalg.h.
+#   The "linalg_" names should be hidden from the user and not documented.
+#
+# See linalg_det as an example.
+
+# "_ex" stands for experimental
+- func: linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)
+  python_module: linalg
+  structured_delegate: linalg_cholesky_ex.L
+
+- func: linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)
+  python_module: linalg
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_cholesky_ex_out
+
+- func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor
+  python_module: linalg
+
+- func: linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor
+  python_module: linalg
+  variants: function
+  structured_delegate: linalg_cross.out
+  dispatch:
+    ZeroTensor: linalg_cross_zerotensor
+
+- func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  structured: True
+  dispatch:
+    CPU, CUDA, MPS: linalg_cross_out
+
+# linalg.lu_factor
+- func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots)
+  python_module: linalg
+  variants: function
+
+- func: linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots)
+  python_module: linalg
+  variants: function
+
+- func: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info)
+  python_module: linalg
+  structured_delegate: linalg_lu_factor_ex.out
+  variants: function
+
+- func: linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info)
+  python_module: linalg
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_lu_factor_ex_out
+
+# linalg.lu
+- func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)
+  python_module: linalg
+  structured_delegate: linalg_lu.out
+  variants: function
+
+- func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+  python_module: linalg
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_lu_out
+
+# linalg.lu_solve
+- func: linalg_lu_solve(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False) -> Tensor
+  python_module: linalg
+  structured_delegate: linalg_lu_solve.out
+  variants: function
+
+- func: linalg_lu_solve.out(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_lu_solve_out
+
+# linalg.det
+- func: _linalg_det(Tensor A) -> (Tensor result, Tensor LU, Tensor pivots)
+  structured_delegate: _linalg_det.result
+
+- func: _linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots)
+  structured: True
+  dispatch:
+    CPU, CUDA: _linalg_det_out
+
+- func: linalg_det(Tensor A) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+# torch.det, alias for torch.linalg.det
+- func: det(Tensor self) -> Tensor
+  variants: function, method
+
+- func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info)
+  structured_delegate: linalg_ldl_factor_ex.out
+  python_module: linalg
+  variants: function
+
+- func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info)
+  structured: True
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_ldl_factor_ex_out
+
+- func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots)
+  python_module: linalg
+  variants: function
+
+- func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
+  python_module: linalg
+  variants: function
+
+- func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor
+  structured_delegate: linalg_ldl_solve.out
+  python_module: linalg
+  variants: function
+
+- func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_ldl_solve_out
+
+- func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
+  python_module: linalg
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: linalg_lstsq
+  tags: dynamic_output_shape
+
+- func: linalg_lstsq.out(Tensor self, Tensor b, float? rcond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_lstsq_out
+  tags: dynamic_output_shape
+
+# torch.linalg.matmul, alias for torch.matmul
+- func: linalg_matmul(Tensor self, Tensor other) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_vecdot(Tensor x, Tensor y, *, int dim=-1) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_vecdot.out(Tensor x, Tensor y, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_matrix_exp(Tensor self) -> Tensor
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_matrix_exp
+  autogen: linalg_matrix_exp.out
+
+- func: _linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet, Tensor LU, Tensor pivots)
+  structured_delegate: _linalg_slogdet.sign
+
+- func: _linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots)
+  structured: True
+  dispatch:
+    CPU, CUDA: _linalg_slogdet_out
+
+- func: linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet)
+  python_module: linalg
+
+- func: linalg_slogdet.out(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)
+  python_module: linalg
+
+- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
+  variants: function, method
+
+- func: slogdet.out(Tensor self, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)
+  variants: function
+
+- func: logdet(Tensor self) -> Tensor
+  variants: function, method
+
+- func: linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors)
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_eig
+
+- func: linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+  python_module: linalg
+  dispatch:
+    CPU, CUDA: linalg_eig_out
+
+- func: _linalg_eigvals(Tensor self) -> Tensor
+  python_module: linalg
+  dispatch:
+    CPU, CUDA: _linalg_eigvals
+
+- func: linalg_eigvals(Tensor self) -> Tensor
+  python_module: linalg
+
+- func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  dispatch:
+    CPU, CUDA: linalg_eigvals_out
+
+# This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and
+# `linalg.eigvalsh` as composite functions that call this one
+- func: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)
+  structured_delegate: _linalg_eigh.eigenvalues
+
+- func: _linalg_eigh.eigenvalues(Tensor A, str UPLO="L", bool compute_v=True, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+  structured: True
+  dispatch:
+    CPU, CUDA: _linalg_eigh_out
+
+- func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors)
+  python_module: linalg
+
+- func: linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+  python_module: linalg
+
+- func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor
+  python_module: linalg
+
+- func: linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_householder_product(Tensor input, Tensor tau) -> Tensor
+  python_module: linalg
+  variants: function
+  dispatch:
+    CPU, CUDA: linalg_householder_product
+
+- func: linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  dispatch:
+    CPU, CUDA: linalg_householder_product_out
+
+- func: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
+  python_module: linalg
+  structured_delegate: linalg_inv_ex.inverse
+
+- func: linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info)
+  python_module: linalg
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_inv_ex_out
+    MPS: linalg_inv_ex_out_mps
+
+- func: linalg_inv(Tensor A) -> Tensor
+  python_module: linalg
+
+- func: linalg_inv.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: inverse(Tensor self) -> Tensor
+  variants: function, method
+
+- func: inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: inner(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+
+- func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: outer(Tensor self, Tensor vec2) -> Tensor
+  variants: function, method
+
+- func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+
+# torch.ger, alias for torch.outer
+- func: ger(Tensor self, Tensor vec2) -> Tensor
+  variants: function, method
+
+- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  python_module: linalg
+  variants: function
+  structured_delegate: linalg_vector_norm.out
+
+- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_vector_norm_out
+    MPS: linalg_vector_norm_out_mps
+
+- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  python_module: linalg
+
+- func: linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_matrix_norm.str_ord(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  python_module: linalg
+
+- func: linalg_matrix_norm.str_ord_out(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+# This function is exposes the `compute_uv` flag, which is then used to implement `linalg.svd` and
+# `linalg.svdvals` as composite functions that call this one
+- func: _linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)
+  variants: function
+  structured_delegate: _linalg_svd.U
+
+- func: _linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+  structured: True
+  dispatch:
+    CPU, CUDA: _linalg_svd_out
+
+- func: linalg_svd(Tensor A, bool full_matrices=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)
+  python_module: linalg
+  variants: function
+
+- func: linalg_svd.U(Tensor A, bool full_matrices=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+  python_module: linalg
+  variants: function
+
+- func: linalg_svdvals(Tensor A, *, str? driver=None) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_svdvals.out(Tensor A, *, str? driver=None, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_cond.p_str(Tensor self, str p) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_pinv.atol_rtol_tensor(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor
+  python_module: linalg
+  variants: function
+  dispatch:
+    # calls svd, which calls mH() (view op)
+    # also calls narrow()
+    CompositeExplicitAutogradNonFunctional: linalg_pinv
+
+- func: linalg_pinv.atol_rtol_tensor_out(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: linalg_pinv_out
+
+- func: linalg_pinv.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor
+  cpp_no_default_args: ['atol', 'rtol']
+  python_module: linalg
+  variants: function
+
+- func: linalg_pinv.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+  cpp_no_default_args: ['atol', 'rtol']
+  python_module: linalg
+  variants: function
+
+- func: linalg_pinv(Tensor self, float rcond, bool hermitian=False) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_pinv.rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_pinv.out(Tensor self, float rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: _linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info)
+  structured_delegate: _linalg_solve_ex.result
+
+- func: _linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info)
+  structured: True
+  dispatch:
+    CPU, CUDA: _linalg_solve_ex_out
+
+- func: linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info)
+  python_module: linalg
+
+- func: linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info)
+  python_module: linalg
+
+- func: linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor
+  python_module: linalg
+
+- func: linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_tensorinv(Tensor self, int ind=2) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_tensorinv.out(Tensor self, int ind=2, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)
+  python_module: linalg
+  variants: function
+  structured_delegate: linalg_qr.out
+
+- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+  python_module: linalg
+  structured: True
+  dispatch:
+    CPU, CUDA: linalg_qr_out
+
+- func: linalg_matrix_power(Tensor self, int n) -> Tensor
+  python_module: linalg
+
+- func: linalg_matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+- func: linalg_matrix_rank.atol_rtol_tensor(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank.atol_rtol_tensor_out(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor
+  cpp_no_default_args: ['atol', 'rtol']
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+  cpp_no_default_args: ['atol', 'rtol']
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank(Tensor self, float tol, bool hermitian=False) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank.out(Tensor self, float tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank.tol_tensor(Tensor input, Tensor tol, bool hermitian=False) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_matrix_rank.out_tol_tensor(Tensor input, Tensor tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+  variants: function
+
+- func: linalg_multi_dot(Tensor[] tensors) -> Tensor
+  python_module: linalg
+
+- func: linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
+## Functions related to the `torch.nested` namespace
+# Note [nested namespace binding]
+# Functions in the nested python module should have their names start with
+#   "nested_" underscore and be bound to the desired Python name in
+#   torch/nested/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/nested.h.
+#   The "nested_" names should be hidden from the user and not documented.
+
+- func: nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
+  python_module: nested
+  variants: function
+
+## Functions that are only for testing
+# It is undocumented and should not be used outside of tests.
+- func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
+
+# Note: for testing COW materialization within `at::parallel_for` loop function
+- func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _test_parallel_materialize
+
+# Note: this function is only for testing.
+- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: _test_optional_intlist
+  autogen: _test_optional_intlist.out
+
+# Note: this function is only for testing.
+- func: _test_optional_filled_intlist(Tensor values, int[2]? addends) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: _test_optional_intlist
+  autogen: _test_optional_filled_intlist.out
+
+# Note: this function is only for testing.
+- func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: _test_optional_floatlist
+  autogen: _test_optional_floatlist.out
+
+# Note: this function is only for testing.
+- func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor
+  python_module: nn
+
+# Note: this function is only for testing.
+- func: _test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor
+  python_module: nn
+
+# Note: this function is only for testing.
+- func: _test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor
+  cpp_no_default_args: ['a', 'b']
+  python_module: nn
+
+# Note: this function is only for testing.
+- func: _test_warn_in_autograd(Tensor self) -> Tensor
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: _test_warn_in_autograd
+  autogen: _test_warn_in_autograd.out
+
+# Note: this function is only for testing.
+- func: _test_autograd_multiple_dispatch.fullcoverage(Tensor self) -> Tensor
+  dispatch:
+    # the NestedTensor keys are necessary because NestedTensor has been removed
+    # from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
+    CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
+  autogen: _test_autograd_multiple_dispatch.fullcoverage_out
+
+# Note: this function is only for testing.
+- func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
+
+# Note: this function is only for testing.
+- func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
+  dispatch:
+    CompositeExplicitAutograd: _test_autograd_multiple_dispatch_view
+
+# Note: this function is only for testing.
+- func: _test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _test_autograd_multiple_dispatch_view_copy
+  tags: view_copy
+  autogen: _test_autograd_multiple_dispatch_view_copy.out
+
+- func: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: segment_reduce_kernel
+  autogen: segment_reduce.out
+
+- func: _segment_reduce_backward(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA: _segment_reduce_backward_kernel
+  autogen: _segment_reduce_backward.out
+
+- func: pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor
+  python_module: nn
+  variants: function
+
+- func: flatten_dense_tensors(Tensor[] tensors) -> Tensor
+  variants: function
+  python_module: nn
+
+- func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]
+  variants: function
+  python_module: nn
+
+- func: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _nested_tensor_from_tensor_list
+  autogen: _nested_tensor_from_tensor_list.out
+
+- func: _fw_primal_copy(Tensor self, int level) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _fw_primal_copy
+  tags: view_copy
+  autogen: _fw_primal_copy.out
+
+- func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _make_dual_copy
+  tags: view_copy
+  autogen: _make_dual_copy.out
+
+- func: view_as_real_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: view_as_real_copy
+  tags: view_copy
+  autogen: view_as_real_copy.out
+
+- func: view_as_complex_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: view_as_complex_copy
+  tags: view_copy
+  autogen: view_as_complex_copy.out
+
+- func: _conj_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _conj_copy
+  tags: view_copy
+  autogen: _conj_copy.out
+
+- func: _neg_view_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _neg_view_copy
+  tags: view_copy
+  autogen: _neg_view_copy.out
+
+- func: as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: as_strided_copy_symint
+  tags: view_copy
+  autogen: as_strided_copy.out
+
+- func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _sparse_broadcast_to_copy
+  tags: view_copy
+  autogen: _sparse_broadcast_to_copy.out
+
+- func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: diagonal_copy
+  tags: view_copy
+  autogen: diagonal_copy.out
+
+- func: expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: expand_copy_symint
+  tags: view_copy
+  autogen: expand_copy.out
+
+- func: permute_copy(Tensor self, int[] dims) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: permute_copy
+  tags: view_copy
+  autogen: permute_copy.out
+
+- func: _reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _reshape_alias_copy_symint
+  tags: view_copy
+  autogen: _reshape_alias_copy.out
+
+- func: select_copy.int(Tensor self, int dim, SymInt index) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: select_copy_symint
+    SparseCsrCPU, SparseCsrCUDA: select_copy_sparse_csr
+  tags: view_copy
+  autogen: select_copy.int_out
+
+- func: detach_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: detach_copy
+  tags: view_copy
+  autogen: detach_copy.out
+
+- func: slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: slice_copy_Tensor_symint
+  tags: view_copy
+  autogen: slice_copy.Tensor_out
+
+- func: split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: split_copy_Tensor_symint
+  tags: view_copy
+
+- func: split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: split_with_sizes_copy_symint
+  tags: view_copy
+
+- func: squeeze_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: squeeze_copy
+  tags: view_copy
+  autogen: squeeze_copy.out
+
+- func: squeeze_copy.dim(Tensor self, int dim) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: squeeze_copy_dim
+  tags: view_copy
+  autogen: squeeze_copy.dim_out
+
+- func: squeeze_copy.dims(Tensor self, int[] dim) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: squeeze_copy_dims
+  tags: view_copy
+  autogen: squeeze_copy.dims_out
+
+- func: t_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: t_copy
+  tags: view_copy
+  autogen: t_copy.out
+
+- func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: transpose_copy_int
+  tags: view_copy
+  autogen: transpose_copy.int_out
+
+- func: unsqueeze_copy(Tensor self, int dim) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: unsqueeze_copy
+  tags: view_copy
+  autogen: unsqueeze_copy.out
+
+- func: _indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _indices_copy
+  tags: view_copy
+  autogen: _indices_copy.out
+
+- func: _values_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _values_copy
+  tags: view_copy
+  autogen: _values_copy.out
+
+- func: indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: indices_copy
+  tags: view_copy
+  autogen: indices_copy.out
+
+- func: values_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: values_copy
+  tags: view_copy
+  autogen: values_copy.out
+
+- func: crow_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: crow_indices_copy
+  tags: view_copy
+  autogen: crow_indices_copy.out
+
+- func: col_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: col_indices_copy
+  tags: view_copy
+  autogen: col_indices_copy.out
+
+- func: ccol_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: ccol_indices_copy
+  tags: view_copy
+  autogen: ccol_indices_copy.out
+
+- func: row_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: row_indices_copy
+  tags: view_copy
+  autogen: row_indices_copy.out
+
+- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[]
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: unbind_copy_int
+  tags: view_copy
+
+- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: unbind_copy_int_out
+
+- func: split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: split_copy_Tensor_out
+
+
+- func: split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: split_with_sizes_copy_out
+    CUDA: split_with_sizes_copy_out_cuda
+
+- func: view_copy(Tensor self, SymInt[] size) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: view_copy_symint
+  tags: view_copy
+  autogen: view_copy.out
+
+- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: view_copy_dtype
+  tags: view_copy
+  autogen: view_copy.dtype_out
+
+- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: unfold_copy
+  tags: view_copy
+  autogen: unfold_copy.out
+
+- func: alias_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: alias_copy
+  tags: view_copy
+  autogen: alias_copy.out
+
+- func: to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorCPU: NestedTensor_to_padded_tensor_generic
+    NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
+  autogen: to_padded_tensor.out
+
+- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
+  dispatch:
+    NestedTensorCPU: NestedTensor_softmax_dropout
+    NestedTensorCUDA: NestedTensor_softmax_dropout_cuda
+  tags: nondeterministic_seeded
+
+# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
+- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
+  autogen: _transformer_encoder_layer_fwd.out
+
+- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU, NestedTensorCPU: native_multi_head_attention_cpu
+    CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
+  autogen: _native_multi_head_attention.out
+
+- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor
+  python_module: nn
+  variants: function
+  autogen: scaled_dot_product_attention.out
+  tags: nondeterministic_seeded
+
+# This aten function is kept so that we can test the choice function from Python
+- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int
+  dispatch:
+    Meta: _fused_sdp_choice_meta
+    CPU, NestedTensorCPU: _fused_sdp_choice_cpp
+    CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
+  tags: nondeterministic_seeded
+
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
+  variants: function
+  tags: nondeterministic_seeded
+
+- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+  dispatch:
+    CUDA: _scaled_dot_product_flash_attention_cuda
+    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
+  tags: nondeterministic_seeded
+
+- func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
+  dispatch:
+    CPU: _scaled_dot_product_flash_attention_cpu
+  tags: nondeterministic_seeded
+
+- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _scaled_dot_product_flash_attention_backward_cuda
+    NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested
+
+- func: _scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: _scaled_dot_product_flash_attention_cpu_backward
+
+- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
+  dispatch:
+    CUDA: _scaled_dot_product_efficient_attention_cuda
+    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
+  tags: nondeterministic_seeded
+
+- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)
+  device_check: NoCheck
+  dispatch:
+    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
+  tags: nondeterministic_seeded
+
+- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
+  dispatch:
+    CUDA: _scaled_dot_product_cudnn_attention_cuda
+  tags: nondeterministic_seeded
+
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+  variants: function
+  dispatch:
+    CUDA: _flash_attention_forward
+  tags: nondeterministic_seeded
+
+- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _flash_attention_backward
+
+# Returns output, logsumexp if compute_logsumexp
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_forward
+  tags: nondeterministic_seeded
+
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_backward
+
+- func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: triton_scaled_dot_attention
+  tags: nondeterministic_seeded
+  autogen: _triton_scaled_dot_attention.out
+
+- func: _fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CUDA: _fill_mem_eff_dropout_mask_
+  tags: nondeterministic_seeded
+
+- func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: triton_multi_head_attention
+  autogen: _triton_multi_head_attention.out
+
+- func: special_airy_ai(Tensor x) -> Tensor
+  python_module: special
+  structured_delegate: special_airy_ai.out
+  variants: function
+  tags: pointwise
+
+- func: special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_airy_ai_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_j0(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_bessel_j0.out
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_bessel_j0_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_j1(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_bessel_j1.out
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_bessel_j1_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_y0(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_bessel_y0.out
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_bessel_y0_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_y1(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_bessel_y1.out
+  variants: function
+  tags: pointwise
+
+- func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_bessel_y1_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_chebyshev_polynomial_t.out
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_chebyshev_polynomial_t_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_t_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_chebyshev_polynomial_u.out
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_chebyshev_polynomial_u_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_u_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_chebyshev_polynomial_v.out
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_chebyshev_polynomial_v_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_v_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_chebyshev_polynomial_w.out
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_chebyshev_polynomial_w_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_chebyshev_polynomial_w_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_hermite_polynomial_h.out
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_hermite_polynomial_h_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_h_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_hermite_polynomial_he.out
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_hermite_polynomial_he_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_hermite_polynomial_he_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_laguerre_polynomial_l.out
+  variants: function
+  tags: pointwise
+
+- func: special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_laguerre_polynomial_l_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_laguerre_polynomial_l_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_legendre_polynomial_p.out
+  variants: function
+  tags: pointwise
+
+- func: special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_legendre_polynomial_p_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_legendre_polynomial_p_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_i0(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_modified_bessel_i0.out
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_modified_bessel_i0_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_i1(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_modified_bessel_i1.out
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_modified_bessel_i1_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_k0(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_modified_bessel_k0.out
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_modified_bessel_k0_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_k1(Tensor self) -> Tensor
+  python_module: special
+  structured_delegate: special_modified_bessel_k1.out
+  variants: function
+  tags: pointwise
+
+- func: special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_modified_bessel_k1_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_scaled_modified_bessel_k0(Tensor x) -> Tensor
+  python_module: special
+  structured_delegate: special_scaled_modified_bessel_k0.out
+  variants: function
+  tags: pointwise
+
+- func: special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_scaled_modified_bessel_k0_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_scaled_modified_bessel_k1(Tensor x) -> Tensor
+  python_module: special
+  structured_delegate: special_scaled_modified_bessel_k1.out
+  variants: function
+  tags: pointwise
+
+- func: special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_scaled_modified_bessel_k1_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_shifted_chebyshev_polynomial_t.out
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_shifted_chebyshev_polynomial_t_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_t_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_shifted_chebyshev_polynomial_u.out
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_shifted_chebyshev_polynomial_u_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_u_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_shifted_chebyshev_polynomial_v.out
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_shifted_chebyshev_polynomial_v_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_v_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
+  device_check: NoCheck
+  python_module: special
+  structured_delegate: special_shifted_chebyshev_polynomial_w.out
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: special_shifted_chebyshev_polynomial_w_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CompositeExplicitAutograd: special_shifted_chebyshev_polynomial_w_out
+  device_check: NoCheck
+  python_module: special
+  variants: function
+  tags: pointwise
+
+- func: special_spherical_bessel_j0(Tensor x) -> Tensor
+  python_module: special
+  structured_delegate: special_spherical_bessel_j0.out
+  variants: function
+  tags: pointwise
+
+- func: special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CPU, CUDA: special_spherical_bessel_j0_out
+  python_module: special
+  structured_inherits: TensorIteratorBase
+  structured: True
+  variants: function
+  tags: pointwise
+
+# Aux function used in the test TestPythonDispatch.test_kwarg_only_and_positional_default
+# within test/test_python_dispatch.py
+- func: _foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor
+  dispatch:
+    CPU: foobar
+  autogen: _foobar.out
+
+# Fused Optimizer CUDA kernels.
+- func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  variants: function
+  dispatch:
+    CUDA: _fused_adam_kernel_cuda_
+  autogen: _fused_adam, _fused_adam.out
+
+- func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+  # but still skip the device check as the Tensor LR can be on CPU
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _fused_adam_kernel_cuda_
+  autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
+
+- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  variants: function
+  dispatch:
+    CUDA: _fused_adamw_kernel_cuda_
+  autogen: _fused_adamw, _fused_adamw.out
+
+- func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+  # but still skip the device check as the Tensor LR can be on CPU
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _fused_adamw_kernel_cuda_
+  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
+
+- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  variants: function
+  dispatch:
+    CUDA: _fused_sgd_kernel_cuda_
+  autogen: _fused_sgd, _fused_sgd.out
+
+- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  # but still skip the device check as the Tensor LR can be on CPU
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CUDA: _fused_sgd_kernel_cuda_
+  autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
+
+# This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
+- func: _propagate_xla_data(Tensor input, Tensor output) -> ()
+  variants: function
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/native/tags.yaml b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/native/tags.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c31721729036ff14989f3053d22400d9d100558a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/native/tags.yaml
@@ -0,0 +1,65 @@
+# This yaml file contains all the possible tags that can be defined in `tags` in `native_functions.yaml`
+
+- tag: inplace_view
+  desc: |
+          This tag indicates if an operator *only* modifies the tensor metadata
+- tag: pt2_compliant_tag
+  desc: |
+          This tag indicates if the operator is guaranteed to
+          work with the PT2 compilation APIs (torch.compile,
+          torch.export, etc). If you add this tag to an
+          operator, please use
+          `torch.testing._internal.optest.opcheck` to test that
+          the operator has been registered correctly and
+          works with torch.compile
+- tag: view_copy
+  desc: |
+          This tag indicates operators that are *_copy* variants
+          of view/aliasing operators. If an operator has a view_copy tag,
+          then it should have the name {op}_copy, where {op} is a view operator.
+- tag: dynamic_output_shape
+  desc: |
+          This tag indicates if an operator's output's shape depends on input Tensor
+          data.
+- tag: data_dependent_output
+  desc: |
+          Operator has a non-Tensor output whose value is dependent on the data
+          of Tensor inputs.  Among other things, this implies that this operator
+          cannot be run with meta tensor (since data is not available), nor
+          can it be symbolically traced.
+- tag: generated
+  desc: |
+          This tag indicates that the operator doesn't have an explicit entry in
+          native_functions.yaml, and instead was generated automatically by the codegen.
+- tag: nondeterministic_seeded
+  desc: |
+          This tag indicates if an operator is nondeterministically seeded
+          (i.e., is random) such that the operator intentionally produces
+          different results when run twice on the same inputs, but this randomness
+          is controlled by a Generator which, if reseeded would give you the
+          same result.
+- tag: nondeterministic_bitwise
+  desc: |
+          This tag indicates if an operator doesn't guarantee bitwise equivalence
+          across different runs of an operator with identical inputs.
+- tag: needs_fixed_stride_order
+  desc: |
+          This tag indicates that the operator should be passed Tensors following
+          the same stride permutation as observed in eager when compiled in inductor.
+
+# NOTE [Core ATen Ops]
+- tag: core
+  desc: |
+          Core aten ops is a subset of aten ops that remains after aten-to-aten decomposition and
+          functionalization pass. Core aten ops are fully functional and adhere to single static
+          assignment (SSA): this implies there will be no `inplace` or `_out` variants in this opset.
+          This opset is designed to serve as the functional IR to interface with compiler backends.
+          In contrast to primTorch, core aten opset doesn't decompose ops into explicit
+          type promotion and broadcasting ops.
+          Core aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True),
+          and thus can be used as an opset for export purpose.
+- tag: pointwise
+  desc: |
+          Pointwise operators are operators where each element of the output is computed only by accessing
+          the corresponding element of all the broadcasted inputs. The output shape will be the broadcasted
+          shape of the inputs.
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/CompositeViewCopyKernels.cpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/CompositeViewCopyKernels.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47097d7aa4320674bec4bddbb5ac861309334f0c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/CompositeViewCopyKernels.cpp
@@ -0,0 +1,73 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+// ${generated_comment}
+
+#include <ATen/InferSize.h>
+#include <ATen/Tensor.h>
+#include <ATen/native/Resize.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Operators.h>
+#else
+#include <ATen/ops/clone.h>
+$ops_headers
+#endif
+
+namespace at {
+namespace native {
+
+// This file contains a number of kernels for aten functions that are fully code-generated.
+// TODO: rename this file to something more generic.
+
+namespace {
+at::Tensor clone_arg(const at::Tensor& t) {
+    return t.clone();
+}
+
+std::vector<at::Tensor> clone_arg(const at::TensorList& t_list) {
+    std::vector<at::Tensor> out(t_list.size());
+    for (const auto& i : c10::irange(t_list.size())) {
+        out[i] = t_list[i].clone();
+    }
+    return out;
+}
+
+// duped with gen_resize_out_helper from structured kernels
+void copy_arg(const at::Tensor& dst, const at::Tensor& src) {
+    TORCH_CHECK(src.dtype() == dst.dtype(),
+        "Expected out tensor to have dtype ", src.dtype(), ", but got ", dst.dtype(), " instead");
+    TORCH_CHECK(src.device() == dst.device(),
+        "Expected out tensor to have device ", src.device(), ", but got ", dst.device(), " instead");
+    dst.copy_(src);
+}
+
+void copy_arg(const at::TensorList& dst, const at::TensorList& src) {
+    TORCH_INTERNAL_ASSERT(dst.size() == src.size());
+    for (const auto& i : c10::irange(dst.size())) {
+        copy_arg(dst[i], src[i]);
+    }
+}
+
+// TODO: this doesn't handle restriding empty tensors correctly; see
+// gen_resize_out_helper for the correct algorithm
+
+void resize_out_helper(const at::Tensor& dst, const at::Tensor& src) {
+    at::native::resize_output(dst, src.sizes());
+}
+
+void resize_out_helper(const at::TensorList& dst, const at::TensorList& src) {
+    TORCH_INTERNAL_ASSERT(dst.size() == src.size());
+    for (const auto& i : c10::irange(dst.size())) {
+        at::native::resize_output(dst[i], src[i].sizes());
+    }
+}
+}
+
+
+${CompositeViewCopyKernel_Definitions}
+
+${GeneratedCompositeFunctional_Definitions}
+
+${GeneratedCompositeOut_Definitions}
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/DispatchKeyFunction.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/DispatchKeyFunction.h
new file mode 100644
index 0000000000000000000000000000000000000000..c92d5eb3898ecea0fb9e1f79c2725d1bc6dfa7fb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/DispatchKeyFunction.h
@@ -0,0 +1,23 @@
+#pragma once
+// ${generated_comment}
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace ${dispatch_namespace} {
+
+${dispatch_namespaced_declarations}
+
+} // namespace ${dispatch_namespace}
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/FunctionalInverses.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/FunctionalInverses.h
new file mode 100644
index 0000000000000000000000000000000000000000..3217e097d7adf37ab7041c45f5dd413024fc33ef
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/FunctionalInverses.h
@@ -0,0 +1,33 @@
+#pragma once
+
+// ${generated_comment}
+
+#include <ATen/Tensor.h>
+
+namespace at {
+namespace functionalization {
+
+enum class InverseReturnMode {
+  /// Specifies that functional inverses should always return a view.
+  AlwaysView,
+  /// Specifies that functional inverses should always return a non-view / copy.
+  NeverView,
+  /// Specifies that functional inverses should return a view unless a (copying) scatter
+  /// inverse exists, in which case that will be used instead.
+  /// This avoids as_strided() calls that can be difficult for subclasses to handle.
+  ViewOrScatterInverse,
+};
+
+struct FunctionalInverses {
+
+${view_inverse_declarations}
+
+// NB: These are not generated! They're manually implemented in the template.
+// TODO: Change codegen to generate these. See the following link:
+// https://github.com/pytorch/pytorch/blob/main/torchgen/model.py#L2583-L2585
+static at::Tensor chunk_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, int chunks, int dim);
+static at::Tensor narrow_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, int dim, c10::SymInt start, c10::SymInt length);
+
+};
+}
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/Functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/Functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb531363f53ea5fbaf1da4ca80e781145d628dca
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/Functions.h
@@ -0,0 +1,143 @@
+#pragma once
+
+// ${generated_comment}
+
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,            \
+  meaning the file will need to be re-compiled every time an operator     \
+  is changed or added. Consider if your change would be better placed in  \
+  another file, or if a more specific header might achieve the same goal. \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from <ATen/ops/{my_operator}.h> and   \
+  see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+// NOTE: [TORCH_ASSERT_ONLY_METHOD_OPERATORS]
+//
+// In ATen, certain generated headers files include the definitions of
+// every single operator in PyTorch. Unfortunately this means every
+// time an operator signature is updated or changed in
+// native_functions.yaml, you (and every other PyTorch developer) need
+// to recompile every source file that includes any of these headers.
+//
+// To break up these header dependencies, and improve incremental
+// build times for all PyTorch developers. These headers are split
+// into per-operator headers in the `ATen/ops` folder. This limits
+// incremental builds to only changes to methods of `Tensor`, or files
+// that use the specific operator being changed. With `at::sum` as an
+// example, you should include
+//
+//   <ATen/ops/sum.h>               // instead of ATen/Functions.h
+//   <ATen/ops/sum_native.h>        // instead of ATen/NativeFunctions.h
+//   <ATen/ops/sum_ops.h>           // instead of ATen/Operators.h
+//   <ATen/ops/sum_cpu_dispatch.h>  // instead of ATen/CPUFunctions.h
+//
+// However, even if you're careful to use this in your own code.
+// `Functions.h` might be included indirectly through another header
+// without you realising. To avoid this, you can add
+//
+//   #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+//
+// to the top of your source file. This way any time the non-specific
+// headers are included, the compiler will error out.
+//
+// Also, be aware that `ops` are not available in all build
+// configurations (namely fb-internal) so you must guard these
+// includes with `#ifdef AT_PER_OPERATOR_HEADERS`. e.g.
+//
+//   #ifndef AT_PER_OPERATOR_HEADERS
+//   #include <ATen/Functions.h>
+//   #else
+//   #include <ATen/ops/sum.h>
+//   #endif
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <c10/core/SymInt.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/util/OptionalArrayRef.h>
+
+#include <ATen/ops/from_blob.h>
+#include <ATen/ops/tensor.h>
+
+${Functions_includes}
+
+namespace at {
+
+${Functions_declarations}
+
+// Special C++ only overloads for std()-like functions (See gh-40287)
+// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
+// So, for example std(0) would select the std(unbiased=False) overload
+TORCH_API inline Tensor var(const Tensor& self, int dim) {
+  return at::var(self, IntArrayRef{dim});
+}
+TORCH_API inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
+  return at::var_mean(self, IntArrayRef{dim});
+}
+TORCH_API inline Tensor std(const Tensor& self, int dim) {
+  return at::std(self, IntArrayRef{dim});
+}
+TORCH_API inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
+  return at::std_mean(self, IntArrayRef{dim});
+}
+
+inline int64_t numel(const Tensor& tensor) {
+  return tensor.numel();
+}
+
+inline int64_t size(const Tensor& tensor, int64_t dim) {
+  return tensor.size(dim);
+}
+
+inline int64_t stride(const Tensor& tensor, int64_t dim) {
+  return tensor.stride(dim);
+}
+
+inline bool is_complex(const Tensor& tensor) {
+  return tensor.is_complex();
+}
+
+inline bool is_floating_point(const Tensor& tensor) {
+  return tensor.is_floating_point();
+}
+
+inline bool is_signed(const Tensor& tensor) {
+  return tensor.is_signed();
+}
+
+inline bool is_inference(const Tensor& tensor) {
+  return tensor.is_inference();
+}
+
+inline bool _is_zerotensor(const Tensor& tensor) {
+  return tensor._is_zerotensor();
+}
+
+inline bool is_conj(const Tensor& tensor) {
+  return tensor.is_conj();
+}
+
+inline Tensor conj(const Tensor& tensor) {
+  return tensor.conj();
+}
+
+inline bool is_neg(const Tensor& tensor) {
+  return tensor.is_neg();
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/LazyNonNativeIr.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/LazyNonNativeIr.h
new file mode 100644
index 0000000000000000000000000000000000000000..18eaf6da52e4b3654becac6cc89849bc0806ae09
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/LazyNonNativeIr.h
@@ -0,0 +1,11 @@
+#pragma once
+
+${lazy_non_native_ir_inc}
+
+// This file contains autogenerated LazyTensor Non Native IR nodes
+
+${namespace_prologue}
+
+${non_native_ir_nodes}
+
+${namespace_epilogue}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/Operators.cpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/Operators.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..082bb67c3e2043f2c36b29345f57048ec2e9eea7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/Operators.cpp
@@ -0,0 +1,19 @@
+#include <ATen/Tensor.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+
+// ${generated_comment}
+// NOTE See [Sharded File] comment in VariableType
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Operators.h>
+#else
+${operator_headers}
+#endif
+
+${static_dispatch_extra_headers}
+
+namespace at { namespace _ops {
+
+${definitions}
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/RedispatchFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/RedispatchFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d89975a4a62257cdc163f32c7b9f13261b04f33f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/RedispatchFunctions.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// ${generated_comment}
+
+#ifdef TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider using the at::_ops::{name}::redispatch() interface by including     \
+  the specific operator from <ATen/ops/{my_operator}_ops.h>
+#endif
+
+#include <c10/core/Scalar.h>
+#include <ATen/Tensor.h>
+#include <c10/core/Storage.h>
+#include <ATen/core/Generator.h>
+#include <c10/util/Deprecated.h>
+#include <ATen/DeviceGuard.h>
+#include <c10/core/TensorOptions.h>
+#include <ATen/core/Reduction.h>
+#include <c10/util/Optional.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/Context.h>
+#include <ATen/TracerMode.h>
+#include <ATen/Operators.h>
+
+namespace at {
+
+namespace redispatch {
+    ${function_redispatch_definitions}
+} // namespace redispatch
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/UfuncCUDA.cu b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/UfuncCUDA.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e75d82d9cc84bd8fddfd303f610412e5d0a98729
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/UfuncCUDA.cu
@@ -0,0 +1,21 @@
+#define TORCH_ASSERT_NO_OPERATORS
+
+#include <ATen/native/ufunc/${name}.h>
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <c10/core/Scalar.h>
+${cuda_headers}
+
+namespace at {
+
+// NB: this is explicitly copied here (via codegen) rather than
+// included via NativeFunctions.h to avoid recompiling this file when
+// NativeFunctions.h changes
+namespace meta {
+${meta_declaration}
+}
+
+namespace native {
+${native_declaration}
+${native_definitions}
+}} // namespace at::native
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/UnboxingFunctions.cpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/UnboxingFunctions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..86c13235d8623964d734e743f5f15cf68a8df63c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/ATen/templates/UnboxingFunctions.cpp
@@ -0,0 +1,35 @@
+#include <ATen/UnboxingFunctions.h>
+#include <ATen/Functions.h>
+
+#include <ATen/Tensor.h>
+#include <ATen/core/functional.h>
+#include <ATen/core/interned_strings.h>
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstring>
+#include <sstream>
+#include <stdexcept>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+namespace at {
+namespace unboxing {
+
+using ::c10::fmap;
+using ::c10::filter;
+using torch::jit::peek;
+using torch::jit::drop;
+using torch::jit::pack;
+using torch::jit::pop;
+
+// Generated function declaration
+${definitions}
+
+} // namespace unboxing
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_inplace_or_view_type.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_inplace_or_view_type.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc5ea44ac15d79e788e45aeb2ffcbf292364ef86
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_inplace_or_view_type.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_python_functions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_python_functions.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ba76fdabe1f5a6b10b47c5ee277612b82f22eb0
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_python_functions.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_trace_type.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_trace_type.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4242e12ad0f77ae94c3aee0efcaa59071d89a254
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_trace_type.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_variable_type.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_variable_type.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40c745b05bdd8dbe0288e8a9170a084bd7705e4e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/gen_variable_type.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/load_derivatives.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/load_derivatives.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..049aef76d4e758fd60535a81d26deedac64fa8b3
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/__pycache__/load_derivatives.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/build.bzl b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/build.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..588bd5944e29477119782591b231fd80a7a57cf4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/build.bzl
@@ -0,0 +1,14 @@
+def define_targets(rules):
+    rules.py_library(
+        name = "autograd",
+        srcs = rules.glob(["*.py"]),
+        data = rules.glob([
+            "*.yaml",
+            "templates/*",
+        ]),
+        visibility = ["//:__subpackages__"],
+        deps = [
+            rules.requirement("PyYAML"),
+            "//torchgen",
+        ],
+    )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/context.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..d838aa3c77bbbc0f37cd7fa6e005d85c9e9dd624
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/context.py
@@ -0,0 +1,31 @@
+import functools
+from typing import Callable
+
+from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI
+from torchgen.context import native_function_manager
+from torchgen.utils import T
+
+
+# Like tools.api.context.with_native_function, but for
+# NativeFunctionWithDifferentiabilityInfo.
+def with_native_function_with_differentiability_info(
+    func: Callable[[NFWDI], T]
+) -> Callable[[NFWDI], T]:
+    @functools.wraps(func)
+    def wrapper(f: NFWDI) -> T:
+        with native_function_manager(f.func):
+            return func(f)
+
+    return wrapper
+
+
+# Like the above but with an additional dispatch key string argument
+def with_native_function_with_differentiability_info_and_key(
+    func: Callable[[NFWDI, str], T]
+) -> Callable[[NFWDI, str], T]:
+    @functools.wraps(func)
+    def wrapper(f: NFWDI, key: str) -> T:
+        with native_function_manager(f.func):
+            return func(f, key)
+
+    return wrapper
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_annotated_fn_args.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_annotated_fn_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..f935a9adf4c6f126fe45999613ed9871280aac61
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_annotated_fn_args.py
@@ -0,0 +1,129 @@
+"""
+For procedural tests needed for __torch_function__, we use this function
+to export method names and signatures as needed by the tests in
+test/test_overrides.py.
+
+python -m tools.autograd.gen_annotated_fn_args \
+       aten/src/ATen/native/native_functions.yaml \
+       aten/src/ATen/native/tags.yaml \
+       $OUTPUT_DIR \
+       tools/autograd
+
+Where $OUTPUT_DIR is where you would like the files to be
+generated.  In the full build system, OUTPUT_DIR is
+torch/testing/_internal/generated
+"""
+
+import argparse
+import os
+import textwrap
+from collections import defaultdict
+
+from typing import Any, Dict, List, Sequence
+
+import torchgen.api.python as python
+from torchgen.context import with_native_function
+
+from torchgen.gen import parse_native_yaml
+from torchgen.model import Argument, BaseOperatorName, NativeFunction
+from torchgen.utils import FileManager
+
+from .gen_python_functions import (
+    is_py_fft_function,
+    is_py_linalg_function,
+    is_py_nn_function,
+    is_py_special_function,
+    is_py_torch_function,
+    is_py_variable_method,
+    should_generate_py_binding,
+)
+
+
+def gen_annotated(
+    native_yaml_path: str, tags_yaml_path: str, out: str, autograd_dir: str
+) -> None:
+    native_functions = parse_native_yaml(
+        native_yaml_path, tags_yaml_path
+    ).native_functions
+    mappings = (
+        (is_py_torch_function, "torch._C._VariableFunctions"),
+        (is_py_nn_function, "torch._C._nn"),
+        (is_py_linalg_function, "torch._C._linalg"),
+        (is_py_special_function, "torch._C._special"),
+        (is_py_fft_function, "torch._C._fft"),
+        (is_py_variable_method, "torch.Tensor"),
+    )
+    annotated_args: List[str] = []
+    for pred, namespace in mappings:
+        groups: Dict[BaseOperatorName, List[NativeFunction]] = defaultdict(list)
+        for f in native_functions:
+            if not should_generate_py_binding(f) or not pred(f):
+                continue
+            groups[f.func.name.name].append(f)
+        for group in groups.values():
+            for f in group:
+                annotated_args.append(f"{namespace}.{gen_annotated_args(f)}")
+
+    template_path = os.path.join(autograd_dir, "templates")
+    fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
+    fm.write_with_template(
+        "annotated_fn_args.py",
+        "annotated_fn_args.py.in",
+        lambda: {
+            "annotated_args": textwrap.indent("\n".join(annotated_args), "    "),
+        },
+    )
+
+
+@with_native_function
+def gen_annotated_args(f: NativeFunction) -> str:
+    def _get_kwargs_func_exclusion_list() -> List[str]:
+        # functions that currently don't work with kwargs in test_overrides.py
+        return [
+            "diagonal",
+            "round_",
+            "round",
+            "scatter_",
+        ]
+
+    def _add_out_arg(
+        out_args: List[Dict[str, Any]], args: Sequence[Argument], *, is_kwarg_only: bool
+    ) -> None:
+        for arg in args:
+            if arg.default is not None:
+                continue
+            out_arg: Dict[str, Any] = {}
+            out_arg["is_kwarg_only"] = str(is_kwarg_only)
+            out_arg["name"] = arg.name
+            out_arg["simple_type"] = python.argument_type_str(
+                arg.type, simple_type=True
+            )
+            size_t = python.argument_type_size(arg.type)
+            if size_t:
+                out_arg["size"] = size_t
+            out_args.append(out_arg)
+
+    out_args: List[Dict[str, Any]] = []
+    _add_out_arg(out_args, f.func.arguments.flat_positional, is_kwarg_only=False)
+    if f"{f.func.name.name}" not in _get_kwargs_func_exclusion_list():
+        _add_out_arg(out_args, f.func.arguments.flat_kwarg_only, is_kwarg_only=True)
+
+    return f"{f.func.name.name}: {repr(out_args)},"
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate annotated_fn_args script")
+    parser.add_argument(
+        "native_functions", metavar="NATIVE", help="path to native_functions.yaml"
+    )
+    parser.add_argument("tags", metavar="TAGS", help="path to tags.yaml")
+    parser.add_argument("out", metavar="OUT", help="path to output directory")
+    parser.add_argument(
+        "autograd", metavar="AUTOGRAD", help="path to template directory"
+    )
+    args = parser.parse_args()
+    gen_annotated(args.native_functions, args.tags, args.out, args.autograd)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_variable_factories.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_variable_factories.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7cf4a342328db5c8cd83bf581eb9c5c21278cb0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_variable_factories.py
@@ -0,0 +1,115 @@
+# Generates C++ functions that wrap ATen tensor factory methods to turn them into Variables.
+#
+# This writes one file: variable_factories.h
+
+import re
+from typing import List, Optional
+
+import torchgen.api.python as python
+from torchgen.api import cpp
+
+from torchgen.api.types import CppSignatureGroup
+from torchgen.context import with_native_function
+from torchgen.gen import parse_native_yaml
+from torchgen.model import NativeFunction, TensorOptionsArguments, Variant
+from torchgen.utils import FileManager, mapMaybe
+
+OPTIONAL_TYPE_PATTERN = re.compile(r"c10::optional<(.+)>")
+TYPE_PATTERN = re.compile(r"(?:const\s+)?([A-Z]\w+)")
+
+
+# Add 'at::' to types defined in ATen namespace, e.g. Tensor, TensorList, IntArrayRef and etc.
+# TODO: maybe update the cpp argument API to take optional namespace argument?
+def fully_qualified_type(argument_type: str) -> str:
+    def maybe_optional_type(type: str, is_opt: bool) -> str:
+        return f"c10::optional<{type}>" if is_opt else type
+
+    opt_match = OPTIONAL_TYPE_PATTERN.match(argument_type)
+    is_opt = opt_match is not None
+    if opt_match:
+        argument_type = argument_type[opt_match.start(1) : opt_match.end(1)]
+    match = TYPE_PATTERN.match(argument_type)
+    if match is None:
+        return maybe_optional_type(argument_type, is_opt)
+    index = match.start(1)
+    qualified_type = f"{argument_type[:index]}at::{argument_type[index:]}"
+    return maybe_optional_type(qualified_type, is_opt)
+
+
+def gen_variable_factories(
+    out: str, native_yaml_path: str, tags_yaml_path: str, template_path: str
+) -> None:
+    native_functions = parse_native_yaml(
+        native_yaml_path, tags_yaml_path
+    ).native_functions
+    factory_functions = [fn for fn in native_functions if is_factory_function(fn)]
+    fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
+    fm.write_with_template(
+        "variable_factories.h",
+        "variable_factories.h",
+        lambda: {
+            "generated_comment": "@"
+            + f"generated from {fm.template_dir_for_comments()}/variable_factories.h",
+            "ops_headers": [
+                f"#include <ATen/ops/{fn.root_name}.h>" for fn in factory_functions
+            ],
+            "function_definitions": list(mapMaybe(process_function, factory_functions)),
+        },
+    )
+
+
+@with_native_function
+def is_factory_function(f: NativeFunction) -> bool:
+    if Variant.function not in f.variants:
+        return False
+
+    name = cpp.name(f.func)
+    has_tensor_options = python.has_tensor_options(f)
+    return has_tensor_options or name.endswith("_like")
+
+
+@with_native_function
+def process_function(f: NativeFunction) -> Optional[str]:
+    name = cpp.name(f.func)
+    has_tensor_options = python.has_tensor_options(f)
+    is_factory = has_tensor_options or name.endswith("_like")
+
+    if Variant.function not in f.variants or not is_factory:
+        return None
+
+    cpp_sigs = CppSignatureGroup.from_native_function(f, method=False)
+    sigs = [cpp_sigs.signature]
+    if cpp_sigs.symint_signature is not None:
+        sigs.append(cpp_sigs.symint_signature)
+    r = ""
+    for sig in sigs:
+        formals: List[str] = []
+        exprs: List[str] = []
+        requires_grad = "false"
+        for arg in sig.arguments():
+            qualified_type = fully_qualified_type(arg.type)
+            if arg.default:
+                formals.append(f"{qualified_type} {arg.name} = {arg.default}")
+            else:
+                formals.append(f"{qualified_type} {arg.name}")
+
+            if isinstance(arg.argument, TensorOptionsArguments):
+                # note: we remove the requires_grad setting from the TensorOptions because
+                # it is ignored anyways (and we actually have an assertion that it isn't set
+                # which would fail otherwise). We handle requires_grad explicitly here
+                # instead of passing it through to the kernel.
+                exprs.append(
+                    f"at::TensorOptions({arg.name}).requires_grad(c10::nullopt)"
+                )
+                # Manually set the requires_grad bit on the result tensor.
+                requires_grad = f"{arg.name}.requires_grad()"
+            else:
+                exprs.append(arg.name)
+
+        r += f"""\
+inline at::Tensor {sig.name()}({', '.join(formals)}) {{
+  at::AutoDispatchBelowADInplaceOrView guard;
+  return autograd::make_variable(at::{sig.name()}({', '.join(exprs)}), /*requires_grad=*/{requires_grad});
+}}
+"""
+    return r
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_variable_type.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_variable_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fa0b98c5171a0984290c191c8f4d69fb2f6608b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_variable_type.py
@@ -0,0 +1,2162 @@
+# Generates VariableType.h/cpp
+#
+# **If any changes are being made to the VariableType codegen please also check
+# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
+#
+# VariableType is a subclass of at::Type that provides the binding code
+# necessary to provide a differentiable version of ATen operators. There are a
+# number of different things we could mean:
+#
+#   - Given a non-differentiable forward implementation, we might
+#     directly associate it with a backward implementation to make
+#     it differentiable.  This is the common case.
+#
+#   - Some functions don't need a backwards implementation, because
+#     backpropagation will never propagate beyond them.  There are a
+#     number of different reasons why this may be the case:
+#
+#       - The function has no differentiable inputs
+#       - The function's output is not differentiable
+#       - The function has no data dependency on its input
+#
+#   - Some function don't need a backwards implementation because they
+#     are implemented as a composition of other (differentiable) ATen
+#     functions.  These are dispatched directly to the Type superclass,
+#     which will in turn dispatch back to VariableType for its
+#     differentiable subcomponents.
+#
+import re
+from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
+
+from torchgen.api import cpp
+from torchgen.api.autograd import (
+    DifferentiableInput,
+    dispatch_strategy,
+    ForwardDerivative,
+    gen_differentiable_outputs,
+    is_differentiable,
+    NativeFunctionWithDifferentiabilityInfo,
+    SavedAttribute,
+)
+
+from torchgen.api.types import (
+    ArrayRefCType,
+    BaseCppType,
+    BaseCType,
+    Binding,
+    DispatcherSignature,
+    intArrayRefT,
+    iTensorListRefT,
+    ListCType,
+    MutRefCType,
+    OptionalCType,
+    scalarT,
+    SpecialArgName,
+    stringT,
+    symIntArrayRefT,
+    TENSOR_LIST_LIKE_CTYPES,
+    tensorListT,
+    tensorT,
+    TupleCType,
+    VectorCType,
+)
+from torchgen.code_template import CodeTemplate
+from torchgen.context import (
+    native_function_manager,
+    with_native_function,
+    with_native_function_and,
+)
+from torchgen.model import (
+    Argument,
+    BaseType,
+    ListType,
+    NativeFunction,
+    SchemaKind,
+    SelfArgument,
+    TensorOptionsArguments,
+)
+from torchgen.utils import FileManager, mapMaybe
+
+from .context import with_native_function_with_differentiability_info_and_key
+from .gen_inplace_or_view_type import (
+    ALL_VIEW_FUNCTIONS,
+    ASSIGN_RETURN_VALUE,
+    AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION,
+    gen_formals,
+    get_base_name,
+    get_view_info,
+    is_tensor_list_type,
+    is_tensor_type,
+    METHOD_DEFINITION,
+    modifies_arguments,
+    TMP_VAR,
+    unpack_args,
+    unpacked_name,
+    use_derived,
+    WRAPPER_REGISTRATION,
+)
+from .gen_trace_type import (
+    get_return_value,
+    MANUAL_AUTOGRAD_AND_TRACER,
+    MANUAL_BACKEND,
+    tie_return_values,
+    type_wrapper_name,
+)
+
+# We don't set or modify grad_fn on these methods. Generally, they return
+# tensors that have requires_grad=False. In-place functions listed here will
+# not examine or modify requires_grad or grad_fn.
+# NB: this does NOT include overload name
+DONT_REQUIRE_DERIVATIVE = {
+    # These only depend on the input Tensor's shape and device, not the data
+    "empty_like",
+    "ones_like",
+    "full_like",
+    "zeros_like",
+    "rand_like",
+    "randn_like",
+    "new_empty",
+    "new_empty_strided",
+    "new_full",
+    "new_zeros",
+    "new_ones",
+    # These are only implemented on integral types
+    "__and__",
+    "__iand__",
+    "__ilshift__",
+    "__ior__",
+    "__irshift__",
+    "__ixor__",
+    "__lshift__",
+    "__or__",
+    "__rshift__",
+    "__xor__",
+    # These work on integral data types, and hence don't require derivative
+    "_sobol_engine_draw",
+    "_sobol_engine_ff",
+    "_sobol_engine_scramble_",
+    "_sobol_engine_initialize_state_",
+    # This is an unsafe method that is meant to be out of reach of autograd.
+    "_coalesced_",
+    # Quantize functions should not record gradients
+    "quantize_per_tensor",
+    "quantize_per_channel",
+    # Functions that return integers should not have output that require gradients
+    "argmax",
+    "argmin",
+    "argsort",
+    "searchsorted",
+    "bucketize",
+    # Functions that return booleans are not differentiable
+    "isnan",
+    "isposinf",
+    "isneginf",
+    "isinf",
+    "signbit",
+    "isin",
+    "allclose",
+    # Functions return none are not differentiable
+    "record_stream",
+    # These functions are not differentiable
+    "logical_and",
+    "logical_xor",
+    "logical_not",
+    "logical_or",
+    # This function returns nested_tensor shape as a tensor that is non-differentiable
+    "_nested_tensor_size",
+    "_nested_tensor_strides",
+    "_nested_tensor_storage_offsets",
+}
+
+# The C -> R functions at the time of adding this are still being audited and tested
+# but will not error out.
+# C -> C, R -> C functions for which backward is correctly implemented and tested
+GRADIENT_IMPLEMENTED_FOR_COMPLEX = {
+    "fill",
+    "t",
+    "view",
+    "reshape",
+    "reshape_as",
+    "view_as",
+    "roll",
+    "clone",
+    "block_diag",
+    "diag_embed",
+    "repeat",
+    "expand",
+    "flip",
+    "fliplr",
+    "flipud",
+    "rot90",
+    "nanmean",
+    "nansum",
+    "transpose",
+    "permute",
+    "squeeze",
+    "unsqueeze",
+    "resize",
+    "resize_as",
+    "tril",
+    "triu",
+    "chunk",
+    "zero_",
+    "eq_",
+    "ne_",
+    "add",
+    "__radd__",
+    "sum",
+    "_conj",
+    "sin",
+    "cos",
+    "mul",
+    "sinc",
+    "sinh",
+    "cosh",
+    "__rmul__",
+    "sgn",
+    "asin",
+    "acos",
+    "sub",
+    "div",
+    "cat",
+    "view_as_complex",
+    "index_put",
+    "neg",
+    "complex",
+    "select",
+    "where",
+    "as_strided",
+    "as_strided_scatter",
+    "slice",
+    "constant_pad_nd",
+    "unbind",
+    "split",
+    "split_with_sizes",
+    "unsafe_split",
+    "split_with_sizes_backward",
+    "dot",
+    "vdot",
+    "cholesky",
+    "triangular_solve",
+    "mm",
+    "_unsafe_view",
+    "mv",
+    "outer",
+    "bmm",
+    "diagonal",
+    "alias",
+    "atan",
+    "log",
+    "log10",
+    "log1p",
+    "log2",
+    "logaddexp",
+    "logcumsumexp",
+    "reciprocal",
+    "tan",
+    "pow",
+    "rsqrt",
+    "tanh",
+    "tanh_backward",
+    "asinh",
+    "acosh",
+    "atanh",
+    "take",
+    "fill_",
+    "exp",
+    "exp2",
+    "expm1",
+    "nonzero",
+    "mean",
+    "std_mean",
+    "var_mean",
+    "inverse",
+    "solve",
+    "linalg_cholesky",
+    "addcmul",
+    "addcdiv",
+    "matrix_exp",
+    "linalg_matrix_exp",
+    "_linalg_eigh",
+    "cholesky_solve",
+    "linalg_qr",
+    "_linalg_svd",
+    "_fft_c2c",
+    "_fft_r2c",
+    "linalg_solve",
+    "sqrt",
+    "stack",
+    "gather",
+    "index_select",
+    "index_add_",
+    "linalg_inv",
+    "linalg_inv_ex",
+    "baddbmm",
+    "addbmm",
+    "addmm",
+    "addmv",
+    "addr",
+    "linalg_householder_product",
+    "ormqr",
+    "reflection_pad1d",
+    "reflection_pad2d",
+    "reflection_pad3d",
+    "linalg_cholesky_ex",
+    "linalg_eig",
+    "diagonal_copy",
+    "diagonal_scatter",
+    "select_backward",
+    "diagonal_backward",
+    "slice_backward",
+    "reflection_pad1d_backward",
+    "reflection_pad2d_backward",
+    "reflection_pad3d_backward",
+    "_sparse_sparse_matmul",
+    "replication_pad1d",
+    "replication_pad2d",
+    "replication_pad3d",
+    "put",
+    "put_",
+    "_to_copy",
+    "replication_pad1d_backward",
+    "replication_pad2d_backward",
+    "replication_pad3d_backward",
+    "diag",
+    "masked_scatter",
+    "masked_select",
+    "index_add",
+    "index_fill",
+    "trace",
+    "polar",
+    "cumsum",
+    "rsub",
+    "eig",
+    "lerp",
+    "linalg_vector_norm",
+    "cumprod",
+    "prod",
+    "index_copy",
+    "lu",
+    "unfold",
+    "unfold_backward",
+    "index",
+    "masked_fill",
+    "masked_scatter_backward",
+    "linalg_cross",
+    "lu_unpack",
+    "renorm",
+    "_conj_physical",
+    "linalg_lu_factor_ex",
+    "scatter",
+    "scatter_add",
+    "sigmoid",
+    "sigmoid_backward",
+    "sparse_mask",
+    "trapezoid",
+    "cumulative_trapezoid",
+    "conj_physical_",
+    "_neg_view",
+    "_reshape_alias",
+    "_reshape_copy",
+    "_linalg_det",
+    "lu_solve",
+    "linalg_solve_triangular",
+    "linalg_pinv",
+    "linalg_lstsq",
+    "unfold_copy",
+    "col2im",
+    "im2col",
+    "cholesky_inverse",
+    "to_sparse",
+    "sparse_sampled_addmm",
+    "linalg_lu",
+    "pixel_shuffle",
+    "pixel_unshuffle",
+    "linalg_lu_solve",
+    "_linalg_slogdet",
+    "_linalg_solve_ex",
+}
+
+GRADIENT_IMPLEMENTED_FOR_SPARSE_COMPLEX = {
+    "_to_dense",
+    "_coalesce",
+    "coalesce",
+    "values",
+    "_sparse_coo_tensor_with_dims_and_tensors",
+    "_sparse_addmm",
+}
+
+GRADIENT_IMPLEMENTED_FOR_COMPLEX.update(GRADIENT_IMPLEMENTED_FOR_SPARSE_COMPLEX)
+
+# Some operators invalidate the grad_accumulator. Let's reset it.
+RESET_GRAD_ACCUMULATOR = {"set_", "resize_"}
+
+# NOTE [ TensorImpl and Storage Pointer Sanity Checks ]
+#
+# We check the following properties:
+#   1) A function should never change the input tensors' underlying c10::TensorImpl
+#      pointers or c10::Storage pointers, even if it modifies its input tensors (via
+#      inplace or out-variants)
+# If the function does not modify its arguments, we also check the following properties
+# pertaining to its output:
+#   2) Its TensorImpl has use_count of 1
+#   3) If the function is a view function, it has the same StorageImpl as that of
+#      the input it is aliased with. Otherwise, its StorageImpl has use_count of 1
+#
+# The following code templates implement the checks for this invariant:
+SAVE_TENSOR_STORAGE = CodeTemplate(
+    """\
+c10::optional<Storage> ${tensor_name}_storage_saved =
+  ${tensor_name}.has_storage() ? c10::optional<Storage>(${tensor_name}.storage()) : c10::nullopt;
+"""
+)
+
+
+# If tensor_name == out_tensor_name, used to enforce (1), otherwise used for (2)
+ENFORCE_SAME_TENSOR_STORAGE = CodeTemplate(
+    """\
+if (${tensor_name}_storage_saved.has_value() &&
+    !at::impl::dispatch_mode_enabled() &&
+    !at::impl::tensor_has_dispatch(${tensor_name}))
+  TORCH_INTERNAL_ASSERT(${tensor_name}_storage_saved.value().is_alias_of(${out_tensor_name}.storage()));
+"""
+)
+
+SAVE_TENSORLIST_STORAGE = CodeTemplate(
+    """\
+std::vector<c10::optional<Storage>> ${tensorlist_name}_storage_saved(${tensorlist_name}.size());
+for (const Tensor& tensor : ${tensorlist_name})
+  ${tensorlist_name}_storage_saved.push_back(
+    tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt);
+"""
+)
+
+ENFORCE_SAME_TENSORLIST_STORAGE = CodeTemplate(
+    """\
+for (size_t i=0; i<${tensorlist_name}.size() && !at::impl::dispatch_mode_enabled(); i++) {
+  if (${tensorlist_name}_storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(${tensorlist_name}))
+    TORCH_INTERNAL_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of(${tensorlist_name}[i].storage()));
+}
+"""
+)
+
+SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate(
+    """\
+std::vector<c10::optional<Storage>> ${tensorlist_name}_storage_saved(${tensorlist_name}.size());
+for (const c10::optional<Tensor>& tensor : ${tensorlist_name})
+  ${tensorlist_name}_storage_saved.push_back(
+    tensor.has_value() && tensor->has_storage() ? c10::optional<Storage>(tensor->storage()) : c10::nullopt);
+"""
+)
+
+ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate(
+    """\
+for (size_t i=0; i<${tensorlist_name}.size() && !at::impl::dispatch_mode_enabled(); i++) {
+  if (${tensorlist_name}_storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(${tensorlist_name}))
+    TORCH_INTERNAL_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of(
+        static_cast<c10::optional<Tensor>>(${tensorlist_name}[i])->storage()));
+}
+"""
+)
+
+SAVE_TENSOR_IMPL = CodeTemplate(
+    """\
+c10::intrusive_ptr<TensorImpl> ${tensor_name}_impl_saved;
+if (${tensor_name}.defined()) ${tensor_name}_impl_saved = ${tensor_name}.getIntrusivePtr();
+"""
+)
+
+ENFORCE_SAME_TENSOR_IMPL = CodeTemplate(
+    """\
+if (${tensor_name}_impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
+  TORCH_INTERNAL_ASSERT(${tensor_name}_impl_saved == ${tensor_name}.getIntrusivePtr());
+"""
+)
+
+ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate(
+    """\
+if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
+  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
+"""
+)
+
+ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE = CodeTemplate(
+    """\
+if (${tensor_name}.has_storage() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name})) {
+  TORCH_INTERNAL_ASSERT(${tensor_name}.storage().use_count() == 1, "function: ${fn_name}");
+}
+"""
+)
+
+SAVE_TENSORLIST_IMPL = CodeTemplate(
+    """\
+std::vector<c10::intrusive_ptr<TensorImpl>> ${tensorlist_name}_impl_saved(${tensorlist_name}.size());
+for (size_t i=0; i<${tensorlist_name}.size(); i++)
+  if (${tensorlist_name}[i].defined()) ${tensorlist_name}_impl_saved[i] = ${tensorlist_name}[i].getIntrusivePtr();
+"""
+)
+
+ENFORCE_SAME_TENSORLIST_IMPL = CodeTemplate(
+    """\
+for (size_t i=0; i<${tensorlist_name}.size() && !at::impl::dispatch_mode_enabled(); i++) {
+  if (${tensorlist_name}_impl_saved[i] && !at::impl::tensorlist_has_dispatch(${tensorlist_name}))
+    TORCH_INTERNAL_ASSERT(${tensorlist_name}_impl_saved[i] == ${tensorlist_name}[i].getIntrusivePtr());
+}
+"""
+)
+
+SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate(
+    """\
+std::vector<c10::intrusive_ptr<TensorImpl>> ${tensorlist_name}_impl_saved(${tensorlist_name}.size());
+for (size_t i=0; i<${tensorlist_name}.size(); i++) {
+  c10::optional<Tensor> t = ${tensorlist_name}[i];
+  if (t.has_value() && t->defined()) ${tensorlist_name}_impl_saved[i] = t->getIntrusivePtr();
+}
+"""
+)
+
+ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate(
+    """\
+for (size_t i=0; i<${tensorlist_name}.size() && !at::impl::dispatch_mode_enabled(); i++) {
+  if (${tensorlist_name}_impl_saved[i])
+    TORCH_INTERNAL_ASSERT(
+      ${tensorlist_name}_impl_saved[i] == static_cast<c10::optional<Tensor>>(${tensorlist_name}[i])->getIntrusivePtr());
+}
+"""
+)
+
+# The following list contains functions that we don't enforce the invariant on.
+DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE = {
+    # These functions are expected to change impl or storage of input tensors
+    "set_",
+    "_cudnn_rnn_flatten_weight",
+}
+DONT_ENFORCE_TENSOR_IMPL_USE_COUNT = {
+    # These non-inplace, non-out functions return tensors with use_count > 1
+    # Therefore, they MAY (but not necessarily) return one of its inputs as-is
+    # See https://github.com/pytorch/pytorch/issues/60426 for more information
+    "_embedding_bag",
+    "_embedding_bag_forward_only",
+    "q_per_channel_scales",
+    "q_per_channel_zero_points",
+    "lu_unpack",
+    "_cudnn_rnn_backward",
+    # The below failed StorageImpl use_count check but we skip tensor_impl check
+    # just in case
+    "_cudnn_rnn",
+    "dequantize_self",
+    # lift() should never actually be called with a requires_grad=True tensor,
+    "lift",
+    "lift_fresh",
+    "lift_fresh_copy",
+    # Nested Tensors related functions
+    # _nested_tensor_size() should never actually be called with requires_grad=True tensor
+    "_nested_tensor_size",
+    "_nested_tensor_strides",
+    "_nested_tensor_storage_offsets",
+}
+
+DONT_ENFORCE_STORAGE_IMPL_USE_COUNT = {
+    # These non-view functions return tensors with storage use_count != 1
+    "_slow_conv2d_forward",
+    "slow_conv3d_forward",
+    "channel_shuffle",
+    # If an input is returned as-is in output, we cannot guarantee its storage_impl
+    # use count to be 1 either.
+    *DONT_ENFORCE_TENSOR_IMPL_USE_COUNT,
+}
+# END CHECKS FOR [ TensorImpl and Storage Pointer Sanity Checks ]
+
+DECLARE_GRAD_FN = CodeTemplate(
+    """\
+std::shared_ptr<${op}> grad_fn;
+"""
+)
+
+DECLARE_VECTOR_OF_GRAD_FN = CodeTemplate(
+    """\
+std::vector<std::shared_ptr<${op}>> grad_fns;
+"""
+)
+
+SETUP_ANY_REQUIRES_GRAD = CodeTemplate(
+    """\
+[[maybe_unused]] auto _any_requires_grad = compute_requires_grad( ${args_with_derivatives} );
+${extra_differentiability_conditions}
+"""
+)
+
+SETUP_DERIVATIVE = CodeTemplate(
+    """\
+if (_any_requires_grad) {
+  ${setup}
+}
+"""
+)
+
+SETUP_NONE_REQUIRES_GRAD = CodeTemplate(
+    """\
+if (compute_requires_grad( ${args_to_check} )) {
+  throw_error_out_requires_grad("${base_name}");
+}
+"""
+)
+
+ASSIGN_GRAD_FN = CodeTemplate(
+    """\
+grad_fn = std::shared_ptr<${op}>(new ${op}(${op_ctor}), deleteNode);
+grad_fn->set_next_edges(collect_next_edges( ${args_with_derivatives} ));
+"""
+)
+
+# note(crcrpar): `compute_requires_grad` in the template below is supplied with arguments indexed with `i`
+# while the `SETUP_ANY_REQUIRES_GRAD` above takes whole tensors and scalars.
+ASSIGN_VECTOR_OF_GRAD_FN = CodeTemplate(
+    """\
+for (const auto& i : c10::irange( ${irange} )) {
+  const auto ith_requires_grad = compute_requires_grad(${args_with_derivatives});
+  check_inplace(self[i], ith_requires_grad);
+  grad_fns.push_back([&]() -> std::shared_ptr<${op}> {
+      if (!ith_requires_grad) {
+          return nullptr;
+      } else {
+          auto grad_fn = std::shared_ptr<${op}>(new ${op}(${op_ctor}), deleteNode);
+          grad_fn->set_next_edges(collect_next_edges( ${args_with_derivatives} ));
+          return grad_fn;
+      }
+  }());
+}
+"""
+)
+
+CALL_REDISPATCH = CodeTemplate(
+    """\
+at::redispatch::${api_name}(${unpacked_args})"""
+)
+# If the non-variable operation has return values, we use the `tmp` variable to hold the
+# values temporarily and pass the values to the return variables outside of the
+# `at::AutoDispatchBelowAutograd` guard block.
+DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES_JVP_DECOMP = CodeTemplate(
+    """\
+auto ${tmp_var} = ([&]() {
+  if (${any_has_forward_grad}) {
+    static c10::OperatorName full_name("aten::${op_name}", "${op_overload}");
+    static c10::optional<c10::OperatorHandle> opt_op = c10::Dispatcher::singleton().findSchema(full_name);
+    return impl::run_jit_decomposition_with_args_for_jvp<${return_types}>("${op_name}", *opt_op, ks, ${arg_names});
+  } else {
+    ${guard}
+    return ${base_type_call};
+  }
+})();
+"""
+)
+
+DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate(
+    """\
+auto ${tmp_var} = ([&]() {
+  ${guard}
+  return ${base_type_call};
+})();
+"""
+)
+
+DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES = CodeTemplate(
+    """\
+{
+  ${guard}
+  ${base_type_call};
+}
+"""
+)
+
+SET_HISTORY = CodeTemplate(
+    """\
+if (grad_fn) {
+    ${fn}_history(${differentiable_outputs}, grad_fn);
+}
+"""
+)
+
+LOOP_OVER_VECTOR_OF_GRAD_FNS = CodeTemplate(
+    """\
+if (!grad_fns.empty()) {
+    ${preamble}
+    for (const auto& i : c10::irange(grad_fns.size())) {
+        auto grad_fn = grad_fns[i];
+        if (grad_fn != nullptr) {
+            ${statements}
+        }
+    }
+}
+"""
+)
+
+CONDITIONAL = CodeTemplate(
+    """\
+if (${cond}) {
+  ${statements}
+}
+"""
+)
+
+RUN_ONLY_IN_DEBUG_MODE = CodeTemplate(
+    """\
+#ifndef NDEBUG
+${statements}
+#endif
+"""
+)
+
+FW_DERIVATIVE_CHECK_TEMPLATE = CodeTemplate(
+    """\
+isFwGradDefined(${req_inp})\
+"""
+)
+FW_DERIVATIVE_SIZE_CHECK_TEMPLATE = CodeTemplate(
+    """\
+TORCH_CHECK(
+    self.size() == ${inp_name}.size(),
+      "Tensor lists must have the same number of tensors, got ",
+    self.size(),
+      " and ",
+    ${inp_name}.size());
+"""
+)
+
+FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE = CodeTemplate(
+    """\
+isFwGradDefinedTensorList(${req_inp})\
+"""
+)
+
+FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate(
+    """\
+auto ${inp_name}_t_raw = toNonOptFwGrad(${inp});
+auto ${inp_name}_tensor = toNonOptTensor(${inp});
+auto ${inp_name}_t = (${inp_name}_t_raw.defined() || !${inp_name}_tensor.defined())
+  ? ${inp_name}_t_raw : at::${zeros_fn}(${inp_name}_tensor.sizes(), ${inp_name}_tensor.options());
+"""
+)
+
+FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE = CodeTemplate(
+    """\
+auto ${inp_name}_p = toNonOptPrimal(${inp});
+"""
+)
+
+FW_DERIVATIVE_SETTER_TENSOR = CodeTemplate(
+    """\
+if (${out_arg}_new_fw_grad_opt.has_value() && ${out_arg}_new_fw_grad_opt.value().defined() && ${out_arg}.defined()) {
+  // The hardcoded 0 here will need to be updated once we support multiple levels.
+  ${out_arg}._set_fw_grad(${out_arg}_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ ${is_inplace});
+}
+"""
+)
+
+FW_DERIVATIVE_SETTER_TENSOR_FOREACH = CodeTemplate(
+    """\
+for (const auto& i : c10::irange(${out_arg}_new_fw_grad_opts.size())) {
+  auto& ${out_arg}_new_fw_grad_opt = ${out_arg}_new_fw_grad_opts[i];
+  if (${out_arg}_new_fw_grad_opt.has_value() && ${out_arg}_new_fw_grad_opt.value().defined() && ${out_arg}[i].defined()) {
+    // The hardcoded 0 here will need to be updated once we support multiple levels.
+    ${out_arg}[i]._set_fw_grad(${out_arg}_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ ${is_inplace});
+  }
+}
+"""
+)
+
+FW_DERIVATIVE_SETTER_MULTI_OUTPUT = CodeTemplate(
+    """\
+if (${all_res}_new_fw_grad_opt.has_value() && std::get<${idx}>(${all_res}_new_fw_grad_opt.value()).defined()
+    && ${out_arg}.defined()) {
+  ${out_arg}._set_fw_grad(std::get<${idx}>(${all_res}_new_fw_grad_opt.value()), /* level */ 0, /* is_inplace_op */ false);
+}
+"""
+)
+
+FW_DERIVATIVE_SETTER_TENSOR_LIST = CodeTemplate(
+    """\
+if (${out_arg}_new_fw_grad_opt.has_value()) {
+  auto ${out_arg}_new_fw_grad = ${out_arg}_new_fw_grad_opt.value();
+  TORCH_INTERNAL_ASSERT(${out_arg}.size() == ${out_arg}_new_fw_grad.size());
+  for (const auto i : c10::irange(${out_arg}.size())) {
+    if (${out_arg}_new_fw_grad[i].defined() && ${out_arg}[i].defined()) {
+      // The hardcoded 0 here will need to be updated once we support multiple levels.
+      ${out_arg}[i]._set_fw_grad(${out_arg}_new_fw_grad[i], /* level */ 0, /* is_inplace_op */ ${is_inplace});
+    }
+  }
+}
+"""
+)
+
+FW_DERIVATIVE_TEMPLATE = CodeTemplate(
+    """\
+${fw_grad_opt_definition}
+if (${requires_fw_grad}) {
+    ${unpacked_arguments}
+    ${out_arg}_new_fw_grad_opt = ${formula};
+}
+"""
+)
+
+FW_DERIVATIVE_FOREACH_TEMPLATE = CodeTemplate(
+    """\
+${fw_grad_opt_definition}
+for (const auto& i : c10::irange(${vector_of_optional_tensor}.size())) {
+  if (${any_has_forward_grad_for_current_index}) {
+      ${unpacked_arguments}
+      ${vector_of_optional_tensor}[i] = ${formula};
+  }
+}
+"""
+)
+
+FW_DERIVATIVE_FORBID_TEMPLATE = CodeTemplate(
+    """\
+TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${name} that does not support it ${msg}");
+"""
+)
+
+FW_DERIVATIVE_FORBID_LIST_TEMPLATE = CodeTemplate(
+    """\
+for (const auto& _t: ${arg}) {
+    TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${name} that does not support it ${msg}");
+}
+"""
+)
+
+
+def gen_variable_type(
+    out: str,
+    native_yaml_path: str,
+    tags_yaml_path: str,
+    fns_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo],
+    template_path: str,
+    used_keys: Set[str],
+) -> None:
+    """VariableType.h and VariableType.cpp body
+
+    This is the at::Type subclass for differentiable tensors. The
+    implementation of each function dispatches to the base tensor type to
+    compute the output. The grad_fn is attached to differentiable functions.
+    """
+    fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
+    fm.write(
+        "VariableType.h",
+        lambda: {
+            "generated_comment": "@"
+            + f"generated from {fm.template_dir_for_comments()}/VariableType.h"
+        },
+    )
+
+    # helper that generates a TORCH_LIBRARY_IMPL macro for each
+    # dispatch key that appears in derivatives.yaml
+    def wrapper_registrations(used_keys: Set[str]) -> str:
+        library_impl_macro_list: List[str] = []
+        for key in sorted(used_keys):
+            dispatch_key = key
+            if key == "Default":
+                dispatch_key = "Autograd"
+            library_impl_macro = (
+                f"TORCH_LIBRARY_IMPL(aten, {dispatch_key}, m) "
+                + "{\n"
+                + "${"
+                + f"wrapper_registrations_{key}"
+                + "}\n}"
+            )
+            library_impl_macro_list += [library_impl_macro]
+        return "\n\n".join(library_impl_macro_list)
+
+    # Generate a new template from VariableType.cpp which replaces ${wrapper_registrations}
+    # with per key TORCH_LIBRARY_IMPL macros for each key that appears in derivatives.yaml
+    fm1 = FileManager(
+        install_dir=out + "/templates", template_dir=template_path, dry_run=False
+    )
+    fm1.write(
+        "VariableType.cpp",
+        lambda: {
+            "type_derived_method_definitions": "\n\n".join(
+                [
+                    "${" + f"type_derived_method_definitions_{key}" + "}"
+                    for key in sorted(used_keys)
+                ]
+            ),
+            "wrapper_registrations": wrapper_registrations(used_keys),
+        },
+    )
+
+    # Generate final VariableType_*.cpp files from the generated template
+    fm2 = FileManager(install_dir=out, template_dir=out + "/templates", dry_run=False)
+
+    sharded_keys = set(
+        [f"type_derived_method_definitions_{key}" for key in sorted(used_keys)]
+        + [f"wrapper_registrations_{key}" for key in sorted(used_keys)]
+    )
+    # NOTE: see Note [Sharded File] at the top of the VariableType.cpp
+    # template regarding sharding of the generated files.
+    fm2.write_sharded(
+        "VariableType.cpp",
+        [fn for fn in fns_with_diff_infos if use_derived(fn)],
+        key_fn=lambda fn: cpp.name(fn.func.func),
+        base_env={
+            "generated_comment": "@"
+            + f"generated from {fm.template_dir_for_comments()}/VariableType.cpp",
+        },
+        env_callable=gen_variable_type_func,
+        num_shards=5,
+        sharded_keys=sharded_keys,
+    )
+
+
+@with_native_function_and
+def gen_wrapper_registration(f: NativeFunction, key: str = "Default") -> str:
+    return WRAPPER_REGISTRATION.substitute(
+        unqual_operator_name_with_overload=f.func.name,
+        type_wrapper_name=type_wrapper_name(f, key),
+        class_type="VariableType",
+    )
+
+
+def gen_variable_type_func(
+    fn: NativeFunctionWithDifferentiabilityInfo,
+) -> Dict[str, List[str]]:
+    f = fn.func
+    result = {}
+    with native_function_manager(f):
+        name = cpp.name(f.func)
+        formals = gen_formals(f)
+
+        if (
+            fn.info is None
+            and str(f.func.name.name) not in RESET_GRAD_ACCUMULATOR
+            and get_base_name(f) not in DONT_REQUIRE_DERIVATIVE
+            and len(gen_differentiable_outputs(fn)) > 0
+            and cpp.name(f.func) not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE
+            and type_wrapper_name(f) not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT
+            and type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT
+        ):
+            # NOTE: [ Registering AutogradNotImplemented boxed kernel ]
+            #
+            # When there is no derivatives.yaml entry, we register a generic boxed
+            # NotImplemented kernel to set grad_fn to be NotImplemented, so that forward
+            # proceeds as usual but an error is properly produced on backward.
+            # TODO: it would be nice to not have these special cases
+            #
+            # There are several cases where still let codegen handle it:
+            # 1) ops that need to reset grad accumulator (we let codegen handle this case
+            #     because) the list is (currently) only accessible in Python.
+            # 2) User explicitly specifies DONT_REQUIRE_DERIVATIVE. This basically makes
+            #    autograd a fallthrough with NDEBUG checks. This can be useful for when all
+            #    outputs are integral.
+            # 3) When there are no differentiable outputs. This is similar to (2).
+            # 4) There are certain ops where we skip certain NDEBUG checks. this is similar
+            #    to (1).
+            type_definition = ""
+            wrapper_registration = AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION.substitute(
+                unqual_operator_name_with_overload=f.func.name
+            )
+            result["type_derived_method_definitions_Default"] = [type_definition]
+            result["wrapper_registrations_Default"] = [wrapper_registration]
+        else:
+            if not fn.info:
+                key = "Default"
+                type_definition = METHOD_DEFINITION.substitute(
+                    return_type=cpp.returns_type(
+                        f.func.returns, symint=True
+                    ).cpp_type(),
+                    type_wrapper_name=type_wrapper_name(f, key),
+                    type_definition_body=emit_body(fn, key),
+                    formals=formals,
+                )
+                wrapper_registration = gen_wrapper_registration(f, key)
+                result[f"type_derived_method_definitions_{key}"] = [type_definition]
+                result[f"wrapper_registrations_{key}"] = [wrapper_registration]
+            else:
+                for key in fn.info.keys():
+                    type_definition = METHOD_DEFINITION.substitute(
+                        return_type=cpp.returns_type(
+                            f.func.returns, symint=True
+                        ).cpp_type(),
+                        type_wrapper_name=type_wrapper_name(f, key),
+                        type_definition_body=emit_body(fn, key),
+                        formals=formals,
+                    )
+                    wrapper_registration = gen_wrapper_registration(f, key)
+                    result[f"type_derived_method_definitions_{key}"] = [type_definition]
+                    result[f"wrapper_registrations_{key}"] = [wrapper_registration]
+    # See Note [Manual Backend kernels]
+    assert (name in MANUAL_BACKEND) == f.manual_kernel_registration
+    # If you want to register a kernel to Autograd, you must make the op abstract.
+    # In other words, this op must have dispatch section in native_functions.yaml.
+    if name in MANUAL_AUTOGRAD_AND_TRACER or (
+        fn.info and any(info.has_derivatives for info in fn.info.values())
+    ):
+        msg = (
+            f"There's a formula for {name}(or its functional variant) in derivatives.yaml. "
+            f"It's required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA "
+            f"or CompositeExplicitAutograd in native_functions.yaml. Please see "
+            f"https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword "
+            f"for instructions to choose the right dispatch keyword."
+        )
+        assert f.is_abstract, msg
+
+    return result
+
+
+_foreach_ops_without_differentiability_info = {
+    # No reference backward available due to the lack of `{maximum, minimum}(tensor, scalar)`.
+    ("_foreach_maximum", "Scalar"),
+    ("_foreach_maximum", "ScalarList"),
+    ("_foreach_minimum", "Scalar"),
+    ("_foreach_minimum", "ScalarList"),
+    # No reference backward available as addcdiv/addcmul don't support Tensor as scaling factor.
+    ("_foreach_addcdiv", "Tensor"),
+    ("_foreach_addcmul", "Tensor"),
+    ("_foreach_copy", ""),
+}
+
+_foreach_ops_with_different_arity = {
+    # These ops lack `alpha` of scaling factor to applied to the right hand side argument.
+    ("_foreach_add", "Scalar"),
+    ("_foreach_add", "ScalarList"),
+    ("_foreach_sub", "Scalar"),
+    ("_foreach_sub", "ScalarList"),
+}
+
+
+@with_native_function_with_differentiability_info_and_key
+def emit_body(
+    fn: NativeFunctionWithDifferentiabilityInfo, key: str = "Default"
+) -> List[str]:
+    assert dispatch_strategy(fn) == "use_derived"
+    f = fn.func
+    info = fn.info[key] if fn.info else None
+    fw_derivatives = fn.fw_derivatives.get(key, []) if fn.fw_derivatives else []
+
+    name = cpp.name(f.func)
+    inplace = f.func.kind() == SchemaKind.inplace
+    is_out_fn = f.func.kind() == SchemaKind.out
+    returns_void = len(f.func.returns) == 0
+    base_name = get_base_name(f)
+    view_info = get_view_info(f)
+
+    is_foreach = name.startswith("_foreach")
+    is_inplace_foreach = is_foreach and inplace
+    if is_inplace_foreach:
+        inplace_foreacharg2refarg: Dict[Argument, Argument] = {}
+        refargname2inplace_foreacharg: Dict[str, Argument] = {}
+        base_name_and_overload_name = (f.func.name.name.base, f.func.name.overload_name)
+        if info is None:
+            assert (
+                base_name_and_overload_name
+                in _foreach_ops_without_differentiability_info
+            ), f"{'.'.join(base_name_and_overload_name)} should have a differentiability info"
+        else:
+            assert (
+                len(f.func.arguments.flat_non_out)
+                == len(info.func.func.arguments.flat_non_out)
+            ) or (base_name_and_overload_name in _foreach_ops_with_different_arity), (
+                f"{'.'.join(base_name_and_overload_name)} has {len(f.func.arguments.flat_non_out)} args "
+                f"but the reference has {len(info.func.func.arguments.flat_non_out)}"
+            )
+            for foreach_arg, ref_arg in zip(
+                f.func.arguments.flat_non_out, info.func.func.arguments.flat_non_out
+            ):
+                foreach_arg_type = foreach_arg.type
+                if isinstance(foreach_arg_type, ListType):
+                    foreach_arg_type = foreach_arg_type.elem
+                assert foreach_arg_type == ref_arg.type
+                inplace_foreacharg2refarg[foreach_arg] = ref_arg
+                refargname2inplace_foreacharg[ref_arg.name] = foreach_arg
+
+    def gen_differentiable_input(
+        arg: Union[Argument, SelfArgument, TensorOptionsArguments]
+    ) -> Optional[DifferentiableInput]:
+        if isinstance(arg, TensorOptionsArguments):
+            return None
+        a: Argument = arg.argument if isinstance(arg, SelfArgument) else arg
+
+        # TODO: `cpp_type` is only to keep it byte-for-byte compatible with the old codegen, should remove.
+        # NB: This is not a clone of cpp.argument() - TensorOptionsArguments / faithful / binds are
+        # not handled properly as they are irrelevant for this codegen.
+        cpp_type = cpp.argument_type(a, binds=a.name, symint=True).cpp_type()
+
+        if not is_differentiable(a.name, a.type, info):
+            return None
+        return DifferentiableInput(
+            name=a.name,
+            type=a.type,
+            cpp_type=cpp_type,
+        )
+
+    @with_native_function
+    def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]:
+        arguments = list(f.func.arguments.non_out)
+        if is_inplace_foreach and info is not None:
+            for i, arg in enumerate(f.func.arguments.flat_non_out):
+                if arg in inplace_foreacharg2refarg:
+                    # note(crcrpar): From what I understand, what matters is only the name.
+                    # Thus originally I only replace argument only when the names are different.
+                    # TODO(crcrpar): Make it simpler.
+                    mapped_arg = inplace_foreacharg2refarg[arg]
+                    arguments[i] = Argument(
+                        mapped_arg.name,
+                        mapped_arg.type,
+                        mapped_arg.default,
+                        mapped_arg.annotation,
+                    )
+        return list(mapMaybe(gen_differentiable_input, arguments))
+
+    def find_args_with_derivatives(
+        differentiable_inputs: List[DifferentiableInput],
+    ) -> List[DifferentiableInput]:
+        """Find arguments that have derivative definitions"""
+        if info is None or not info.has_derivatives:
+            return differentiable_inputs
+        names = {name for d in info.derivatives for name in d.var_names}
+        differentiable = [arg for arg in differentiable_inputs if arg.name in names]
+        if len(differentiable) != len(names):
+            missing = names - {arg.name for arg in differentiable}
+            raise RuntimeError(
+                f"Missing arguments for derivatives: {missing} in {info.name}"
+            )
+        return differentiable
+
+    differentiable_inputs = gen_differentiable_inputs(f)
+    args_with_derivatives = find_args_with_derivatives(differentiable_inputs)
+    differentiable_outputs = gen_differentiable_outputs(fn, key)
+
+    undifferentiable = (base_name in DONT_REQUIRE_DERIVATIVE) or (
+        name in DONT_REQUIRE_DERIVATIVE
+    )
+
+    requires_derivative = (
+        (not undifferentiable)
+        and (len(differentiable_inputs) > 0)
+        and (
+            (len(differentiable_outputs) > 0)
+            # note(crcrpar): In-place foreach functions are a void function.
+            or is_inplace_foreach
+        )
+    )
+
+    if (
+        info is not None
+        and info.has_derivatives
+        and not requires_derivative
+        # out= ops are allowed to have zero returns which cause requires_derivative to be False
+        # we shouldn't error out though (out= ops for autograd just redispatch)
+        and len(f.func.returns) > 0
+    ):
+        raise RuntimeError(
+            f"ERROR: derivative ignored for {name} -- specified an autograd function without derivative"
+        )
+
+    # note(crcrpar): In-place foreach functions do not support forward AD
+    if requires_derivative and len(fw_derivatives) > 0 and not is_inplace_foreach:
+        assert sum(len(derivative.var_names) for derivative in fw_derivatives) == len(
+            differentiable_outputs
+        ), (
+            "Expected the number of forward derivatives implemented to match the "
+            "number of differentiable outputs. NB: This only applies when at least "
+            "one forward derivative is implemented. Not implementing any forward "
+            "derivatives is also okay, and we would require inputs to the op to "
+            "not have associated tangents in that case."
+        )
+
+    try_jit_decomposition = (
+        requires_derivative
+        and len(fw_derivatives) == 0
+        and (not modifies_arguments(f))
+        and (not returns_void)
+    )
+
+    def emit_save_inputs() -> List[str]:
+        setup: List[str] = []
+        if info is None or not info.has_derivatives:
+            return setup
+
+        has_tensorlist_arg = any(
+            is_tensor_list_type(arg.type) for arg in args_with_derivatives
+        )
+
+        # We don't want to save tensors if we know that they will never be used
+        # when computing the derivative, so we add guards to those statements
+        def guard_for(arg: SavedAttribute) -> Optional[str]:
+            assert info is not None
+
+            # It's hard to determine the edge offset if we have TensorLists
+            # NOTE(crcrpar): in-place foreach functions' arguments include tensorlist
+            # but their derivatives don't use it, so let them bypass this check.
+            if has_tensorlist_arg and (not is_inplace_foreach):
+                return None
+
+            # Empirical evaluation of the cases where we insert those guards in
+            # backward show that they are somewhat useless. E.g. there's no need
+            # to guard on some values captured from forward, because they had to
+            # require_grad if the backward function even gets executed. I don't
+            # have any good ideas for detecting those cases, so I simply disabled the
+            # checks.
+            if "backward" in info.name:
+                return None
+
+            # If there's a single derivative we could compute, we already have
+            # a requires_grad check that is sufficient
+            if len(args_with_derivatives) <= 1:
+                return None
+
+            # We really only care about trimming down the amount of tensors we save
+            if arg.nctype.type != BaseCType(tensorT):
+                return None
+
+            # We want to emit simple guards, so we only allow that if checking one
+            # input is enough to determine whether we need that value
+            used_in = [d for d in info.derivatives if arg in d.saved_inputs]
+            assert len(used_in) > 0
+            if len(used_in) != 1:
+                return None
+            derivative = used_in[0]
+
+            # Case with multioutput formulas
+            # TODO: process all derivative formulas!!!
+            if len(derivative.var_names) != 1:
+                wrap_opt_if_start = derivative.formula.find(
+                    f"wrap_opt_if({arg.nctype.name}"
+                )
+                if wrap_opt_if_start == -1:
+                    return None
+
+                wrap_opt_if_match = re.match(
+                    rf"wrap_opt_if\({arg.nctype.name},(.*?)\)",
+                    derivative.formula[wrap_opt_if_start:],
+                )
+                assert wrap_opt_if_match is not None
+
+                # Condition is between 'wrap_opt_if(var_name,' and ')'.
+                condition_slice = slice(len(rf"wrap_opt_if\({arg.nctype.name},"), -1)
+                wrap_opt_if_condition = wrap_opt_if_match.group(0)[
+                    condition_slice
+                ].strip()
+                # replace 'grad_input_mask[num]' with 'grad_fn->should_compute_output(num)'
+                wrap_opt_if_condition = re.sub(
+                    r"grad_input_mask\[(\d+)\]",
+                    r"grad_fn->should_compute_output(\1)",
+                    wrap_opt_if_condition,
+                )
+                return f"{wrap_opt_if_condition}"
+
+            # Figure out the offset of the edge that uses this variable
+            derivative_var_name = derivative.var_names[0]
+            for edge_off, a in enumerate(args_with_derivatives):
+                if a.name == derivative_var_name:
+                    break
+            else:
+                raise AssertionError()
+            return f"grad_fn->should_compute_output({edge_off})"
+
+        if is_inplace_foreach:
+            save_input_stmts = save_variables(info.all_saved_inputs, False, guard_for)
+            if save_input_stmts:
+                setup.append(
+                    LOOP_OVER_VECTOR_OF_GRAD_FNS.substitute(
+                        preamble="", statements=save_input_stmts
+                    )
+                )
+        else:
+            setup.extend(save_variables(info.all_saved_inputs, False, guard_for))
+            for arg in args_with_derivatives:
+                if is_tensor_list_type(arg.type):
+                    setup.append(f"grad_fn->{arg.name}_size_ = {arg.name}.size();")
+        return setup
+
+    def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[str]:
+        body: List[str] = []
+        if is_out_fn:
+            # For out functions, ensure that no input or output requires grad
+            body.append(DECLARE_GRAD_FN.substitute(op="Node"))
+            body.append(
+                SETUP_NONE_REQUIRES_GRAD.substitute(
+                    base_name=base_name,
+                    args_to_check=[arg.name for arg in differentiable_inputs],
+                )
+            )
+            body.append(
+                SETUP_NONE_REQUIRES_GRAD.substitute(
+                    base_name=base_name,
+                    args_to_check=[arg.name for arg in differentiable_outputs],
+                )
+            )
+            return body
+
+        op = info.op if info is not None and info.has_derivatives else "NotImplemented"
+        setup = []
+        if not is_inplace_foreach:
+            setup.extend(
+                ASSIGN_GRAD_FN.substitute(
+                    op=op,
+                    op_ctor=""
+                    if info is not None and info.has_derivatives
+                    else f'"{cpp.name(f.func)}"',
+                    args_with_derivatives=[arg.name for arg in args_with_derivatives],
+                ).split("\n")
+            )
+        else:
+            # note(crcrpar): Assuming in-place foreach function's self_arg is always TensorList.
+            list_like_arg = "self"
+            args = [arg.name for arg in args_with_derivatives]
+            for i, arg in enumerate(args):
+                if is_inplace_foreach and info is not None:
+                    if arg in refargname2inplace_foreacharg:
+                        foreach_arg = refargname2inplace_foreacharg[arg]
+                        args[i] = foreach_arg.name + (
+                            "[i]" if isinstance(foreach_arg.type, ListType) else ""
+                        )
+                else:
+                    if arg == list_like_arg:
+                        args[i] = arg + "[i]"
+            setup.extend(
+                ASSIGN_VECTOR_OF_GRAD_FN.substitute(
+                    op=op,
+                    op_ctor=""
+                    if info is not None and info.has_derivatives
+                    else f'"{cpp.name(f.func)}"',
+                    args_with_derivatives=args,
+                    irange=f"{list_like_arg}.size()",
+                ).split("\n")
+            )
+        setup.extend(emit_save_inputs())
+
+        body.extend(
+            emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)
+        )
+        declare_grad_fn_template = (
+            DECLARE_GRAD_FN if not is_inplace_foreach else DECLARE_VECTOR_OF_GRAD_FN
+        )
+        body.append(declare_grad_fn_template.substitute(op=op))
+        body.append(SETUP_DERIVATIVE.substitute(setup=setup))
+        return body
+
+    def emit_check_if_in_complex_autograd_allowlist() -> List[str]:
+        body: List[str] = []
+        if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX:
+            return body
+        for arg in differentiable_outputs:
+            name = arg.name
+            # TODO: should be `arg.type.is_tensor_like()`?
+            if arg.cpp_type == "at::Tensor" or arg.cpp_type in TENSOR_LIST_LIKE_CTYPES:
+                body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");')
+        return body
+
+    def emit_check_no_requires_grad(
+        tensor_args: List[DifferentiableInput],
+        args_with_derivatives: List[DifferentiableInput],
+    ) -> List[str]:
+        """Checks that arguments without derivatives don't require grad"""
+        body: List[str] = []
+        for arg in tensor_args:
+            if arg in args_with_derivatives:
+                continue
+            arg_name = arg.name
+            if info and arg_name in info.non_differentiable_arg_names:
+                continue
+            if arg_name == "output":
+                # Double-backwards definitions sometimes take in 'input' and
+                # 'output', but only define the derivative for input.
+                continue
+            body.append(f'check_no_requires_grad({arg_name}, "{arg_name}", "{name}");')
+        return body
+
+    def emit_original_self_definition() -> List[str]:
+        body: List[str] = []
+        if inplace:
+            if is_inplace_foreach:
+                body.append(
+                    "std::vector<c10::optional<at::Tensor>> original_selfs(self.size());"
+                )
+            else:
+                body.append("c10::optional<at::Tensor> original_self;")
+
+            all_forward_grad_cond = []
+            for derivative in fw_derivatives:
+                if derivative.required_original_self_value:
+                    all_forward_grad_cond.append(
+                        get_any_has_forward_grad_name(derivative.var_names)
+                    )
+
+            if all_forward_grad_cond:
+                if not is_inplace_foreach:
+                    body.append(f'if ({" || ".join(all_forward_grad_cond)}) {{')
+                    body.append("  original_self = self.clone();")
+                    body.append("}")
+                else:
+                    current_all_forward_grad_cond = [
+                        f"{cond}[i]" for cond in all_forward_grad_cond
+                    ]
+                    body.append("for (const auto& i : c10::irange(self.size())) {")
+                    body.append(
+                        f"  if ({' || '.join(current_all_forward_grad_cond)}) {{"
+                    )
+                    body.append("    original_selfs[i] = self[i].clone();")
+                    body.append("  }")
+                    body.append("}")
+
+        return body
+
+    def save_variables(
+        saved_variables: Sequence[SavedAttribute],
+        is_output: bool,
+        guard_for: Callable[[SavedAttribute], Optional[str]] = lambda name: None,
+    ) -> Sequence[str]:
+        # assign the saved variables to the generated grad_fn
+        stmts: List[str] = []
+        for arg in sorted(saved_variables, key=lambda sa: str(sa.nctype.name)):
+            name = (
+                arg.nctype.name.name
+                if isinstance(arg.nctype.name, SpecialArgName)
+                else arg.nctype.name
+            )
+            foreacharg: Optional[Argument] = None
+            is_foreacharg_list_type: bool = False
+            type = arg.nctype.type
+            expr = arg.expr
+            stmts_prepend = None
+            if is_inplace_foreach and info is not None:
+                # todo(crcrpar): See if we can add some check e.g. `assert foreacharg is not None`.
+                # for now the example assert would fail.
+                name_to_query = name.split("_scalar_type")[0]
+                if name_to_query in refargname2inplace_foreacharg:
+                    foreacharg = refargname2inplace_foreacharg[name_to_query]
+                    is_foreacharg_list_type = isinstance(foreacharg.type, ListType)
+                if foreacharg is not None:
+                    name_in_expr = (
+                        f"{foreacharg.name}{'[i]' if is_foreacharg_list_type else ''}"
+                    )
+                    src_name = name
+                    if "_scalar_type" in src_name:
+                        split_src_name = src_name.split("_scalar_type")
+                        assert len(split_src_name) == 2
+                        src_name = split_src_name[0]
+                    expr = expr.replace(src_name, name_in_expr)
+            if (
+                type == BaseCType(tensorT)
+                or type == OptionalCType(BaseCType(tensorT))
+                or type == MutRefCType(OptionalCType(BaseCType(tensorT)))
+                or (is_output and type == BaseCType(scalarT))
+            ):
+                # note(crcrpar): Here `expr` is generated from scratch, `arg.expr` is ignored.
+                var = name
+                name += "_"
+                if var == "self" and inplace:
+                    original_self_var = (
+                        "original_self"
+                        if not is_inplace_foreach
+                        else "original_selfs[i]"
+                    )
+                    self_var = var if not is_inplace_foreach else var + "[i]"
+                    stmts_prepend = f"if (!{original_self_var}.has_value()) {original_self_var} = {self_var}.clone()"
+                    var = f"{original_self_var}.value()"
+                    assert not is_output
+                if inplace and is_output:
+                    assert name == "result_"
+                    var = (
+                        "self[i]"
+                        if is_inplace_foreach or is_foreacharg_list_type
+                        else "self"
+                    )
+                    is_inplace_view = f"{var}.is_view()"
+                    expr = f"SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})"
+                else:
+                    expr = f"SavedVariable({var}, {str(is_output).lower()})"
+                    if foreacharg is not None and "original_selfs" not in expr:
+                        expr = expr.replace(src_name, name_in_expr)
+            elif (
+                type == BaseCType(tensorListT)
+                or type == ListCType(OptionalCType(BaseCType(tensorT)))
+                or type == BaseCType(iTensorListRefT)
+                or type == VectorCType(BaseCType(tensorT))
+            ):
+                # See Note [nuanced return type of out-of-place foreach functions]
+                if type == VectorCType(BaseCType(tensorT)):
+                    assert is_foreach and is_output
+                expr = f"make_saved_variable_list({name}, {str(is_foreach and is_output).lower()})"
+                name += "_"
+            elif type == BaseCType(intArrayRefT):
+                expr = expr + ".vec()"
+            elif type == BaseCType(symIntArrayRefT):
+                expr = expr + ".vec()"
+            elif type == BaseCType(stringT):
+                expr = f"std::string({expr})"
+            elif type == OptionalCType(BaseCType(stringT)):
+                expr = f"{expr}.has_value() ? c10::optional<std::string>(std::string({expr}.value())) : c10::nullopt"
+            elif type == ArrayRefCType(
+                elem=BaseCType(type=BaseCppType(ns="at", name="Scalar"))
+            ):
+                expr = expr + ".vec()"
+
+            guard = guard_for(arg)
+            if guard is None:
+                if stmts_prepend:
+                    stmts.append(f"{stmts_prepend};")
+                stmts.append(f"grad_fn->{name} = {expr};")
+            else:
+                stmts.append(f"if ({guard}) {{")
+                if stmts_prepend:
+                    stmts.append(f"  {stmts_prepend};")
+                stmts.append(f"  grad_fn->{name} = {expr};")
+                stmts.append("}")
+        return stmts
+
+    # Generates a Dispatcher::redispatch() call into the dispatcher. We do this mainly for performance reasons:
+    #  - Pre-compute the full DispatchKeySet. This saves the dispatcher from having to read from TLS.
+    #  - redispatch() avoids a redundant call to RecordFunction, which was already called right before
+    #    we entered this autograd kernel.
+    def emit_dispatch_call(
+        f: NativeFunction, input_base: str, unpacked_args: Sequence[str]
+    ) -> str:
+        """Dispatch call via function in a namespace or method on Tensor."""
+        dispatcher_sig = DispatcherSignature.from_schema(f.func)
+        dispatcher_exprs = dispatcher_sig.exprs()
+
+        # code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance.
+        # Ops also always have a function variant of the redispatch API.
+        # See Note [Plumbing Keys Through The Dispatcher] for details.
+        dispatch_key_set = "ks & c10::after_autograd_keyset"
+        call = CALL_REDISPATCH.substitute(
+            api_name=cpp.name(
+                f.func,
+                faithful_name_for_out_overloads=True,
+                symint_overload=f.func.has_symint(),
+            ),
+            unpacked_args=[dispatch_key_set] + list(unpacked_args),
+        )
+        return call
+
+    def wrap_output(
+        f: NativeFunction, unpacked_bindings: List[Binding], var: str
+    ) -> str:
+        call = ""
+        rhs_value: Optional[str] = None
+        if not any(r.type.is_tensor_like() for r in f.func.returns):
+            rhs_value = var
+        else:
+            rhs_value = f"std::move({var})"
+        assert rhs_value is not None
+        call += ASSIGN_RETURN_VALUE.substitute(
+            return_values=tie_return_values(f), rhs_value=rhs_value
+        )
+        return call
+
+    def check_tensorimpl_and_storage(
+        call: str, unpacked_bindings: List[Binding]
+    ) -> str:
+        # See NOTE [ TensorImpl and Storage Pointer Sanity Checks ]
+        stmts_before_call: List[str] = []
+        stmts_after_call: List[str] = []
+
+        if cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE:
+            return call
+
+        # Check properties of inputs (enforce (1))
+        for unpacked_binding in unpacked_bindings:
+            arg = unpacked_binding.name
+            noref_cpp_type = unpacked_binding.nctype.type.remove_const_ref()
+            if noref_cpp_type == BaseCType(tensorListT) or noref_cpp_type == BaseCType(
+                iTensorListRefT
+            ):
+                stmts_before_call += [
+                    SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg),
+                    SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg),
+                ]
+                stmts_after_call += [
+                    ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg),
+                    ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg),
+                ]
+            elif noref_cpp_type == ListCType(OptionalCType(BaseCType(tensorT))):
+                stmts_before_call += [
+                    SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg),
+                    SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg),
+                ]
+                stmts_after_call += [
+                    ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(
+                        tensorlist_name=arg
+                    ),
+                    ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(
+                        tensorlist_name=arg
+                    ),
+                ]
+            elif noref_cpp_type == BaseCType(tensorT):
+                stmts_before_call += [
+                    SAVE_TENSOR_STORAGE.substitute(tensor_name=arg),
+                    SAVE_TENSOR_IMPL.substitute(tensor_name=arg),
+                ]
+                stmts_after_call += [
+                    ENFORCE_SAME_TENSOR_STORAGE.substitute(
+                        tensor_name=arg, out_tensor_name=arg
+                    ),
+                    ENFORCE_SAME_TENSOR_IMPL.substitute(tensor_name=arg),
+                ]
+
+        assert (stmts_before_call and stmts_after_call) or (
+            not stmts_before_call and not stmts_after_call
+        )
+
+        # Check properties of outputs (enforce (2), (3))
+        if f.func.kind() not in (SchemaKind.inplace, SchemaKind.out):
+            base_name = f.func.name.name.base  # TODO: should be str(f.func.name.name)?
+            aliased_arg_name = ALL_VIEW_FUNCTIONS.get(base_name, None)
+            if aliased_arg_name is not None:
+                aliased_arg_name = unpacked_name(aliased_arg_name)
+            for i, (ret, ret_name) in enumerate(
+                zip(f.func.returns, cpp.return_names(f))
+            ):
+                noref_cpp_type = cpp.return_type(ret, symint=True).remove_const_ref()
+                if noref_cpp_type == BaseCType(tensorT):
+                    if aliased_arg_name is not None:
+                        assert (
+                            i == 0
+                        ), "Expect non-CompositeImplicitAutograd view function {base} to return single output"
+                        stmts_after_call += [
+                            ENFORCE_SAME_TENSOR_STORAGE.substitute(
+                                tensor_name=aliased_arg_name, out_tensor_name=ret_name
+                            )
+                        ]
+                    else:
+                        if (
+                            type_wrapper_name(f)
+                            not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT
+                        ):
+                            stmts_after_call += [
+                                ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE.substitute(
+                                    tensor_name=ret_name, fn_name=type_wrapper_name(f)
+                                )
+                            ]
+
+                    if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
+                        stmts_after_call += [
+                            ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute(
+                                tensor_name=ret_name, fn_name=type_wrapper_name(f)
+                            )
+                        ]
+
+                # Currently we don't have any functions that return the following types, but
+                # we should update the checks once we do
+                elif noref_cpp_type == ListCType(OptionalCType(BaseCType(tensorT))):
+                    raise AssertionError(
+                        f"Please add use_count checks for {noref_cpp_type}"
+                    )
+                elif noref_cpp_type == BaseCType(tensorListT):
+                    raise AssertionError(
+                        f"Please add use_count checks for {noref_cpp_type}"
+                    )
+
+        if stmts_before_call and stmts_after_call:
+            call = (
+                RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_before_call)
+                + call
+                + RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_after_call)
+            )
+        return call
+
+    def emit_call(
+        f: NativeFunction, unpacked_bindings: List[Binding], try_jit_decomposition: bool
+    ) -> str:
+        # We only care about adding `at::AutoDispatchBelowAutograd` guard for non-variable dispatch
+        # (which corresponds to 'use_derived' strategy). The purpose of this guard is to make sure
+        # the baseType operations still dispatch to non-Variable type, even if the arguments passed
+        # in are now Variables.
+        # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details.
+        unpacked_args = [b.name for b in unpacked_bindings]
+        base_type_call = emit_dispatch_call(f, "self_", unpacked_args)
+
+        if get_view_info(f) is not None or modifies_arguments(f):
+            guard = "at::AutoDispatchBelowAutograd guard;"
+        else:
+            guard = "at::AutoDispatchBelowADInplaceOrView guard;"
+
+        any_has_forward_grad = (
+            get_any_has_fw_grad_cond(derivative=None)
+            if requires_derivative
+            else "false"
+        )
+        return_types = ", ".join(
+            [cpp.return_type(a, symint=True).cpp_type() for a in f.func.returns]
+        )
+        if len(f.func.returns) > 1:
+            return_types = f"std::tuple<{return_types}>"
+
+        arg_names = [
+            a.name
+            for a in cpp.arguments(
+                f.func.arguments,
+                faithful=True,
+                symint=True,
+                method=False,
+                cpp_no_default_args=set(),
+            )
+        ]
+
+        if not modifies_arguments(f) and not returns_void:
+            if try_jit_decomposition:
+                call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES_JVP_DECOMP.substitute(
+                    base_type_call=base_type_call,
+                    tmp_var=TMP_VAR,
+                    guard=guard,
+                    any_has_forward_grad=any_has_forward_grad,
+                    op_name=cpp.name(f.func),
+                    op_overload=f.func.name.overload_name,
+                    return_types=return_types,
+                    arg_names=arg_names,
+                )
+            else:
+                call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute(
+                    base_type_call=base_type_call,
+                    tmp_var=TMP_VAR,
+                    guard=guard,
+                )
+
+            call += wrap_output(f, unpacked_bindings, TMP_VAR)
+        else:
+            assert not try_jit_decomposition
+            call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute(
+                base_type_call=base_type_call, guard=guard
+            )
+        call = check_tensorimpl_and_storage(call, unpacked_bindings)
+        return call
+
+    def emit_history() -> str:
+        fn = "rebase" if modifies_arguments(f) and view_info is None else "set"
+        output_names = [r.name for r in differentiable_outputs]
+        # TODO: flatten allocates a std::vector, which could be expensive
+        outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(
+            outs=output_names if not is_inplace_foreach else "self"
+        )
+        if not is_inplace_foreach:
+            return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs)
+        else:
+            return LOOP_OVER_VECTOR_OF_GRAD_FNS.substitute(
+                preamble=(
+                    f"auto differentiable_outputs = {outs};\n"
+                    f"TORCH_INTERNAL_ASSERT(differentiable_outputs.size() == grad_fns.size());"
+                ),
+                statements=f"{fn}_history(differentiable_outputs[i], grad_fns[i]);",
+            )
+
+    def emit_save_outputs() -> str:
+        if is_out_fn:
+            # out functions don't currently support differentiation
+            return ""
+        if info is not None and info.has_derivatives:
+            stmts = save_variables(info.all_saved_outputs, True)
+            if len(stmts) == 0:
+                return ""
+            if not is_inplace_foreach:
+                return CONDITIONAL.substitute(cond="grad_fn", statements=stmts)
+            else:
+                return LOOP_OVER_VECTOR_OF_GRAD_FNS.substitute(
+                    preamble="", statements=stmts
+                )
+        return ""
+
+    def emit_any_requires_grad() -> List[str]:
+        extra_condition = ""
+        if info and info.output_differentiability_conditions:
+            assert len(info.output_differentiability_conditions) == 1
+            extra_condition = f"_any_requires_grad &= ({info.output_differentiability_conditions[0]});"
+        names_of_args_with_derivatives = [arg.name for arg in args_with_derivatives]
+        if is_inplace_foreach and info is not None:
+            for i, arg in enumerate(names_of_args_with_derivatives):
+                for f_arg, r_arg in inplace_foreacharg2refarg.items():
+                    if arg == r_arg.name:
+                        names_of_args_with_derivatives[i] = f_arg.name
+        return [
+            SETUP_ANY_REQUIRES_GRAD.substitute(
+                args_with_derivatives=names_of_args_with_derivatives,
+                extra_differentiability_conditions=extra_condition,
+            )
+        ]
+
+    def get_any_has_forward_grad_name(var_names: Tuple[str, ...]) -> str:
+        if len(var_names) == 1:
+            return f"_any_has_forward_grad_{var_names[0]}"
+        else:
+            return f'_any_has_forward_grad_{"_".join(var_names)}'
+
+    def emit_any_has_forward_grad() -> List[str]:
+        content: List[str] = []
+        if not is_foreach:
+            for derivative in fw_derivatives:
+                requires_fw_grad = get_any_has_fw_grad_cond(derivative=derivative)
+                if info and info.output_differentiability_conditions:
+                    assert len(info.output_differentiability_conditions) == 1
+                    requires_fw_grad = f"({info.output_differentiability_conditions[0]}) && {requires_fw_grad}"
+                content.append(
+                    f"[[maybe_unused]] auto {get_any_has_forward_grad_name(derivative.var_names)} = {requires_fw_grad};"
+                )
+        else:
+            for derivative in fw_derivatives:
+                bool_vector_name = get_any_has_forward_grad_name(derivative.var_names)
+                cur_derivative_conditions = []
+                for inp in differentiable_inputs:
+                    if derivative.required_inputs_fw_grad is None:
+                        continue
+                    if inp.name not in derivative.required_inputs_fw_grad:
+                        continue
+                    inp_name = (
+                        inp.name
+                        if not inplace
+                        else refargname2inplace_foreacharg[inp.name].name
+                    )
+                    inp_type = (
+                        inp.type
+                        if not inplace
+                        else refargname2inplace_foreacharg[inp.name].type
+                    )
+                    is_list_type = is_tensor_list_type(inp_type)
+                    if is_list_type:
+                        if inp_name != "self":
+                            content.append(
+                                FW_DERIVATIVE_SIZE_CHECK_TEMPLATE.substitute(
+                                    inp_name=inp_name
+                                )
+                            )
+                        cur_derivative_conditions.append(
+                            FW_DERIVATIVE_CHECK_TEMPLATE.substitute(
+                                req_inp=inp_name + "[i]"
+                            )
+                        )
+                    else:
+                        cur_derivative_conditions.append(
+                            FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp_name)
+                        )
+
+                content.append(f"std::vector<bool> {bool_vector_name}(self.size());")
+                content.append("for (const auto& i : c10::irange(self.size())) {")
+                content.append(
+                    f"  {bool_vector_name}[i] = {' || '.join(cur_derivative_conditions)};"
+                )
+                content.append("}")
+        return content
+
+    def emit_check_inplace() -> List[str]:
+        if not inplace:
+            return []
+        return [
+            f"check_inplace({arg.name}, _any_requires_grad);"
+            for arg in differentiable_outputs
+        ]
+
+    def emit_fw_derivatives() -> List[str]:
+        content: List[str] = []
+        fw_grad_setters: List[str] = []
+        for derivative in fw_derivatives:
+            res = derivative.var_names
+            if f.func.name.name.inplace:
+                assert (
+                    len(res) == 1
+                ), "Expected number of outputs to be 1 if function is inplace"
+                # TODO update this when inplace namings are unified
+                res = ("self",)
+
+            assert derivative.required_inputs_fw_grad is not None
+
+            unpacked_arguments = ""
+            for inp in differentiable_inputs:
+                inp_name = inp.name
+                is_input_tensorlist = is_foreach and is_tensor_list_type(
+                    inp.type
+                    if not inplace
+                    else refargname2inplace_foreacharg[inp.name].type
+                )
+                input_suffix = "[i]" if is_input_tensorlist else ""
+                if is_inplace_foreach:
+                    if inp.name in refargname2inplace_foreacharg:
+                        inp_name = refargname2inplace_foreacharg[inp.name].name
+                zeros_fn = (
+                    "zeros"
+                    if inplace and inp.name == "self"
+                    else "_efficientzerotensor"
+                )
+                if inp.name in derivative.required_inputs_fw_grad:
+                    unpacked_arguments += (
+                        FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(
+                            inp_name=inp.name,
+                            inp=inp_name + input_suffix,
+                            zeros_fn=zeros_fn,
+                        )
+                    )
+                if inp.name in (derivative.required_inputs_primal or []):
+                    unpacked_arguments += (
+                        FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(
+                            inp_name=inp.name,
+                            inp=inp_name + input_suffix,
+                        )
+                    )
+            if derivative.required_original_self_value:
+                input_suffix = "s[i]" if is_inplace_foreach else ""
+                unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(
+                    inp_name="original_self",
+                    inp="original_self" + input_suffix,
+                    zeros_fn=zeros_fn,
+                )
+                unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(
+                    inp_name="original_self",
+                    inp="original_self" + input_suffix,
+                )
+            elif inplace and derivative.is_reusing_outplace_formula:
+                # The gradient wasn't already cloned, do it if grad mode is enabled
+                unpacked_arguments += (
+                    "self_t = GradMode::is_enabled() ? self_t.clone() : self_t;"
+                )
+
+            if inplace:
+                is_inplace_str = "true"
+            else:
+                is_inplace_str = "false"
+
+            requires_fw_grad = get_any_has_forward_grad_name(derivative.var_names)
+
+            if all(
+                (isinstance(var_type, BaseType) and var_type.is_tensor_like())
+                for var_type in derivative.var_types
+            ):
+                # Is there a way to get from BaseType to BaseCType
+                if len(derivative.var_types) == 1:
+                    opt_res_grad_type = OptionalCType(BaseCType(tensorT)).cpp_type()
+                    if not is_foreach:
+                        fw_grad_setters.append(
+                            FW_DERIVATIVE_SETTER_TENSOR.substitute(
+                                out_arg=res[0], is_inplace=is_inplace_str
+                            )
+                        )
+                    else:
+                        assert res[0] == ("result" if not inplace else "self")
+                        fw_grad_setters.append(
+                            FW_DERIVATIVE_SETTER_TENSOR_FOREACH.substitute(
+                                out_arg=res[0], is_inplace=is_inplace_str
+                            )
+                        )
+                    requires_fw_grad += f" && ({derivative.var_names[0]}.defined())"
+                else:
+                    tuple_type = TupleCType(
+                        [BaseCType(tensorT)] * len(derivative.var_types)
+                    )
+                    opt_res_grad_type = OptionalCType(tuple_type).cpp_type()
+                    for idx, single_res in enumerate(res):
+                        fw_grad_setters.append(
+                            FW_DERIVATIVE_SETTER_MULTI_OUTPUT.substitute(
+                                idx=idx, all_res="_".join(res), out_arg=single_res
+                            )
+                        )
+            elif (
+                isinstance(derivative.var_types[0], ListType)
+                and derivative.var_types[0].is_tensor_like()
+            ):
+                assert (
+                    len(derivative.var_types) == 1
+                ), "Expected number of outputs to be 1 if function returns ListType"
+                if not is_foreach:
+                    opt_res_grad_type = OptionalCType(
+                        VectorCType(BaseCType(tensorT))
+                    ).cpp_type()
+                    fw_grad_setters.append(
+                        FW_DERIVATIVE_SETTER_TENSOR_LIST.substitute(
+                            out_arg=res[0], is_inplace=is_inplace_str
+                        )
+                    )
+                else:
+                    # TODO(crcrpar): Should this (= the foreach specific logic) be refactored somehow?
+                    # Only out-place foreach functions that have entries in `tools/autograd/derivatives.yaml`
+                    # can reach here.
+                    opt_res_grad_type = OptionalCType(BaseCType(tensorT)).cpp_type()
+                    fw_grad_setters.append(
+                        FW_DERIVATIVE_SETTER_TENSOR_FOREACH.substitute(
+                            out_arg=res[0], is_inplace=is_inplace_str
+                        )
+                    )
+            else:
+                raise RuntimeError("Unsupported output type for forward derivative")
+
+            if not is_foreach:
+                fw_grad_opt_definition = f"{opt_res_grad_type} {'_'.join(res)}_new_fw_grad_opt = c10::nullopt;"
+                # View ops create fw_grad that already is a view of the base's fw_grad so just use that
+                content.append(
+                    FW_DERIVATIVE_TEMPLATE.substitute(
+                        fw_grad_opt_definition=fw_grad_opt_definition,
+                        requires_fw_grad=requires_fw_grad,
+                        formula=derivative.formula,
+                        out_arg="_".join(res),
+                        unpacked_arguments=unpacked_arguments,
+                    )
+                )
+            else:
+                # note(crcrpar): Assuming `self` is TensorList.
+                fw_grad_opt_definition = (
+                    f"std::vector<{opt_res_grad_type}> {'_'.join(res)}_new_fw_grad_opts"
+                    "(self.size(), c10::nullopt);"
+                )
+                foreach_forward_grad_formula = derivative.formula
+                _foreach_arg: Union[Argument, DifferentiableInput]
+                if inplace:
+                    for _foreach_arg, _ref_arg in inplace_foreacharg2refarg.items():
+                        # note(crcrpar): Massage only Scalar and ArrayRef<Scalar> here.
+                        if not (
+                            is_tensor_type(_foreach_arg.type)
+                            or is_tensor_list_type(_foreach_arg.type)
+                        ):
+                            pattern = _foreach_arg.name
+                            if isinstance(_foreach_arg.type, ListType):
+                                pattern += "[i]"
+                            foreach_forward_grad_formula = (
+                                foreach_forward_grad_formula.replace(
+                                    _ref_arg.name, pattern
+                                )
+                            )
+                else:
+                    if (
+                        "result" in foreach_forward_grad_formula
+                        and "result[i]" not in foreach_forward_grad_formula
+                    ):
+                        foreach_forward_grad_formula = (
+                            foreach_forward_grad_formula.replace("result", "result[i]")
+                        )
+
+                content.append(
+                    FW_DERIVATIVE_FOREACH_TEMPLATE.substitute(
+                        fw_grad_opt_definition=fw_grad_opt_definition,
+                        vector_of_optional_tensor=f"{'_'.join(res)}_new_fw_grad_opts",
+                        any_has_forward_grad_for_current_index=" || ".join(
+                            get_any_has_forward_grad_name(derivative.var_names) + "[i]"
+                            for derivative in fw_derivatives
+                        ),
+                        formula=foreach_forward_grad_formula,
+                        unpacked_arguments=unpacked_arguments,
+                    )
+                )
+
+        # Set all the grads at the end to avoid: https://github.com/pytorch/pytorch/issues/67367
+        content.append("\n".join(fw_grad_setters))
+        return content
+
+    def get_any_has_fw_grad_cond(derivative: Optional[ForwardDerivative]) -> str:
+        #
+        # Produces a condition string (e.g, "isFwGradDefined(grad_output) || isFwGradDefined(output)")
+        #
+        if derivative is None:
+            # (1) If a derivative is NOT provided, cond will check fw_grad of ALL differentiable inputs
+            # - Used in the out_fn case when we want to forbid fw derivatives
+            # - Used in the case where the fw_derivative is not defined, but we want
+            #   To check if there is a decomposition registered for jvp
+            to_check: List[str] = []
+            for inp in list(
+                mapMaybe(
+                    gen_differentiable_input,
+                    f.func.arguments.non_out + list(f.func.arguments.out),  # type: ignore[operator]
+                )
+            ):
+                if is_tensor_type(inp.type):
+                    to_check.append(
+                        FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name)
+                    )
+                elif is_tensor_list_type(inp.type):
+                    to_check.append(
+                        FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE.substitute(
+                            req_inp=inp.name
+                        )
+                    )
+                else:
+                    raise RuntimeError(
+                        f'Unsupported input type for "{name}" when forbidding forward AD usage.'
+                    )
+            return f'({" || ".join(to_check)})'
+        else:
+            # (2) If derivative is provided, use that information to determine which inputs
+            #     to check fw_grad for
+            assert derivative.required_inputs_fw_grad is not None
+
+            if len(derivative.required_inputs_fw_grad) == 0:
+                # Handle functions like stack
+                # For these, we don't unpack anything and always call the user function
+                if not (
+                    len(differentiable_inputs) == 1
+                    and is_tensor_list_type(differentiable_inputs[0].type)
+                ):
+                    raise RuntimeError(
+                        f'No differentiable input to "{name}" is a differentiable Tensor (as the provided '
+                        "forward AD formula does not use any input tangent) even though a forward gradient "
+                        "formula has been defined for it. This case should only happen for function that "
+                        "take a single TensorList as input. All other cases are not supported right now."
+                    )
+                any_has_fw_grad = "true"
+            else:
+                any_has_fw_grad = " || ".join(
+                    [
+                        (
+                            FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE
+                            if is_tensor_list_type(inp.type)
+                            else FW_DERIVATIVE_CHECK_TEMPLATE
+                        ).substitute(req_inp=inp.name)
+                        for inp in differentiable_inputs
+                        if inp.name in derivative.required_inputs_fw_grad
+                    ]
+                )
+                any_has_fw_grad = f"({any_has_fw_grad})"
+
+            return any_has_fw_grad
+
+    def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str:
+        if is_out_fn:
+            msg = "because it is an out= function"
+        else:
+            msg = (
+                "because it has not been implemented yet.\\nPlease file an issue "
+                "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml "
+                "so that we can prioritize its implementation."
+            )
+        cond = get_any_has_fw_grad_cond(derivative=None)
+        return (
+            FW_DERIVATIVE_FORBID_TEMPLATE.substitute(cond=cond, name=name, msg=msg)
+            if cond != ""
+            else ""
+        )
+
+    body: List[str] = []
+    unpack_args_stats, unpacked_bindings = unpack_args(f)
+
+    body.extend(unpack_args_stats)
+    if requires_derivative:
+        body.extend(emit_any_requires_grad())
+        body.extend(emit_any_has_forward_grad())
+        body.extend(emit_check_inplace())
+        body.extend(emit_original_self_definition())
+        body.extend(setup_derivative(differentiable_inputs))
+
+    body.append(emit_call(f, unpacked_bindings, try_jit_decomposition))
+    if requires_derivative:
+        # set_flags has to appear after version_counter, because rebase_history
+        # requires that the counter is incremented before it is called
+        body.append(emit_history())
+        body.extend(emit_check_if_in_complex_autograd_allowlist())
+
+    if is_out_fn:
+        body.append(emit_forbid_fw_derivatives(is_out_fn=True))
+    else:
+        if requires_derivative and not try_jit_decomposition:
+            if len(fw_derivatives) > 0:
+                body.extend(emit_fw_derivatives())
+            else:
+                body.append(emit_forbid_fw_derivatives())
+
+    if requires_derivative:
+        # Save only after the forward AD has been set up
+        body.append(emit_save_outputs())
+
+    if str(f.func.name.name) in RESET_GRAD_ACCUMULATOR:
+        # `inplace` implies that there is exactly one output named `self`,
+        # so we can keep the generated code easy. If you need to
+        # `reset_grad_accumulator` in an operator that's not `inplace`, you can
+        # remove this assert but the code generation will get more elaborate
+        assert inplace
+        body.append("reset_grad_accumulator(self);")
+    if not returns_void:
+        body.append(f"return {get_return_value(f)};")
+    return body
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_view_funcs.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_view_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f7561dca17d9b1e28c4f875648231b10ea3532
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/gen_view_funcs.py
@@ -0,0 +1,334 @@
+# Generates ViewFuncs.h/cpp
+#
+# NOTE: If any changes are being made to the ViewFunc codegen please also check
+# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
+# The fallback is expected to mimic this codegen, so we should keep the two in sync.
+
+from typing import List, Tuple
+
+import torchgen.api.dispatcher as dispatcher
+from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo
+from torchgen.api.translate import translate
+from torchgen.api.types import (
+    BaseCType,
+    Binding,
+    NamedCType,
+    SymIntT,
+    tensorT,
+    VectorCType,
+)
+from torchgen.code_template import CodeTemplate
+from torchgen.model import Argument, NativeFunction, OptionalType
+from torchgen.utils import FileManager
+
+from .gen_inplace_or_view_type import (
+    CALL_DISPATCH,
+    extract_bindings,
+    get_view_info,
+    modifies_arguments,
+    use_derived,
+)
+
+FUNCTION_DECLARATION = CodeTemplate(
+    """\
+#define ${uppercase_op}_AVAILABLE
+struct ${op} : public ${superclass} {
+  ${op}(${constructor_args}) ${initializer_list}
+  {};
+  virtual ~${op}() override {};
+  virtual std::vector<c10::SymInt> get_symints() const override;
+  virtual size_t num_symints() const override;
+  virtual std::vector<at::Tensor> get_tensors() const override;
+  virtual size_t num_tensors() const override;
+  virtual at::Tensor operator()(const at::Tensor&) const override;
+  virtual std::unique_ptr<ViewFunc> clone_and_set(
+      std::optional<std::vector<c10::SymInt>> = c10::nullopt,
+      std::optional<std::vector<at::Tensor>> = c10::nullopt) const override;
+
+protected:
+  virtual void set_symints(std::vector<c10::SymInt>) override;
+  virtual void set_tensors(std::vector<at::Tensor>) override;
+
+private:
+  ${state}
+};
+
+"""
+)
+
+FUNCTION_DEFINITION = CodeTemplate(
+    """\
+std::vector<c10::SymInt> ${op}::get_symints() const {
+  ${get_symints}
+}
+
+size_t ${op}::num_symints() const {
+  return static_cast<size_t>(${num_symints});
+}
+
+void ${op}::set_symints(std::vector<c10::SymInt> ${symints_vec}) {
+  TORCH_INTERNAL_ASSERT(${symints_vec}.size() == num_symints());
+  ${set_symints}
+}
+
+std::vector<at::Tensor> ${op}::get_tensors() const {
+  ${get_tensors}
+}
+
+size_t ${op}::num_tensors() const {
+  return static_cast<size_t>(${num_tensors});
+}
+
+void ${op}::set_tensors(std::vector<at::Tensor> ${tensors_vec}) {
+  TORCH_INTERNAL_ASSERT(${tensors_vec}.size() == num_tensors());
+  ${set_tensors}
+}
+
+at::Tensor ${op}::operator()(const at::Tensor& ${call_input_name}) const {
+  return ${op_call};
+}
+
+std::unique_ptr<ViewFunc> ${op}::clone_and_set(
+    std::optional<std::vector<c10::SymInt>> ${symints_vec},
+    std::optional<std::vector<at::Tensor>> ${tensors_vec}) const {
+  auto output = std::make_unique<${op}>(${clone_args});
+  if (${symints_vec}.has_value()) {
+    output->set_symints(std::move(*(${symints_vec})));
+  }
+  if (${tensors_vec}.has_value()) {
+    output->set_tensors(std::move(*(${tensors_vec})));
+  }
+  return output;
+}
+
+"""
+)
+
+
+# e.g. as_strided -> AsStridedViewFunc for camel case or
+# as_strided_view_func otherwise
+def view_func_name(
+    f: NativeFunction, include_namespace: bool = False, camel_case: bool = True
+) -> str:
+    name = f.func.name.unambiguous_name()
+    view_func_name = f"{name.replace('.', '_')}_view_func"
+    if camel_case:
+        is_private = view_func_name.startswith("_")
+        view_func_name = "".join(
+            [p.title() for p in view_func_name.replace(".", "_").split("_")]
+        )
+        if is_private:
+            # put the leading underscore back in
+            view_func_name = f"_{view_func_name}"
+    namespace = "torch::autograd::generated::" if include_namespace else ""
+    return f"{namespace}{view_func_name}"
+
+
+def is_symint_or_tensor(arg: Argument) -> bool:
+    return arg.type.is_tensor_like() or arg.type.is_symint_like()
+
+
+def remove_const_ref(binding: Binding) -> Binding:
+    return Binding(
+        name=binding.name,
+        nctype=binding.nctype.remove_const_ref(),
+        argument=binding.argument,
+        default=binding.default,
+    )
+
+
+def returns_multi_tensor(fn: NativeFunction) -> bool:
+    returns = fn.func.returns
+    assert len(returns) == 1
+    returns_list_like = returns[0].type.is_list_like() is not None
+    returns_tensor_like = returns[0].type.is_tensor_like()
+    return returns_list_like and returns_tensor_like
+
+
+# Generates strings with logic for getting / setting state of a particular type.
+#
+# Args:
+#   bindings (list): List of state bindings of interest (may be empty)
+#   state_vec_type (NamedCType): Type of vector to either return or copy from
+#
+# Returns:
+#   tuple: (list of getter logic strings, list of setter logic strings, string
+#     with num items expression)
+def generate_state_getter_setter(
+    bindings: List[Binding],
+    state_vec_type: NamedCType,
+) -> Tuple[List[str], List[str], str]:
+    getter_logic = []
+    setter_logic = []
+
+    state_vec = state_vec_type.name
+    getter_logic.append(f"{state_vec_type.cpp_type()} {state_vec};")
+    if len(bindings) > 0:
+        setter_logic.append("auto i = 0;")
+
+    num_exprs = []
+    for i, b in enumerate(bindings):
+        assert isinstance(b.argument, Argument)
+        if b.argument.type.is_list_like():
+            # Handle list-likes.
+            num_expr = f"{b.name}.size()"
+            num_exprs.append(num_expr)
+            getter = f"{state_vec}.insert({state_vec}.end(), {b.name}.begin(), {b.name}.end());"
+            setter = f"std::copy({state_vec}.begin() + i, {state_vec}.begin() + i + {b.name}.size(), {b.name}.begin());"
+        elif isinstance(b.argument.type, OptionalType):
+            # Handle optionals.
+            num_expr = f"({b.name}.has_value() ? 1 : 0)"
+            num_exprs.append(num_expr)
+            conditional = f"if({b.name}.has_value())"
+            getter = (
+                f"{conditional} {state_vec}.insert({state_vec}.end(), *({b.name}));"
+            )
+            setter = f"{conditional} {b.name} = {state_vec}[i];"
+        else:
+            num_expr = "1"
+            num_exprs.append(num_expr)
+            getter = f"{state_vec}.push_back({b.name});"
+            setter = f"{b.name} = {state_vec}[i];"
+
+        getter_logic.append(getter)
+        setter_logic.append(setter)
+        if i < len(bindings) - 1:
+            setter_logic.append(f"i += {num_expr};")
+
+    # Reserve / assert based on the total number of items expression.
+    num_items = "0" if len(num_exprs) == 0 else " + ".join(num_exprs)
+    if len(bindings) > 0:
+        getter_logic.insert(1, f"{state_vec}.reserve({num_items});")
+
+    getter_logic.append(f"return {state_vec};")
+
+    return getter_logic, setter_logic, num_items
+
+
+def process_function(fn: NativeFunction, template: CodeTemplate) -> str:
+    bindings = extract_bindings(fn)
+    non_self_bindings = [b for b in bindings if b.name != "self"]
+
+    non_self_args = fn.func.arguments.flat_all[1:]
+    non_self_value_bindings = [
+        dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args
+    ]
+
+    # Generate constructor / clone args for the generated struct.
+    constructor_args = [b.defn() for b in non_self_bindings]
+    clone_args = [b.name for b in non_self_bindings]
+
+    # Generate state variable declarations for the generated struct.
+    state_variables = [
+        f"{remove_const_ref(b).defn()};" for b in non_self_value_bindings
+    ]
+
+    # Generate initializer list expressions for the generated struct.
+    # allow_expensive_conversions=True because we need to store e.g. SymIntArrayRefs as
+    # vector<SymInt>s.
+    init_exprs = translate(
+        non_self_bindings, non_self_value_bindings, allow_expensive_conversions=True
+    )
+    initializers = []
+    for b, init_expr in zip(non_self_bindings, init_exprs):
+        name = b.nctype.name
+        assert isinstance(name, str)
+        initializers.append(f"{name}({init_expr.expr})")
+
+    # Generate call to underlying view op
+    call_input_name = "input_base"
+    op_call_args = [call_input_name, *(b.name for b in non_self_bindings)]
+    op_call = CALL_DISPATCH.substitute(
+        unambiguous_name=fn.func.name.unambiguous_name(),
+        unpacked_args=op_call_args,
+    )
+
+    # Multi-output views additionally require a view_idx for disambiguation.
+    if returns_multi_tensor(fn):
+        view_idx_name = "view_idx"
+        view_idx_typename = "int64_t"
+        view_idx_decl = f"{view_idx_typename} {view_idx_name}"
+        constructor_args.append(view_idx_decl)
+        clone_args.append(view_idx_name)
+        state_variables.append(f"{view_idx_decl};")
+        initializers.append(f"{view_idx_name}({view_idx_name})")
+        op_call += f"[{view_idx_name}]"
+
+    # Generate initializer list for the generated struct.
+    initializer_list = f": {', '.join(initializers)}" if len(initializers) > 0 else ""
+
+    # Generate getter / setter logic for any symints.
+    symint_bindings = [
+        b
+        for b in non_self_bindings
+        if isinstance(b.argument, Argument) and b.argument.type.is_symint_like()
+    ]
+    symints_vec_type = NamedCType("symints", VectorCType(BaseCType(SymIntT)))
+    get_symints, set_symints, num_symints = generate_state_getter_setter(
+        symint_bindings, symints_vec_type
+    )
+
+    # Generate getter / setter logic for any tensors.
+    tensor_bindings = [
+        b
+        for b in non_self_bindings
+        if isinstance(b.argument, Argument) and b.argument.type.is_tensor_like()
+    ]
+    tensors_vec_type = NamedCType("tensors", VectorCType(BaseCType(tensorT)))
+    get_tensors, set_tensors, num_tensors = generate_state_getter_setter(
+        tensor_bindings, tensors_vec_type
+    )
+
+    return template.substitute(
+        op=view_func_name(fn),
+        uppercase_op=view_func_name(fn, camel_case=False).upper(),
+        superclass="torch::autograd::ViewFunc",
+        initializer_list=initializer_list,
+        state=state_variables,
+        constructor_args=constructor_args,
+        clone_args=clone_args,
+        symints_vec=symints_vec_type.name,
+        get_symints=get_symints,
+        set_symints=set_symints,
+        num_symints=num_symints,
+        tensors_vec=tensors_vec_type.name,
+        get_tensors=get_tensors,
+        set_tensors=set_tensors,
+        num_tensors=num_tensors,
+        call_input_name=call_input_name,
+        op_call=op_call,
+    )
+
+
+def gen_view_funcs(
+    out: str,
+    fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo],
+    template_path: str,
+) -> None:
+    # don't need the info parts, just the function
+    fns = [fn.func for fn in fns_with_infos if use_derived(fn)]
+    # only want out-of-place views
+    view_fns = [
+        fn for fn in fns if get_view_info(fn) is not None and not modifies_arguments(fn)
+    ]
+
+    declarations = [process_function(fn, FUNCTION_DECLARATION) for fn in view_fns]
+    definitions = [process_function(fn, FUNCTION_DEFINITION) for fn in view_fns]
+    ops_headers = [f"#include <ATen/ops/{fn.root_name}_ops.h>" for fn in view_fns]
+
+    file_basename = "ViewFuncs"
+    fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
+    for suffix in [".h", ".cpp"]:
+        fname = file_basename + suffix
+        fm.write_with_template(
+            fname,
+            fname,
+            lambda: {
+                "generated_comment": "@"
+                + f"generated from {fm.template_dir_for_comments()}/"
+                + fname,
+                "view_func_declarations": declarations,
+                "view_func_definitions": definitions,
+                "ops_headers": ops_headers,
+            },
+        )
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/templates/python_nested_functions.cpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/templates/python_nested_functions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3acb5128cee1e180de887080106e7cf5559f15ee
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/packaged/autograd/templates/python_nested_functions.cpp
@@ -0,0 +1,81 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+// ${generated_comment}
+
+#include "torch/csrc/Device.h"
+#include "torch/csrc/DynamicTypes.h"
+#include "torch/csrc/Exceptions.h"
+#include "torch/csrc/autograd/python_nested_functions.h"
+#include "torch/csrc/autograd/generated/python_return_types.h"
+#include "torch/csrc/autograd/python_variable.h"
+#include "torch/csrc/autograd/utils/wrap_outputs.h"
+#include "torch/csrc/autograd/utils/python_arg_parsing.h"
+#include "torch/csrc/autograd/generated/variable_factories.h"
+#include "torch/csrc/utils/out_types.h"
+#include "torch/csrc/utils/pycfunction_helpers.h"
+#include "torch/csrc/utils/python_arg_parser.h"
+#include "torch/csrc/utils/structseq.h"
+#include "torch/csrc/utils/device_lazy_init.h"
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+$ops_headers
+#endif
+
+using at::Tensor;
+using at::Device;
+using at::Layout;
+using at::Scalar;
+using at::ScalarType;
+using at::Backend;
+using at::OptionalDeviceGuard;
+using at::DeviceGuard;
+using at::TensorOptions;
+using at::IntArrayRef;
+using at::OptionalIntArrayRef;
+using at::Generator;
+using at::TensorList;
+using at::Dimname;
+using at::DimnameList;
+
+using namespace torch::autograd::utils;
+
+namespace torch::autograd {
+
+// generated forward declarations start here
+
+${py_forwards}
+
+static PyMethodDef nested_functions[] = {
+  {NULL, NULL, 0, NULL},
+  ${py_method_defs}
+  {NULL}
+};
+
+static PyObject* THPNestedVariableFunctionsModule = NULL;
+
+void initNestedFunctions(PyObject* module) {
+  nested_functions[0] = get_nested_functions_manual()[0];
+  static struct PyModuleDef def = {
+     PyModuleDef_HEAD_INIT,
+     "torch._C._nested",
+     NULL,
+     -1,
+     nested_functions
+  };
+  PyObject* nested = PyModule_Create(&def);
+  THPNestedVariableFunctionsModule = nested;
+  if (!nested) {
+    throw python_error();
+  }
+  // steals a reference to nested
+  if (PyModule_AddObject(module, "_nested", nested) != 0) {
+    throw python_error();
+  }
+}
+
+// generated methods start here
+
+${py_methods}
+
+} // namespace torch::autograd