koichi12 commited on
Commit
1c6d8d5
·
verified ·
1 Parent(s): e0a38d8

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestCyCache.py +119 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestDependencies.py +142 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestIpythonMagic.py +295 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestStripLiterals.py +56 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestCythonizeArgsParser.cpython-311.pyc +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestDependencies.cpython-311.pyc +0 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestInline.cpython-311.pyc +0 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestIpythonMagic.cpython-311.pyc +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestRecythonize.cpython-311.pyc +0 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestStripLiterals.cpython-311.pyc +0 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Buffer.py +749 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Code.py +0 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/FusedNode.cpython-311-x86_64-linux-gnu.so +3 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Parsing.py +0 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__init__.py +1 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Lexicon.cpython-311.pyc +0 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Naming.cpython-311.pyc +0 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Options.cpython-311.pyc +0 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Pythran.cpython-311.pyc +0 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Scanning.cpython-311.pyc +0 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/UtilityCode.cpython-311.pyc +0 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx/generators/atlas.dat.gz +3 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/__init__.py +23 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/__pycache__/_compat.cpython-311.pyc +0 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/_in_process/__init__.py +18 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/_in_process/__pycache__/__init__.cpython-311.pyc +0 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/code_template.cpython-311.pyc +0 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/context.cpython-311.pyc +0 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_aoti_c_shim.cpython-311.pyc +0 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_backend_stubs.cpython-311.pyc +0 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_executorch.cpython-311.pyc +0 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_functionalization_type.cpython-311.pyc +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_lazy_tensor.cpython-311.pyc +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_vmap_plumbing.cpython-311.pyc +0 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/local.cpython-311.pyc +0 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/native_function_generation.cpython-311.pyc +0 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/yaml_utils.cpython-311.pyc +0 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/__init__.py +19 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/__pycache__/lazy_ts_lowering.cpython-311.pyc +0 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/__pycache__/ufunc.cpython-311.pyc +0 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/lazy_ir.py +707 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/lazy_ts_lowering.py +48 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/native_functions.py +64 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/register_dispatch_key.py +989 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/ufunc.py +545 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/__init__.py +0 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/__pycache__/__init__.cpython-311.pyc +0 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/__pycache__/parse.cpython-311.pyc +0 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/api/__pycache__/__init__.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -38,3 +38,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx/algorith
38
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Symtab.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
39
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Plex/Scanners.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
40
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Plex/DFA.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 
 
38
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Symtab.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
39
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Plex/Scanners.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
40
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Plex/DFA.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
41
+ tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/FusedNode.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestCyCache.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import difflib
2
+ import glob
3
+ import gzip
4
+ import os
5
+ import sys
6
+ import tempfile
7
+ import unittest
8
+
9
+ import Cython.Build.Dependencies
10
+ import Cython.Utils
11
+ from Cython.TestUtils import CythonTest
12
+
13
+
14
class TestCyCache(CythonTest):
    """Tests for cythonize()'s on-disk result cache (the ``cache=`` option).

    Each test works in a fresh temporary source/cache directory pair and
    checks how many entries accumulate in the cache via glob patterns.
    """

    def setUp(self):
        # Place temp dirs under TEST_TMP when the test runner provides it,
        # otherwise fall back to the system default temp location.
        CythonTest.setUp(self)
        self.temp_dir = tempfile.mkdtemp(
            prefix='cycache-test',
            dir='TEST_TMP' if os.path.isdir('TEST_TMP') else None)
        self.src_dir = tempfile.mkdtemp(prefix='src', dir=self.temp_dir)
        self.cache_dir = tempfile.mkdtemp(prefix='cache', dir=self.temp_dir)

    def cache_files(self, file_glob):
        """Return the cache entries matching *file_glob* (a glob pattern)."""
        return glob.glob(os.path.join(self.cache_dir, file_glob))

    def fresh_cythonize(self, *args, **kwargs):
        """Run cythonize() with all in-process caches cleared first, so only
        the on-disk cache under test can satisfy repeated compilations."""
        Cython.Utils.clear_function_caches()
        Cython.Build.Dependencies._dep_tree = None  # discard method caches
        Cython.Build.Dependencies.cythonize(*args, **kwargs)

    def test_cycache_switch(self):
        """Switching a source back and forth between two contents must create
        exactly one cache entry per distinct content, and restoring old
        content must reproduce the previously generated C file."""
        content1 = 'value = 1\n'
        content2 = 'value = 2\n'
        a_pyx = os.path.join(self.src_dir, 'a.pyx')
        a_c = a_pyx[:-4] + '.c'

        with open(a_pyx, 'w') as f:
            f.write(content1)
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        # Two runs over identical input: still a single cache entry.
        self.assertEqual(1, len(self.cache_files('a.c*')))
        with open(a_c) as f:
            a_contents1 = f.read()
        os.unlink(a_c)

        with open(a_pyx, 'w') as f:
            f.write(content2)
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        with open(a_c) as f:
            a_contents2 = f.read()
        os.unlink(a_c)

        self.assertNotEqual(a_contents1, a_contents2, 'C file not changed!')
        self.assertEqual(2, len(self.cache_files('a.c*')))

        # Going back to the first content must hit the cache (no third entry)
        # and regenerate the exact same C file as before.
        with open(a_pyx, 'w') as f:
            f.write(content1)
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        self.assertEqual(2, len(self.cache_files('a.c*')))
        with open(a_c) as f:
            a_contents = f.read()
        self.assertEqual(
            a_contents, a_contents1,
            msg='\n'.join(list(difflib.unified_diff(
                a_contents.split('\n'), a_contents1.split('\n')))[:10]))

    def test_cycache_uses_cache(self):
        """A repeated run must restore the output from the cache entry: we
        overwrite the (gzipped) cache entry with a marker payload and expect
        that payload to reappear as the regenerated .c file."""
        a_pyx = os.path.join(self.src_dir, 'a.pyx')
        a_c = a_pyx[:-4] + '.c'
        with open(a_pyx, 'w') as f:
            f.write('pass')
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        a_cache = os.path.join(self.cache_dir, os.listdir(self.cache_dir)[0])
        with gzip.GzipFile(a_cache, 'wb') as gzipfile:
            gzipfile.write('fake stuff'.encode('ascii'))
        os.unlink(a_c)
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        with open(a_c) as f:
            a_contents = f.read()
        self.assertEqual(a_contents, 'fake stuff',
                         'Unexpected contents: %s...' % a_contents[:100])

    def test_multi_file_output(self):
        """All generated files (.c, .h, _api.h for a public api function)
        must be restored together on a cache hit."""
        a_pyx = os.path.join(self.src_dir, 'a.pyx')
        a_c = a_pyx[:-4] + '.c'
        a_h = a_pyx[:-4] + '.h'
        a_api_h = a_pyx[:-4] + '_api.h'
        with open(a_pyx, 'w') as f:
            f.write('cdef public api int foo(int x): return x\n')
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        expected = [a_c, a_h, a_api_h]
        for output in expected:
            self.assertTrue(os.path.exists(output), output)
            os.unlink(output)
        # Second run: every output must come back from the cache.
        self.fresh_cythonize(a_pyx, cache=self.cache_dir)
        for output in expected:
            self.assertTrue(os.path.exists(output), output)

    def test_options_invalidation(self):
        """Options that affect the generated code (cplus) must invalidate the
        cache; options that don't (show_version) must not."""
        hash_pyx = os.path.join(self.src_dir, 'options.pyx')
        hash_c = hash_pyx[:-len('.pyx')] + '.c'

        with open(hash_pyx, 'w') as f:
            f.write('pass')
        self.fresh_cythonize(hash_pyx, cache=self.cache_dir, cplus=False)
        self.assertEqual(1, len(self.cache_files('options.c*')))

        os.unlink(hash_c)
        # cplus changes the output language -> a new cache entry.
        self.fresh_cythonize(hash_pyx, cache=self.cache_dir, cplus=True)
        self.assertEqual(2, len(self.cache_files('options.c*')))

        os.unlink(hash_c)
        # show_version does not affect the output -> entry count unchanged.
        self.fresh_cythonize(hash_pyx, cache=self.cache_dir, cplus=False, show_version=False)
        self.assertEqual(2, len(self.cache_files('options.c*')))

        os.unlink(hash_c)
        self.fresh_cythonize(hash_pyx, cache=self.cache_dir, cplus=False, show_version=True)
        self.assertEqual(2, len(self.cache_files('options.c*')))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestDependencies.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import os.path
3
+ import sys
4
+ import tempfile
5
+ import unittest
6
+ from io import open
7
+ from os.path import join as pjoin
8
+
9
+ from ..Dependencies import extended_iglob
10
+
11
+
12
@contextlib.contextmanager
def writable_file(dir_path, filename):
    """Open *filename* inside *dir_path* for writing (UTF-8) and yield the
    handle; the file is closed when the context exits."""
    handle = open(pjoin(dir_path, filename), "w", encoding="utf8")
    try:
        yield handle
    finally:
        handle.close()
16
+
17
+
18
class TestGlobbing(unittest.TestCase):
    """Tests for extended_iglob()'s pattern syntax (``{a,b}`` alternatives
    and ``**`` recursion) against a fixed directory tree.

    The fixture builds, inside a temp dir that becomes the CWD:
    a, ax, b, bx, c, cx, d, dx — each containing x/y/z subdirs; every
    subdir holds file2_pyx.pyx + file2_py.py, every top dir holds
    file1_pyx.pyx + file1_py.py.
    """

    @classmethod
    def setUpClass(cls):
        cls._orig_dir = os.getcwd()
        # Py2 has no TemporaryDirectory; keep a plain path there and an
        # object (cleaned up in tearDownClass) on Py3.
        if sys.version_info[0] < 3:
            temp_path = cls._tmpdir = tempfile.mkdtemp()
        else:
            cls._tmpdir = tempfile.TemporaryDirectory()
            temp_path = cls._tmpdir.name
        os.chdir(temp_path)

        for dir1 in "abcd":
            for dir1x in [dir1, dir1 + 'x']:
                for dir2 in "xyz":
                    dir_path = pjoin(dir1x, dir2)
                    os.makedirs(dir_path)
                    with writable_file(dir_path, "file2_pyx.pyx") as f:
                        f.write(u'""" PYX """')
                    with writable_file(dir_path, "file2_py.py") as f:
                        f.write(u'""" PY """')

                with writable_file(dir1x, "file1_pyx.pyx") as f:
                    f.write(u'""" PYX """')
                with writable_file(dir1x, "file1_py.py") as f:
                    f.write(u'""" PY """')

    @classmethod
    def tearDownClass(cls):
        # Restore the CWD before removing the tree we were standing in.
        os.chdir(cls._orig_dir)
        if sys.version_info[0] < 3:
            import shutil
            shutil.rmtree(cls._tmpdir)
        else:
            cls._tmpdir.cleanup()

    def files_equal(self, pattern, expected_files):
        """Assert that *pattern* matches exactly *expected_files* (order-insensitive)."""
        expected_files = sorted(expected_files)
        # It's the users's choice whether '/' will appear on Windows.
        matched_files = sorted(path.replace('/', os.sep) for path in extended_iglob(pattern))
        self.assertListEqual(matched_files, expected_files)  # /

        # Special case for Windows: also support '\' in patterns.
        if os.sep == '\\' and '/' in pattern:
            matched_files = sorted(extended_iglob(pattern.replace('/', '\\')))
            self.assertListEqual(matched_files, expected_files)  # \

    def test_extended_iglob_simple(self):
        # One fixed directory; exercise the {a,b} alternative syntax.
        ax_files = [pjoin("a", "x", "file2_pyx.pyx"), pjoin("a", "x", "file2_py.py")]
        self.files_equal("a/x/*", ax_files)
        self.files_equal("a/x/*.c12", [])
        self.files_equal("a/x/*.{py,pyx,c12}", ax_files)
        self.files_equal("a/x/*.{py,pyx}", ax_files)
        self.files_equal("a/x/*.{pyx}", ax_files[:1])
        self.files_equal("a/x/*.pyx", ax_files[:1])
        self.files_equal("a/x/*.{py}", ax_files[1:])
        self.files_equal("a/x/*.py", ax_files[1:])

    def test_extended_iglob_simple_star(self):
        # Single-level '*' over the x/y/z subdirs of two base dirs.
        # files alternates .pyx/.py, so [::2] is the pyx half, [1::2] the py half.
        for basedir in "ad":
            files = [
                pjoin(basedir, dirname, filename)
                for dirname in "xyz"
                for filename in ["file2_pyx.pyx", "file2_py.py"]
            ]
            self.files_equal(basedir + "/*/*", files)
            self.files_equal(basedir + "/*/*.c12", [])
            self.files_equal(basedir + "/*/*.{py,pyx,c12}", files)
            self.files_equal(basedir + "/*/*.{py,pyx}", files)
            self.files_equal(basedir + "/*/*.{pyx}", files[::2])
            self.files_equal(basedir + "/*/*.pyx", files[::2])
            self.files_equal(basedir + "/*/*.{py}", files[1::2])
            self.files_equal(basedir + "/*/*.py", files[1::2])

            # '*' as the middle component must behave like listing each subdir.
            for subdir in "xy*":
                files = [
                    pjoin(basedir, dirname, filename)
                    for dirname in "xyz"
                    if subdir in ('*', dirname)
                    for filename in ["file2_pyx.pyx", "file2_py.py"]
                ]
                path = basedir + '/' + subdir + '/'
                self.files_equal(path + "*", files)
                self.files_equal(path + "*.{py,pyx}", files)
                self.files_equal(path + "*.{pyx}", files[::2])
                self.files_equal(path + "*.pyx", files[::2])
                self.files_equal(path + "*.{py}", files[1::2])
                self.files_equal(path + "*.py", files[1::2])

    def test_extended_iglob_double_star(self):
        # Recursive '**' patterns: 'files' are the second-level files only,
        # 'all_files' additionally include the first-level file1_* files.
        basedirs = os.listdir(".")
        files = [
            pjoin(basedir, dirname, filename)
            for basedir in basedirs
            for dirname in "xyz"
            for filename in ["file2_pyx.pyx", "file2_py.py"]
        ]
        all_files = [
            pjoin(basedir, filename)
            for basedir in basedirs
            for filename in ["file1_pyx.pyx", "file1_py.py"]
        ] + files
        self.files_equal("*/*/*", files)
        self.files_equal("*/*/**/*", files)
        self.files_equal("*/**/*.*", all_files)
        self.files_equal("**/*.*", all_files)
        self.files_equal("*/**/*.c12", [])
        self.files_equal("**/*.c12", [])
        self.files_equal("*/*/*.{py,pyx,c12}", files)
        self.files_equal("*/*/**/*.{py,pyx,c12}", files)
        self.files_equal("*/**/*/*.{py,pyx,c12}", files)
        self.files_equal("**/*/*/*.{py,pyx,c12}", files)
        self.files_equal("**/*.{py,pyx,c12}", all_files)
        self.files_equal("*/*/*.{py,pyx}", files)
        self.files_equal("**/*/*/*.{py,pyx}", files)
        self.files_equal("*/**/*/*.{py,pyx}", files)
        self.files_equal("**/*.{py,pyx}", all_files)
        self.files_equal("*/*/*.{pyx}", files[::2])
        self.files_equal("**/*.{pyx}", all_files[::2])
        self.files_equal("*/**/*/*.pyx", files[::2])
        self.files_equal("*/*/*.pyx", files[::2])
        self.files_equal("**/*.pyx", all_files[::2])
        self.files_equal("*/*/*.{py}", files[1::2])
        self.files_equal("**/*.{py}", all_files[1::2])
        self.files_equal("*/*/*.py", files[1::2])
        self.files_equal("**/*.py", all_files[1::2])
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestIpythonMagic.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # tag: ipython
3
+
4
+ """Tests for the Cython magics extension."""
5
+
6
+ from __future__ import absolute_import
7
+
8
+ import os
9
+ import io
10
+ import sys
11
+ from contextlib import contextmanager
12
+ from unittest import skipIf
13
+
14
+ from Cython.Build import IpythonMagic
15
+ from Cython.TestUtils import CythonTest
16
+ from Cython.Compiler.Annotate import AnnotationCCodeWriter
17
+
18
try:
    import IPython.testing.globalipapp
except ImportError:
    # Disable tests and fake helpers for initialisation below.
    # Returning None makes the decorated test class disappear entirely.
    def skip_if_not_installed(_):
        return None
else:
    # IPython is available: the decorator passes the class through unchanged.
    def skip_if_not_installed(c):
        return c

# not using IPython's decorators here because they depend on "nose"
skip_win32 = skipIf(sys.platform == 'win32', "Skip on Windows")
skip_py27 = skipIf(sys.version_info[:2] == (2,7), "Disabled in Py2.7")

try:
    # disable IPython history thread before it gets started to avoid having to clean it up
    from IPython.core.history import HistoryManager
    HistoryManager.enabled = False
except ImportError:
    pass
38
+
39
+
40
@contextmanager
def capture_output():
    """Temporarily replace sys.stdout/sys.stderr with in-memory streams.

    Yields a list that, after the context exits, contains two strings:
    everything written to stdout and everything written to stderr.
    """
    saved_streams = (sys.stdout, sys.stderr)
    try:
        fake_streams = [
            io.TextIOWrapper(io.BytesIO(), encoding=sys.stdout.encoding),
            io.TextIOWrapper(io.BytesIO(), encoding=sys.stderr.encoding),
        ]
        sys.stdout, sys.stderr = fake_streams
        captured = []
        yield captured
    finally:
        # Restore the real streams first, then harvest what was written.
        sys.stdout, sys.stderr = saved_streams
        for stream in fake_streams:
            stream.seek(0)
            captured.append(stream.read())
            stream.close()
57
+
58
+
59
# Cell bodies fed to the %%cython magics in the tests below.
# NOTE: these are runtime fixtures — their exact text matters.

# Minimal Python-level cell: f() doubles its argument.
code = u"""\
def f(x):
    return 2*x
"""

# Division behaviour differs between language levels 2 and 3, which the
# language-level tests rely on (2 / x: true vs. floor division).
cython3_code = u"""\
def f(int x):
    return 2 / x

def call(x):
    return f(*(x,))
"""

# Adds a main() workload so profile-guided optimisation has something to run.
pgo_cython3_code = cython3_code + u"""\
def main():
    for _ in range(100): call(5)
main()
"""

# Intentionally invalid C in a verbatim extern block -> C compile error.
compile_error_code = u'''\
cdef extern from *:
    """
    xxx a=1;
    """
    int a;
def doit():
    return a
'''

# Valid C that emits a "CWarning" pragma message during compilation.
compile_warning_code = u'''\
cdef extern from *:
    """
    #pragma message ( "CWarning" )
    int a = 42;
    """
    int a;
def doit():
    return a
'''
98
+
99
+
100
@skip_if_not_installed
class TestIPythonMagic(CythonTest):
    """Tests for the %%cython / %%cython_inline / %%cython_pyximport magics.

    The whole class is dropped (decorator returns None) when IPython is not
    importable. All tests share one global IPython instance (``cls._ip``).
    """

    @classmethod
    def setUpClass(cls):
        CythonTest.setUpClass()
        cls._ip = IPython.testing.globalipapp.get_ipython()

    def setUp(self):
        CythonTest.setUp(self)
        self._ip.extension_manager.load_extension('cython')

    def test_cython_inline(self):
        # %%cython_inline evaluates the cell with the user namespace in scope.
        ip = self._ip
        ip.ex('a=10; b=20')
        result = ip.run_cell_magic('cython_inline', '', 'return a+b')
        self.assertEqual(result, 30)

    @skip_win32
    def test_cython_pyximport(self):
        # Running the magic twice must rebuild/reload the module.
        ip = self._ip
        module_name = '_test_cython_pyximport'
        ip.run_cell_magic('cython_pyximport', module_name, code)
        ip.ex('g = f(10)')
        self.assertEqual(ip.user_ns['g'], 20.0)
        ip.run_cell_magic('cython_pyximport', module_name, code)
        ip.ex('h = f(-10)')
        self.assertEqual(ip.user_ns['h'], -20.0)
        try:
            os.remove(module_name + '.pyx')
        except OSError:
            pass

    def test_cython(self):
        ip = self._ip
        ip.run_cell_magic('cython', '', code)
        ip.ex('g = f(10)')
        self.assertEqual(ip.user_ns['g'], 20.0)

    def test_cython_name(self):
        # The Cython module named 'mymodule' defines the function f.
        ip = self._ip
        ip.run_cell_magic('cython', '--name=mymodule', code)
        # This module can now be imported in the interactive namespace.
        ip.ex('import mymodule; g = mymodule.f(10)')
        self.assertEqual(ip.user_ns['g'], 20.0)

    def test_cython_language_level(self):
        # The Cython cell defines the functions f() and call().
        # Default language level follows the running Python major version.
        ip = self._ip
        ip.run_cell_magic('cython', '', cython3_code)
        ip.ex('g = f(10); h = call(10)')
        if sys.version_info[0] < 3:
            self.assertEqual(ip.user_ns['g'], 2 // 10)
            self.assertEqual(ip.user_ns['h'], 2 // 10)
        else:
            self.assertEqual(ip.user_ns['g'], 2.0 / 10.0)
            self.assertEqual(ip.user_ns['h'], 2.0 / 10.0)

    def test_cython3(self):
        # The Cython cell defines the functions f() and call().
        # '-3' forces language level 3: '/' is true division.
        ip = self._ip
        ip.run_cell_magic('cython', '-3', cython3_code)
        ip.ex('g = f(10); h = call(10)')
        self.assertEqual(ip.user_ns['g'], 2.0 / 10.0)
        self.assertEqual(ip.user_ns['h'], 2.0 / 10.0)

    def test_cython2(self):
        # The Cython cell defines the functions f() and call().
        # '-2' forces language level 2: '/' on ints floors.
        ip = self._ip
        ip.run_cell_magic('cython', '-2', cython3_code)
        ip.ex('g = f(10); h = call(10)')
        self.assertEqual(ip.user_ns['g'], 2 // 10)
        self.assertEqual(ip.user_ns['h'], 2 // 10)

    def test_cython_compile_error_shown(self):
        ip = self._ip
        with capture_output() as out:
            ip.run_cell_magic('cython', '-3', compile_error_code)
        captured_out, captured_err = out

        # it could be that c-level output is captured by distutil-extension
        # (and not by us) and is printed to stdout:
        captured_all = captured_out + "\n" + captured_err
        self.assertTrue("error" in captured_all, msg="error in " + captured_all)

    def test_cython_link_error_shown(self):
        ip = self._ip
        with capture_output() as out:
            ip.run_cell_magic('cython', '-3 -l=xxxxxxxx', code)
        captured_out, captured_err = out

        # it could be that c-level output is captured by distutil-extension
        # (and not by us) and is printed to stdout:
        captured_all = captured_out + "\n!" + captured_err
        self.assertTrue("error" in captured_all, msg="error in " + captured_all)

    def test_cython_warning_shown(self):
        ip = self._ip
        with capture_output() as out:
            # force rebuild, otherwise no warning as after the first success
            # no build step is performed
            ip.run_cell_magic('cython', '-3 -f', compile_warning_code)
        captured_out, captured_err = out

        # check that warning was printed to stdout even if build hasn't failed
        self.assertTrue("CWarning" in captured_out)

    @skip_py27  # Not strictly broken in Py2.7 but currently fails in CI due to C compiler issues.
    @skip_win32
    def test_cython3_pgo(self):
        # The Cython cell defines the functions f() and call().
        ip = self._ip
        ip.run_cell_magic('cython', '-3 --pgo', pgo_cython3_code)
        ip.ex('g = f(10); h = call(10); main()')
        self.assertEqual(ip.user_ns['g'], 2.0 / 10.0)
        self.assertEqual(ip.user_ns['h'], 2.0 / 10.0)

    @skip_win32
    def test_extlibs(self):
        ip = self._ip
        # NOTE: local 'code' deliberately shadows the module-level constant.
        code = u"""
from libc.math cimport sin
x = sin(0.0)
        """
        ip.user_ns['x'] = 1
        ip.run_cell_magic('cython', '-l m', code)
        self.assertEqual(ip.user_ns['x'], 0)

    def test_cython_verbose(self):
        ip = self._ip
        ip.run_cell_magic('cython', '--verbose', code)
        ip.ex('g = f(10)')
        self.assertEqual(ip.user_ns['g'], 20.0)

    def test_cython_verbose_thresholds(self):
        @contextmanager
        def mock_distutils():
            # Records every set_threshold() call so we can assert the
            # DEBUG/INFO transitions performed by the magic.
            class MockLog:
                DEBUG = 1
                INFO = 2
                thresholds = [INFO]

                def set_threshold(self, val):
                    self.thresholds.append(val)
                    return self.thresholds[-2]

            new_log = MockLog()
            old_log = IpythonMagic.distutils.log
            try:
                IpythonMagic.distutils.log = new_log
                yield new_log
            finally:
                IpythonMagic.distutils.log = old_log

        ip = self._ip
        with mock_distutils() as verbose_log:
            ip.run_cell_magic('cython', '--verbose', code)
            ip.ex('g = f(10)')
        self.assertEqual(ip.user_ns['g'], 20.0)
        self.assertEqual([verbose_log.INFO, verbose_log.DEBUG, verbose_log.INFO],
                         verbose_log.thresholds)

        with mock_distutils() as normal_log:
            ip.run_cell_magic('cython', '', code)
            ip.ex('g = f(10)')
        self.assertEqual(ip.user_ns['g'], 20.0)
        self.assertEqual([normal_log.INFO], normal_log.thresholds)

    def test_cython_no_annotate(self):
        ip = self._ip
        html = ip.run_cell_magic('cython', '', code)
        self.assertTrue(html is None)

    def test_cython_annotate(self):
        ip = self._ip
        html = ip.run_cell_magic('cython', '--annotate', code)
        # somewhat brittle way to differentiate between annotated htmls
        # with/without complete source code:
        self.assertTrue(AnnotationCCodeWriter.COMPLETE_CODE_TITLE not in html.data)

    def test_cython_annotate_default(self):
        ip = self._ip
        html = ip.run_cell_magic('cython', '-a', code)
        # somewhat brittle way to differentiate between annotated htmls
        # with/without complete source code:
        self.assertTrue(AnnotationCCodeWriter.COMPLETE_CODE_TITLE not in html.data)

    def test_cython_annotate_complete_c_code(self):
        ip = self._ip
        html = ip.run_cell_magic('cython', '--annotate-fullc', code)
        # somewhat brittle way to differentiate between annotated htmls
        # with/without complete source code:
        self.assertTrue(AnnotationCCodeWriter.COMPLETE_CODE_TITLE in html.data)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/TestStripLiterals.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Cython.Build.Dependencies import strip_string_literals
2
+
3
+ from Cython.TestUtils import CythonTest
4
+
5
class TestStripLiterals(CythonTest):
    """Tests for ``strip_string_literals``.

    Every check asserts two properties: the stripped text equals the
    expected placeholder form, and substituting each placeholder back
    reproduces the original input exactly (round-trip).
    """

    def t(self, before, expected):
        # Strip literals and compare against the expected placeholder form.
        stripped, mapping = strip_string_literals(before, prefix="_L")
        self.assertEqual(expected, stripped)
        # Reinsert every literal body and verify we recover the input.
        restored = stripped
        for label, literal in mapping.items():
            restored = restored.replace(label, literal)
        self.assertEqual(before, restored)

    def test_empty(self):
        self.t("", "")

    def test_single_quote(self):
        self.t("'x'", "'_L1_'")

    def test_double_quote(self):
        self.t('"x"', '"_L1_"')

    def test_nested_quotes(self):
        self.t(""" '"' "'" """, """ '_L1_' "_L2_" """)

    def test_triple_quote(self):
        self.t(" '''a\n''' ", " '''_L1_''' ")

    def test_backslash(self):
        self.t(r"'a\'b'", "'_L1_'")
        self.t(r"'a\\'", "'_L1_'")
        self.t(r"'a\\\'b'", "'_L1_'")

    def test_unicode(self):
        self.t("u'abc'", "u'_L1_'")

    def test_raw(self):
        self.t(r"r'abc\\'", "r'_L1_'")

    def test_raw_unicode(self):
        self.t(r"ru'abc\\'", "ru'_L1_'")

    def test_comment(self):
        self.t("abc # foo", "abc #_L1_")

    def test_comment_and_quote(self):
        self.t("abc # 'x'", "abc #_L1_")
        self.t("'abc#'", "'_L1_'")

    def test_include(self):
        self.t("include 'a.pxi' # something here",
               "include '_L1_' #_L2_")

    def test_extern(self):
        self.t("cdef extern from 'a.h': # comment",
               "cdef extern from '_L1_': #_L2_")
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestCythonizeArgsParser.cpython-311.pyc ADDED
Binary file (41.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestDependencies.cpython-311.pyc ADDED
Binary file (11.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestInline.cpython-311.pyc ADDED
Binary file (7.92 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestIpythonMagic.cpython-311.pyc ADDED
Binary file (17 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestRecythonize.cpython-311.pyc ADDED
Binary file (13.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Build/Tests/__pycache__/TestStripLiterals.cpython-311.pyc ADDED
Binary file (4.57 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Buffer.py ADDED
@@ -0,0 +1,749 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import
2
+
3
+ from .Visitor import CythonTransform
4
+ from .ModuleNode import ModuleNode
5
+ from .Errors import CompileError
6
+ from .UtilityCode import CythonUtilityCode
7
+ from .Code import UtilityCode, TempitaUtilityCode
8
+
9
+ from . import Options
10
+ from . import Interpreter
11
+ from . import PyrexTypes
12
+ from . import Naming
13
+ from . import Symtab
14
+
15
def dedent(text, reindent=0):
    """Strip common leading whitespace from *text*, then optionally
    re-indent every line (including blank ones) by *reindent* spaces.
    """
    import textwrap
    result = textwrap.dedent(text)
    if reindent > 0:
        pad = " " * reindent
        result = "\n".join(pad + line for line in result.split("\n"))
    return result
22
+
23
class IntroduceBufferAuxiliaryVars(CythonTransform):
    """Declare the auxiliary C variables that every buffer-typed variable
    needs (a per-variable buffer-info struct plus a refcounted Py_buffer
    wrapper) and record whether the module uses buffers at all.
    """

    #
    # Entry point
    #

    # Set to True as soon as any visited scope contains a buffer or a
    # memoryview slice; gates the global support code emitted in __call__.
    buffers_exists = False
    # True when the Cython 'memoryview' utility type is in use in a scope.
    using_memoryview = False

    def __call__(self, node):
        # Transform entry point: walk the module tree and, if any buffers
        # were found, pull in the module-level buffer support code.
        assert isinstance(node, ModuleNode)
        self.max_ndim = 0
        result = super(IntroduceBufferAuxiliaryVars, self).__call__(node)
        if self.buffers_exists:
            use_bufstruct_declare_code(node.scope)
            use_py2_buffer_functions(node.scope)

        return result


    #
    # Basic operations for transforms
    #
    def handle_scope(self, node, scope):
        # For all buffers, insert extra variables in the scope.
        # The variables are also accessible from the buffer_info
        # on the buffer entry
        scope_items = scope.entries.items()
        bufvars = [entry for name, entry in scope_items if entry.type.is_buffer]
        if len(bufvars) > 0:
            # Sort by name for deterministic generated code.
            bufvars.sort(key=lambda entry: entry.name)
            self.buffers_exists = True

        memviewslicevars = [entry for name, entry in scope_items if entry.type.is_memoryviewslice]
        if len(memviewslicevars) > 0:
            self.buffers_exists = True


        for (name, entry) in scope_items:
            if name == 'memoryview' and isinstance(entry.utility_code_definition, CythonUtilityCode):
                self.using_memoryview = True
                break
        del scope_items

        if isinstance(node, ModuleNode) and len(bufvars) > 0:
            # for now...note that pos is wrong
            raise CompileError(node.pos, "Buffer vars not allowed in module scope")
        for entry in bufvars:
            if entry.type.dtype.is_ptr:
                raise CompileError(node.pos, "Buffers with pointer types not yet supported.")

            name = entry.name
            buftype = entry.type
            if buftype.ndim > Options.buffer_max_dims:
                raise CompileError(node.pos,
                        "Buffer ndims exceeds Options.buffer_max_dims = %d" % Options.buffer_max_dims)
            if buftype.ndim > self.max_ndim:
                self.max_ndim = buftype.ndim

            # Declare auxiliary vars
            def decvar(type, prefix):
                # Declare one mangled auxiliary variable in the scope.
                cname = scope.mangle(prefix, name)
                aux_var = scope.declare_var(name=None, cname=cname,
                                            type=type, pos=node.pos)
                if entry.is_arg:
                    aux_var.used = True  # otherwise, NameNode will mark whether it is used

                return aux_var

            auxvars = ((PyrexTypes.c_pyx_buffer_nd_type, Naming.pybuffernd_prefix),
                       (PyrexTypes.c_pyx_buffer_type, Naming.pybufferstruct_prefix))
            pybuffernd, rcbuffer = [decvar(type, prefix) for (type, prefix) in auxvars]

            entry.buffer_aux = Symtab.BufferAux(pybuffernd, rcbuffer)

        scope.buffer_entries = bufvars
        self.scope = scope

    def visit_ModuleNode(self, node):
        self.handle_scope(node, node.scope)
        self.visitchildren(node)
        return node

    def visit_FuncDefNode(self, node):
        self.handle_scope(node, node.local_scope)
        self.visitchildren(node)
        return node
110
+
111
+ #
112
+ # Analysis
113
+ #
114
# Buffer type option handling.  `buffer_options` is ordered: positional
# arguments in a buffer type declaration map onto it left to right.
buffer_options = ("dtype", "ndim", "mode", "negative_indices", "cast")  # ordered!
# Values used for options not given explicitly in a declaration.
buffer_defaults = {"ndim": 1, "mode": "full", "negative_indices": True, "cast": False}
buffer_positional_options_count = 1  # anything beyond this needs keyword argument

# Error message templates raised as CompileError by analyse_buffer_options().
ERR_BUF_OPTION_UNKNOWN = '"%s" is not a buffer option'
ERR_BUF_TOO_MANY = 'Too many buffer options'
ERR_BUF_DUP = '"%s" buffer option already supplied'
ERR_BUF_MISSING = '"%s" missing'
ERR_BUF_MODE = 'Only allowed buffer modes are: "c", "fortran", "full", "strided" (as a compile-time string)'
ERR_BUF_NDIM = 'ndim must be a non-negative integer'
ERR_BUF_DTYPE = 'dtype must be "object", numeric type or a struct'
ERR_BUF_BOOL = '"%s" must be a boolean'
126
+
127
def analyse_buffer_options(globalpos, env, posargs, dictargs, defaults=None, need_complete=True):
    """
    Must be called during type analysis, as analyse is called
    on the dtype argument.

    posargs and dictargs should consist of a list and a dict
    of tuples (value, pos). Defaults should be a dict of values.

    Returns a dict containing all the options a buffer can have and
    its value (with the positions stripped).

    Raises CompileError for unknown, duplicate or invalid options and,
    when need_complete is true, for options missing from both the
    arguments and *defaults*.
    """
    if defaults is None:
        defaults = buffer_defaults

    posargs, dictargs = Interpreter.interpret_compiletime_options(
        posargs, dictargs, type_env=env, type_args=(0, 'dtype'))

    if len(posargs) > buffer_positional_options_count:
        raise CompileError(posargs[-1][1], ERR_BUF_TOO_MANY)

    options = {}
    for name, (value, pos) in dictargs.items():
        if name not in buffer_options:
            raise CompileError(pos, ERR_BUF_OPTION_UNKNOWN % name)
        options[name] = value

    # Positional args map onto buffer_options in declaration order.  The
    # zip() guarantees every positional name is a valid option, so only a
    # duplicate (option given both positionally and by keyword) can fail
    # here; the original unknown-option re-check was unreachable.
    for name, (value, pos) in zip(buffer_options, posargs):
        if name in options:
            raise CompileError(pos, ERR_BUF_DUP % name)
        options[name] = value

    # Check that they are all there and copy defaults
    for name in buffer_options:
        if name not in options:
            try:
                options[name] = defaults[name]
            except KeyError:
                if need_complete:
                    raise CompileError(globalpos, ERR_BUF_MISSING % name)

    dtype = options.get("dtype")
    if dtype and dtype.is_extension_type:
        raise CompileError(globalpos, ERR_BUF_DTYPE)

    ndim = options.get("ndim")
    if ndim and (not isinstance(ndim, int) or ndim < 0):
        raise CompileError(globalpos, ERR_BUF_NDIM)

    mode = options.get("mode")
    if mode and mode not in ('full', 'strided', 'c', 'fortran'):
        raise CompileError(globalpos, ERR_BUF_MODE)

    def assert_bool(name):
        # Compile-time boolean options must be real bools, not truthy values.
        x = options.get(name)
        if not isinstance(x, bool):
            raise CompileError(globalpos, ERR_BUF_BOOL % name)

    assert_bool('negative_indices')
    assert_bool('cast')

    return options
190
+
191
+
192
+ #
193
+ # Code generation
194
+ #
195
+
196
class BufferEntry(object):
    """Wraps a buffer-typed symbol-table entry and builds the C expressions
    (per-dimension shape/strides/suboffsets, element pointer lookup) used
    when generating code for buffer access."""

    def __init__(self, entry):
        self.entry = entry
        self.type = entry.type
        # cname of the local per-variable buffer-info auxiliary struct.
        self.cname = entry.buffer_aux.buflocal_nd_var.cname
        # C expression for the raw data pointer of the acquired Py_buffer.
        self.buf_ptr = "%s.rcbuffer->pybuffer.buf" % self.cname
        self.buf_ptr_type = entry.type.buffer_ptr_type
        self.init_attributes()

    def init_attributes(self):
        # Pre-compute the per-dimension C expression lists once.
        self.shape = self.get_buf_shapevars()
        self.strides = self.get_buf_stridevars()
        self.suboffsets = self.get_buf_suboffsetvars()

    def get_buf_suboffsetvars(self):
        # One C expression per dimension reading the suboffset value.
        return self._for_all_ndim("%s.diminfo[%d].suboffsets")

    def get_buf_stridevars(self):
        # One C expression per dimension reading the stride value.
        return self._for_all_ndim("%s.diminfo[%d].strides")

    def get_buf_shapevars(self):
        # One C expression per dimension reading the shape value.
        return self._for_all_ndim("%s.diminfo[%d].shape")

    def _for_all_ndim(self, s):
        # Instantiate format string *s* with (cname, dim) for every dimension.
        return [s % (self.cname, i) for i in range(self.type.ndim)]

    def generate_buffer_lookup_code(self, code, index_cnames):
        # Create buffer lookup and return it
        # This is done via utility macros/inline functions, which vary
        # according to the access mode used.
        params = []
        nd = self.type.ndim
        mode = self.type.mode
        if mode == 'full':
            # Full mode passes index, stride and suboffset per dimension.
            for i, s, o in zip(index_cnames,
                               self.get_buf_stridevars(),
                               self.get_buf_suboffsetvars()):
                params.append(i)
                params.append(s)
                params.append(o)
            funcname = "__Pyx_BufPtrFull%dd" % nd
            funcgen = buf_lookup_full_code
        else:
            # The other modes pass only index and stride per dimension.
            if mode == 'strided':
                funcname = "__Pyx_BufPtrStrided%dd" % nd
                funcgen = buf_lookup_strided_code
            elif mode == 'c':
                funcname = "__Pyx_BufPtrCContig%dd" % nd
                funcgen = buf_lookup_c_code
            elif mode == 'fortran':
                funcname = "__Pyx_BufPtrFortranContig%dd" % nd
                funcgen = buf_lookup_fortran_code
            else:
                assert False
            for i, s in zip(index_cnames, self.get_buf_stridevars()):
                params.append(i)
                params.append(s)

        # Make sure the utility code is available
        if funcname not in code.globalstate.utility_codes:
            code.globalstate.utility_codes.add(funcname)
            protocode = code.globalstate['utility_code_proto']
            defcode = code.globalstate['utility_code_def']
            funcgen(protocode, defcode, name=funcname, nd=nd)

        buf_ptr_type_code = self.buf_ptr_type.empty_declaration_code()
        ptrcode = "%s(%s, %s, %s)" % (funcname, buf_ptr_type_code, self.buf_ptr,
                                      ", ".join(params))
        return ptrcode
265
+
266
+
267
def get_flags(buffer_aux, buffer_type):
    """Return the C expression for the PyBUF_* flags to request when
    acquiring a buffer of the given access mode (plus PyBUF_WRITABLE
    when any write to the buffer was seen)."""
    mode_flags = {
        'full': '| PyBUF_INDIRECT',
        'strided': '| PyBUF_STRIDES',
        'c': '| PyBUF_C_CONTIGUOUS',
        'fortran': '| PyBUF_F_CONTIGUOUS',
    }
    assert buffer_type.mode in mode_flags
    flags = 'PyBUF_FORMAT' + mode_flags[buffer_type.mode]
    if buffer_aux.writable_needed:
        flags += "| PyBUF_WRITABLE"
    return flags
282
+
283
def used_buffer_aux_vars(entry):
    """Mark both auxiliary buffer variables of *entry* as used so that
    their declarations are emitted."""
    aux = entry.buffer_aux
    for aux_var in (aux.buflocal_nd_var, aux.rcbuf_var):
        aux_var.used = True
287
+
288
def put_unpack_buffer_aux_into_scope(buf_entry, code):
    """Emit one C line copying strides/shape (and, in 'full' mode,
    suboffsets) from the acquired Py_buffer into the diminfo array of the
    local buffer-info struct, one assignment per dimension and field."""
    struct_cname = buf_entry.buffer_aux.buflocal_nd_var.cname
    if buf_entry.type.mode == 'full':
        fields = ('strides', 'shape', 'suboffsets')
    else:
        fields = ('strides', 'shape')

    assignments = [
        "%s.diminfo[%d].%s = %s.rcbuffer->pybuffer.%s[%d];" % (
            struct_cname, dim, field, struct_cname, field, dim)
        for dim in range(buf_entry.type.ndim)
        for field in fields
    ]
    code.putln(' '.join(assignments))
306
+
307
def put_init_vars(entry, code):
    """Emit C code initialising the two auxiliary buffer structs of
    *entry* to an empty state (NULL buffer, zero refcount) and wiring the
    buffer-info struct to its refcounted Py_buffer wrapper."""
    aux = entry.buffer_aux
    nd_cname = aux.buflocal_nd_var.cname
    rc_cname = aux.rcbuf_var.cname
    init_lines = (
        # init the refcounted Py_buffer wrapper
        "%s.pybuffer.buf = NULL;" % rc_cname,
        "%s.refcount = 0;" % rc_cname,
        # init the buffer-info struct and point it at the wrapper
        "%s.data = NULL;" % nd_cname,
        "%s.rcbuffer = &%s;" % (nd_cname, rc_cname),
    )
    for line in init_lines:
        code.putln(line)
319
+
320
+
321
def put_acquire_arg_buffer(entry, code, pos):
    """Emit code acquiring the buffer of a buffer-typed function argument
    on function entry; acquisition failure jumps to the error label."""
    buffer_aux = entry.buffer_aux
    getbuffer = get_getbuffer_call(code, entry.cname, buffer_aux, entry.type)

    # Acquire any new buffer
    code.putln("{")
    # Stack storage for the buffer-format parser; size depends on how
    # deeply the dtype nests structs.
    code.putln("__Pyx_BufFmt_StackElem __pyx_stack[%d];" % entry.type.dtype.struct_nesting_depth())
    code.putln(code.error_goto_if("%s == -1" % getbuffer, pos))
    code.putln("}")
    # An exception raised in arg parsing cannot be caught, so no
    # need to care about the buffer then.
    put_unpack_buffer_aux_into_scope(entry, code)
333
+
334
+
335
def put_release_buffer_code(code, entry):
    """Emit a safe (NULL-tolerant) release of the buffer held by *entry*."""
    code.globalstate.use_utility_code(acquire_utility_code)
    code.putln("__Pyx_SafeReleaseBuffer(&%s.rcbuffer->pybuffer);" % entry.buffer_aux.buflocal_nd_var.cname)
338
+
339
+
340
def get_getbuffer_call(code, obj_cname, buffer_aux, buffer_type):
    """Return the C call expression that acquires and validates a buffer
    from the object *obj_cname*; it evaluates to -1 on failure.  The call
    site must have a __pyx_stack array in scope."""
    ndim = buffer_type.ndim
    cast = int(buffer_type.cast)
    flags = get_flags(buffer_aux, buffer_type)
    pybuffernd_struct = buffer_aux.buflocal_nd_var.cname

    dtype_typeinfo = get_type_information_cname(code, buffer_type.dtype)

    code.globalstate.use_utility_code(acquire_utility_code)
    # locals() supplies the %(name)s substitutions assembled above.
    return ("__Pyx_GetBufferAndValidate(&%(pybuffernd_struct)s.rcbuffer->pybuffer, "
            "(PyObject*)%(obj_cname)s, &%(dtype_typeinfo)s, %(flags)s, %(ndim)d, "
            "%(cast)d, __pyx_stack)" % locals())
352
+
353
+
354
def put_assign_to_buffer(lhs_cname, rhs_cname, buf_entry,
                         is_initialized, pos, code):
    """
    Generate code for reassigning a buffer variable. This only deals with getting
    the buffer auxiliary structure and variables set up correctly, the assignment
    itself and refcounting is the responsibility of the caller.

    However, the assignment operation may throw an exception so that the reassignment
    never happens.

    Depending on the circumstances there are two possible outcomes:
    - Old buffer released, new acquired, rhs assigned to lhs
    - Old buffer released, new acquired which fails, reacquire old lhs buffer
      (which may or may not succeed).
    """

    buffer_aux, buffer_type = buf_entry.buffer_aux, buf_entry.type
    pybuffernd_struct = buffer_aux.buflocal_nd_var.cname
    flags = get_flags(buffer_aux, buffer_type)

    code.putln("{")  # Set up necessary stack for getbuffer
    code.putln("__Pyx_BufFmt_StackElem __pyx_stack[%d];" % buffer_type.dtype.struct_nesting_depth())

    getbuffer = get_getbuffer_call(code, "%s", buffer_aux, buffer_type)  # fill in object below

    if is_initialized:
        # Release any existing buffer
        code.putln('__Pyx_SafeReleaseBuffer(&%s.rcbuffer->pybuffer);' % pybuffernd_struct)
        # Acquire
        retcode_cname = code.funcstate.allocate_temp(PyrexTypes.c_int_type, manage_ref=False)
        code.putln("%s = %s;" % (retcode_cname, getbuffer % rhs_cname))
        code.putln('if (%s) {' % (code.unlikely("%s < 0" % retcode_cname)))
        # If acquisition failed, attempt to reacquire the old buffer
        # before raising the exception. A failure of reacquisition
        # will cause the reacquisition exception to be reported, one
        # can consider working around this later.
        exc_temps = tuple(code.funcstate.allocate_temp(PyrexTypes.py_object_type, manage_ref=False)
                          for _ in range(3))
        code.putln('PyErr_Fetch(&%s, &%s, &%s);' % exc_temps)
        code.putln('if (%s) {' % code.unlikely("%s == -1" % (getbuffer % lhs_cname)))
        code.putln('Py_XDECREF(%s); Py_XDECREF(%s); Py_XDECREF(%s);' % exc_temps)  # Do not refnanny these!
        code.globalstate.use_utility_code(raise_buffer_fallback_code)
        code.putln('__Pyx_RaiseBufferFallbackError();')
        code.putln('} else {')
        code.putln('PyErr_Restore(%s, %s, %s);' % exc_temps)
        code.putln('}')
        code.putln('%s = %s = %s = 0;' % exc_temps)
        for t in exc_temps:
            code.funcstate.release_temp(t)
        code.putln('}')
        # Unpack indices
        put_unpack_buffer_aux_into_scope(buf_entry, code)
        code.putln(code.error_goto_if_neg(retcode_cname, pos))
        code.funcstate.release_temp(retcode_cname)
    else:
        # Our entry had no previous value, so set to None when acquisition fails.
        # In this case, auxiliary vars should be set up right in initialization to a zero-buffer,
        # so it suffices to set the buf field to NULL.
        code.putln('if (%s) {' % code.unlikely("%s == -1" % (getbuffer % rhs_cname)))
        code.putln('%s = %s; __Pyx_INCREF(Py_None); %s.rcbuffer->pybuffer.buf = NULL;' %
                   (lhs_cname,
                    PyrexTypes.typecast(buffer_type, PyrexTypes.py_object_type, "Py_None"),
                    pybuffernd_struct))
        code.putln(code.error_goto(pos))
        code.put('} else {')
        # Unpack indices
        put_unpack_buffer_aux_into_scope(buf_entry, code)
        code.putln('}')

    code.putln("}")  # Release stack
424
+
425
+
426
def put_buffer_lookup_code(entry, index_signeds, index_cnames, directives,
                           pos, code, negative_indices, in_nogil_context):
    """
    Generates code to process indices and calculate an offset into
    a buffer. Returns a C string which gives a pointer which can be
    read from or written to at will (it is an expression so caller should
    store it in a temporary if it is used more than once).

    As the bounds checking can have any number of combinations of unsigned
    arguments, smart optimizations etc. we insert it directly in the function
    body. The lookup however is delegated to an inline function that is instantiated
    once per ndim (lookup with suboffsets tend to get quite complicated).

    entry is a BufferEntry
    """
    # Negative-index wrapping only happens under the 'wraparound' directive.
    negative_indices = directives['wraparound'] and negative_indices

    if directives['boundscheck']:
        # Check bounds and fix negative indices.
        # We allocate a temporary which is initialized to -1, meaning OK (!).
        # If an error occurs, the temp is set to the index dimension the
        # error is occurring at.
        failed_dim_temp = code.funcstate.allocate_temp(PyrexTypes.c_int_type, manage_ref=False)
        code.putln("%s = -1;" % failed_dim_temp)
        for dim, (signed, cname, shape) in enumerate(zip(index_signeds, index_cnames, entry.get_buf_shapevars())):
            if signed != 0:
                # not unsigned, deal with negative index
                code.putln("if (%s < 0) {" % cname)
                if negative_indices:
                    code.putln("%s += %s;" % (cname, shape))
                    code.putln("if (%s) %s = %d;" % (
                        code.unlikely("%s < 0" % cname),
                        failed_dim_temp, dim))
                else:
                    code.putln("%s = %d;" % (failed_dim_temp, dim))
                code.put("} else ")
            # check bounds in positive direction
            if signed != 0:
                cast = ""
            else:
                cast = "(size_t)"
            code.putln("if (%s) %s = %d;" % (
                code.unlikely("%s >= %s%s" % (cname, cast, shape)),
                failed_dim_temp, dim))

        # Choose the raising helper suitable for the GIL state at this point.
        if in_nogil_context:
            code.globalstate.use_utility_code(raise_indexerror_nogil)
            func = '__Pyx_RaiseBufferIndexErrorNogil'
        else:
            code.globalstate.use_utility_code(raise_indexerror_code)
            func = '__Pyx_RaiseBufferIndexError'

        code.putln("if (%s) {" % code.unlikely("%s != -1" % failed_dim_temp))
        code.putln('%s(%s);' % (func, failed_dim_temp))
        code.putln(code.error_goto(pos))
        code.putln('}')
        code.funcstate.release_temp(failed_dim_temp)
    elif negative_indices:
        # Only fix negative indices.
        for signed, cname, shape in zip(index_signeds, index_cnames, entry.get_buf_shapevars()):
            if signed != 0:
                code.putln("if (%s < 0) %s += %s;" % (cname, cname, shape))

    return entry.generate_buffer_lookup_code(code, index_cnames)
490
+
491
+
492
def use_bufstruct_declare_code(env):
    """Pull the declarations of the auxiliary buffer structs into *env*."""
    env.use_utility_code(buffer_struct_declare_code)
494
+
495
+
496
def buf_lookup_full_code(proto, defin, name, nd):
    """
    Generates a buffer lookup function for the right number
    of dimensions. The function gives back a void* at the right location.
    """
    # _i_ndex, _s_tride, sub_o_ffset
    macroargs = ", ".join(["i%d, s%d, o%d" % (i, i, i) for i in range(nd)])
    proto.putln("#define %s(type, buf, %s) (type)(%s_imp(buf, %s))" % (name, macroargs, name, macroargs))

    funcargs = ", ".join(["Py_ssize_t i%d, Py_ssize_t s%d, Py_ssize_t o%d" % (i, i, i) for i in range(nd)])
    proto.putln("static CYTHON_INLINE void* %s_imp(void* buf, %s);" % (name, funcargs))
    # The implementation advances one dimension per step; a non-negative
    # suboffset at a level means an extra pointer indirection there.
    defin.putln(dedent("""
        static CYTHON_INLINE void* %s_imp(void* buf, %s) {
          char* ptr = (char*)buf;
        """) % (name, funcargs) + "".join([dedent("""\
           ptr += s%d * i%d;
           if (o%d >= 0) ptr = *((char**)ptr) + o%d;
        """) % (i, i, i, i) for i in range(nd)]
        ) + "\nreturn ptr;\n}")
515
+
516
+
517
def buf_lookup_strided_code(proto, defin, name, nd):
    """Emit the element-lookup macro for a strided buffer: the pointer is
    the base plus the full sum of index*stride over every dimension.
    (*defin* is unused; strided lookup is a macro only.)"""
    arg_list = ", ".join("i%d, s%d" % (dim, dim) for dim in range(nd))
    offset_expr = " + ".join("i%d * s%d" % (dim, dim) for dim in range(nd))
    proto.putln("#define %s(type, buf, %s) (type)((char*)buf + %s)" % (name, arg_list, offset_expr))
526
+
527
+
528
def buf_lookup_c_code(proto, defin, name, nd):
    """
    Similar to strided lookup, but the stride multiplication for the last
    dimension is skipped (C-contiguous layout).  The macro keeps the same
    signature, so the unused last stride is still accepted.
    """
    if nd == 1:
        proto.putln("#define %s(type, buf, i0, s0) ((type)buf + i0)" % name)
        return
    arg_list = ", ".join("i%d, s%d" % (dim, dim) for dim in range(nd))
    offset_expr = " + ".join("i%d * s%d" % (dim, dim) for dim in range(nd - 1))
    proto.putln("#define %s(type, buf, %s) ((type)((char*)buf + %s) + i%d)" % (
        name, arg_list, offset_expr, nd - 1))
540
+
541
+
542
def buf_lookup_fortran_code(proto, defin, name, nd):
    """
    Like the C-contiguous lookup, but the FIRST dimension's stride
    multiplication is skipped instead (Fortran-contiguous layout).
    """
    if nd == 1:
        proto.putln("#define %s(type, buf, i0, s0) ((type)buf + i0)" % name)
        return
    arg_list = ", ".join("i%d, s%d" % (dim, dim) for dim in range(nd))
    offset_expr = " + ".join("i%d * s%d" % (dim, dim) for dim in range(1, nd))
    proto.putln("#define %s(type, buf, %s) ((type)((char*)buf + %s) + i%d)" % (
        name, arg_list, offset_expr, 0))
552
+
553
+
554
def use_py2_buffer_functions(env):
    """Pull the Python 2 emulation of PyObject_GetBuffer/PyBuffer_Release
    into *env*."""
    env.use_utility_code(GetAndReleaseBufferUtilityCode())
556
+
557
+
558
class GetAndReleaseBufferUtilityCode(object):
    # Emulation of PyObject_GetBuffer and PyBuffer_Release for Python 2.
    # For >= 2.6 we do double mode -- use the new buffer interface on objects
    # which has the right tp_flags set, but emulation otherwise.

    requires = None
    is_cython_utility = False

    def __init__(self):
        pass

    def __eq__(self, other):
        # All instances are interchangeable; type-based equality keeps the
        # utility-code registry from inserting duplicates.
        return isinstance(other, GetAndReleaseBufferUtilityCode)

    def __hash__(self):
        # Arbitrary constant, consistent with the class-wide __eq__ above.
        return 24342342

    def get_tree(self, **kwargs): pass

    def put_code(self, output):
        """Emit the GetAndReleaseBuffer utility code, specialised with the
        __getbuffer__/__releasebuffer__ slots of every extension type found
        in this module's scope and its cimported modules."""
        code = output['utility_code_def']
        proto_code = output['utility_code_proto']
        env = output.module_node.scope
        cython_scope = env.context.cython_scope

        # Search all types for __getbuffer__ overloads
        types = []
        visited_scopes = set()
        def find_buffer_types(scope):
            # Depth-first over cimported modules; visited_scopes guards
            # against cycles and repeated work.
            if scope in visited_scopes:
                return
            visited_scopes.add(scope)
            for m in scope.cimported_modules:
                find_buffer_types(m)
            for e in scope.type_entries:
                if isinstance(e.utility_code_definition, CythonUtilityCode):
                    continue
                t = e.type
                if t.is_extension_type:
                    if scope is cython_scope and not e.used:
                        continue
                    release = get = None
                    for x in t.scope.pyfunc_entries:
                        if x.name == u"__getbuffer__": get = x.func_cname
                        elif x.name == u"__releasebuffer__": release = x.func_cname
                    if get:
                        # __releasebuffer__ is optional; 'release' may stay None.
                        types.append((t.typeptr_cname, get, release))

        find_buffer_types(env)

        util_code = TempitaUtilityCode.load(
            "GetAndReleaseBuffer", from_file="Buffer.c",
            context=dict(types=types))

        proto = util_code.format_code(util_code.proto)
        impl = util_code.format_code(
            util_code.inject_string_constants(util_code.impl, output)[1])

        proto_code.putln(proto)
        code.putln(impl)
618
+
619
+
620
def mangle_dtype_name(dtype):
    """Return an identifier-safe name for *dtype*, used to build the
    cnames of the generated type-info structs.

    Use prefixes to separate user defined types from builtins
    (consider "typedef float unsigned_int").
    """
    if dtype.is_pyobject:
        return "object"
    if dtype.is_ptr:
        return "ptr"
    prefix = "nn_" if (dtype.is_typedef or dtype.is_struct_or_union) else ""
    return prefix + dtype.specialization_name()
633
+
634
def get_type_information_cname(code, dtype, maxdepth=None):
    """
    Output the run-time type information (__Pyx_TypeInfo) for given dtype,
    and return the name of the type info struct.

    Structs with two floats of the same size are encoded as complex numbers.
    One can separate between complex numbers declared as struct or with native
    encoding by inspecting to see if the fields field of the type is
    filled in.
    """
    namesuffix = mangle_dtype_name(dtype)
    name = "__Pyx_TypeInfo_%s" % namesuffix
    structinfo_name = "__Pyx_StructFields_%s" % namesuffix

    if dtype.is_error: return "<error>"

    # It's critical that walking the type info doesn't use more stack
    # depth than dtype.struct_nesting_depth() returns, so use an assertion for this
    if maxdepth is None: maxdepth = dtype.struct_nesting_depth()
    if maxdepth <= 0:
        assert False

    # Emit each typeinfo struct at most once per module.
    if name not in code.globalstate.utility_codes:
        code.globalstate.utility_codes.add(name)
        typecode = code.globalstate['typeinfo']

        arraysizes = []
        if dtype.is_array:
            # Peel nested array dimensions down to the element type.
            while dtype.is_array:
                arraysizes.append(dtype.size)
                dtype = dtype.base_type

        complex_possible = dtype.is_struct_or_union and dtype.can_be_complex()

        declcode = dtype.empty_declaration_code()
        if dtype.is_simple_buffer_dtype():
            structinfo_name = "NULL"
        elif dtype.is_struct:
            struct_scope = dtype.scope
            if dtype.is_cv_qualified:
                struct_scope = struct_scope.base_type_scope
            # Must pre-call all used types in order not to recurse during utility code writing.
            fields = struct_scope.var_entries
            assert len(fields) > 0
            types = [get_type_information_cname(code, f.type, maxdepth - 1)
                     for f in fields]
            typecode.putln("static __Pyx_StructField %s[] = {" % structinfo_name, safe=True)

            if dtype.is_cv_qualified:
                # roughly speaking, remove "const" from struct_type
                struct_type = dtype.cv_base_type.empty_declaration_code()
            else:
                struct_type = dtype.empty_declaration_code()

            for f, typeinfo in zip(fields, types):
                typecode.putln('  {&%s, "%s", offsetof(%s, %s)},' %
                               (typeinfo, f.name, struct_type, f.cname), safe=True)

            # NULL-terminated field table.
            typecode.putln('  {NULL, NULL, 0}', safe=True)
            typecode.putln("};", safe=True)
        else:
            assert False

        rep = str(dtype)

        flags = "0"
        is_unsigned = "0"
        # Classify the dtype into a one-character type group for the
        # runtime format checker.
        if dtype is PyrexTypes.c_char_type:
            is_unsigned = "__PYX_IS_UNSIGNED(%s)" % declcode
            typegroup = "'H'"
        elif dtype.is_int:
            is_unsigned = "__PYX_IS_UNSIGNED(%s)" % declcode
            typegroup = "%s ? 'U' : 'I'" % is_unsigned
        elif complex_possible or dtype.is_complex:
            typegroup = "'C'"
        elif dtype.is_float:
            typegroup = "'R'"
        elif dtype.is_struct:
            typegroup = "'S'"
            if dtype.packed:
                flags = "__PYX_BUF_FLAGS_PACKED_STRUCT"
        elif dtype.is_pyobject:
            typegroup = "'O'"
        else:
            assert False, dtype

        typeinfo = ('static __Pyx_TypeInfo %s = '
                    '{ "%s", %s, sizeof(%s), { %s }, %s, %s, %s, %s };')
        tup = (name, rep, structinfo_name, declcode,
               ', '.join([str(x) for x in arraysizes]) or '0', len(arraysizes),
               typegroup, is_unsigned, flags)
        typecode.putln(typeinfo % tup, safe=True)

    return name
728
+
729
def load_buffer_utility(util_code_name, context=None, **kwargs):
    """Load a named utility-code section from Buffer.c; Tempita-templated
    when a *context* dict is given, a plain UtilityCode otherwise."""
    if context is None:
        return UtilityCode.load(util_code_name, "Buffer.c", **kwargs)
    else:
        return TempitaUtilityCode.load(util_code_name, "Buffer.c", context=context, **kwargs)
734
+
735
# Shared Tempita context: the templated utility code is specialised on the
# maximum supported number of buffer dimensions.
context = dict(max_dims=Options.buffer_max_dims)
buffer_struct_declare_code = load_buffer_utility("BufferStructDeclare", context=context)
buffer_formats_declare_code = load_buffer_utility("BufferFormatStructs")

# Utility function to set the right exception
# The caller should immediately goto_error
raise_indexerror_code = load_buffer_utility("BufferIndexError")
raise_indexerror_nogil = load_buffer_utility("BufferIndexErrorNogil")
raise_buffer_fallback_code = load_buffer_utility("BufferFallbackError")

# Acquisition/validation helpers used by get_getbuffer_call() and
# put_release_buffer_code().
acquire_utility_code = load_buffer_utility("BufferGetAndValidate", context=context)
buffer_format_check_code = load_buffer_utility("BufferFormatCheck", context=context)

# See utility code BufferFormatFromTypeInfo
_typeinfo_to_format_code = load_buffer_utility("TypeInfoToFormat")
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Code.py ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/FusedNode.cpython-311-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02498cb7e330a1a7ccae2b142938c3c3c01d80751d06f9ade63bac39f2ab681a
3
+ size 517064
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/Parsing.py ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # empty file
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Lexicon.cpython-311.pyc ADDED
Binary file (14.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Naming.cpython-311.pyc ADDED
Binary file (7.78 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Options.cpython-311.pyc ADDED
Binary file (24.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Pythran.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Scanning.cpython-311.pyc ADDED
Binary file (29.7 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/UtilityCode.cpython-311.pyc ADDED
Binary file (13.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/networkx/generators/atlas.dat.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73fc416df0164923607751cb759f4ae81deb5f6550bf25be59c86de3b747e41d
3
+ size 8887
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Wrappers to call pyproject.toml-based build backend hooks.
2
+ """
3
+
4
+ from ._impl import (
5
+ BackendInvalid,
6
+ BackendUnavailable,
7
+ BuildBackendHookCaller,
8
+ HookMissing,
9
+ UnsupportedOperation,
10
+ default_subprocess_runner,
11
+ quiet_subprocess_runner,
12
+ )
13
+
14
+ __version__ = '1.0.0'
15
+ __all__ = [
16
+ 'BackendUnavailable',
17
+ 'BackendInvalid',
18
+ 'HookMissing',
19
+ 'UnsupportedOperation',
20
+ 'default_subprocess_runner',
21
+ 'quiet_subprocess_runner',
22
+ 'BuildBackendHookCaller',
23
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/__pycache__/_compat.cpython-311.pyc ADDED
Binary file (425 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/_in_process/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This is a subpackage because the directory is on sys.path for _in_process.py
2
+
3
+ The subpackage should stay as empty as possible to avoid shadowing modules that
4
+ the backend might import.
5
+ """
6
+
7
+ import importlib.resources as resources
8
+
9
+ try:
10
+ resources.files
11
+ except AttributeError:
12
+ # Python 3.8 compatibility
13
+ def _in_proc_script_path():
14
+ return resources.path(__package__, '_in_process.py')
15
+ else:
16
+ def _in_proc_script_path():
17
+ return resources.as_file(
18
+ resources.files(__package__).joinpath('_in_process.py'))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pyproject_hooks/_in_process/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/code_template.cpython-311.pyc ADDED
Binary file (5.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/context.cpython-311.pyc ADDED
Binary file (7.73 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_aoti_c_shim.cpython-311.pyc ADDED
Binary file (18.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_backend_stubs.cpython-311.pyc ADDED
Binary file (27 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_executorch.cpython-311.pyc ADDED
Binary file (46.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_functionalization_type.cpython-311.pyc ADDED
Binary file (41.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_lazy_tensor.cpython-311.pyc ADDED
Binary file (22.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/gen_vmap_plumbing.cpython-311.pyc ADDED
Binary file (15.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/local.cpython-311.pyc ADDED
Binary file (2.18 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/native_function_generation.cpython-311.pyc ADDED
Binary file (25.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/yaml_utils.cpython-311.pyc ADDED
Binary file (1.65 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .lazy_ir import (
2
+ generate_non_native_lazy_ir_nodes as generate_non_native_lazy_ir_nodes,
3
+ GenLazyIR as GenLazyIR,
4
+ GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition,
5
+ GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition,
6
+ )
7
+ from .native_functions import (
8
+ compute_native_function_declaration as compute_native_function_declaration,
9
+ )
10
+ from .register_dispatch_key import (
11
+ gen_registration_headers as gen_registration_headers,
12
+ gen_registration_helpers as gen_registration_helpers,
13
+ RegisterDispatchKey as RegisterDispatchKey,
14
+ )
15
+ from .ufunc import (
16
+ compute_ufunc_cpu as compute_ufunc_cpu,
17
+ compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel,
18
+ compute_ufunc_cuda as compute_ufunc_cuda,
19
+ )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/__pycache__/lazy_ts_lowering.cpython-311.pyc ADDED
Binary file (3.54 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/__pycache__/ufunc.cpython-311.pyc ADDED
Binary file (28.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/lazy_ir.py ADDED
@@ -0,0 +1,707 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ from abc import ABC
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ import torchgen.api.dispatcher as dispatcher
7
+ from torchgen.api.lazy import (
8
+ getValueT,
9
+ isValueType,
10
+ LazyArgument,
11
+ LazyIrProperties,
12
+ LazyIrSchema,
13
+ tensorListValueT,
14
+ )
15
+ from torchgen.api.translate import translate
16
+ from torchgen.api.types import (
17
+ BaseCType,
18
+ Binding,
19
+ deviceT,
20
+ DispatcherSignature,
21
+ kernel_signature,
22
+ NativeSignature,
23
+ OptionalCType,
24
+ VectorCType,
25
+ )
26
+ from torchgen.context import method_with_native_function
27
+ from torchgen.dest.lazy_ts_lowering import ts_lowering_body
28
+ from torchgen.model import (
29
+ Argument,
30
+ BackendIndex,
31
+ BackendMetadata,
32
+ BaseTy,
33
+ BaseType,
34
+ FunctionSchema,
35
+ ListType,
36
+ NativeFunction,
37
+ NativeFunctionsGroup,
38
+ )
39
+
40
+
41
+ def node_ctor_arg_rvalue_string(arg: LazyArgument) -> str:
42
+ """
43
+ Given a LazyArgument,
44
+ generate a c++ string for materializing an rvalue of that arg for passing into
45
+ a lazy Node constructor.
46
+ """
47
+
48
+ # TODO: Matching on CType seems wrong; should be matching on Type
49
+ if isValueType(arg.lazy_type):
50
+ if isinstance(arg.lazy_type, BaseCType):
51
+ if arg.is_wrapped_scalar:
52
+ return f"node_{arg.name}"
53
+ elif arg.lazy_type.type is tensorListValueT:
54
+ return f"lazy_{arg.name}_tensorlist"
55
+ elif arg.is_symint_or_list:
56
+ return f"GetSymIntValue({arg.name})"
57
+ return f"lazy_{arg.name}->GetIrValue()"
58
+ elif isinstance(arg.lazy_type, OptionalCType):
59
+ if arg.is_symint_or_list:
60
+ # TODO: I don't understand when you should put lazy_ in the name
61
+ # or not
62
+ return f"{arg.name} ? c10::make_optional(GetSymIntValue(*{arg.name})) : c10::nullopt"
63
+ elif arg.is_wrapped_scalar:
64
+ return f"node_{arg.name}"
65
+ return (
66
+ f"lazy_{arg.name} ? "
67
+ f"c10::make_optional(lazy_{arg.name}->GetIrValue()) : "
68
+ "c10::nullopt"
69
+ )
70
+ else:
71
+ raise AssertionError(
72
+ f"TODO not sure if there are other valid types to handle here ({arg.lazy_type})"
73
+ )
74
+ else:
75
+ # NB: this is here because right now we aren't treating SymInt[] as a
76
+ # value type; when we do this needs to move above
77
+ # NB: we cannot test arg.lazy_type as we've already specified it is an
78
+ # int64_t and so we cannot distinguish between SymInt and int64_t
79
+ if isinstance(arg.orig_type, ListType) and arg.orig_type.elem == BaseType(
80
+ BaseTy.SymInt
81
+ ):
82
+ if arg.symint:
83
+ return f"GetSymIntArrayRefValue({arg.name})"
84
+ else:
85
+ return f"std::vector<int64_t>({arg.name}.begin(), {arg.name}.end())"
86
+ elif isinstance(arg.lazy_type, VectorCType) and isinstance(
87
+ arg.lazy_type.elem, BaseCType
88
+ ):
89
+ return f"std::vector<{arg.lazy_type.elem.type}>({arg.name}.begin(), {arg.name}.end())"
90
+ elif (
91
+ isinstance(arg.lazy_type, OptionalCType)
92
+ and isinstance(arg.lazy_type.elem, VectorCType)
93
+ and isinstance(arg.lazy_type.elem.elem, BaseCType)
94
+ ):
95
+ return f"torch::lazy::ToOptionalVector<{arg.lazy_type.elem.elem.type}>({arg.name})"
96
+ else:
97
+ return f"{arg.name}"
98
+
99
+
100
+ def node_ctor_inputs(schema: LazyIrSchema) -> str:
101
+ """
102
+ Produce a formatted string with the arguments as passed into the constructor of a node class.
103
+ """
104
+ node_ctor_values = [
105
+ node_ctor_arg_rvalue_string(arg) for arg in schema.filtered_args()
106
+ ]
107
+ return ", ".join(node_ctor_values)
108
+
109
+
110
+ def gen_fallback_code(
111
+ schema: LazyIrSchema,
112
+ sig: Union[DispatcherSignature, NativeSignature],
113
+ overload_name: str,
114
+ ) -> str:
115
+ """
116
+ Generate code that falls back to eager conditioned on a predicate
117
+ """
118
+ dispatcher_sig = DispatcherSignature.from_schema(schema.func)
119
+ exprs = translate(sig.arguments(), dispatcher_sig.arguments())
120
+ fallback_args = ",\n ".join([a.expr for a in exprs])
121
+ if len(overload_name):
122
+ aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})"
123
+ else:
124
+ aten_op_str = f"ATEN_OP({schema.aten_name})"
125
+ return f"""
126
+ if (force_eager_fallback({aten_symbol(schema)})) {{
127
+ return at::native::call_fallback_fn_symint<&ltc_eager_fallback, {aten_op_str}>::call(
128
+ {fallback_args}
129
+ );
130
+ }}
131
+ """
132
+
133
+
134
+ def aten_symbol(schema: LazyIrSchema) -> str:
135
+ missing_interned_strings = {
136
+ "sigmoid_backward",
137
+ }
138
+ if schema.aten_name in missing_interned_strings:
139
+ return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")'
140
+
141
+ if not schema.aten_name.startswith("at::"):
142
+ return f"at::aten::{schema.aten_name}"
143
+ else:
144
+ return schema.aten_name
145
+
146
+
147
+ # converts all tensor-like arguments to meta tensors. Returns:
148
+ # (1) a string containing all of the logic that does the conversions.
149
+ # (2) a context, to be used by translate(), with all of the relevant bindings.
150
+ def convert_to_meta_tensors(sig: DispatcherSignature) -> Tuple[str, List[Binding]]:
151
+ context: List[Binding] = []
152
+ unwrapped_tensor_args: List[str] = []
153
+ for arg in sig.arguments():
154
+ if isinstance(arg.argument, Argument) and arg.argument.type.is_tensor_like():
155
+ unwrapped_name = f"{arg.name}_meta"
156
+ unwrapped_tensor_args.append(
157
+ f"auto {unwrapped_name} = to_meta({arg.name});"
158
+ )
159
+ context.append(arg.with_name(unwrapped_name))
160
+ else:
161
+ context.append(arg)
162
+ unwrap_tensor_args_str = "\n ".join(unwrapped_tensor_args)
163
+ return unwrap_tensor_args_str, context
164
+
165
+
166
+ @dataclass(frozen=True)
167
+ class GenLazyIR(ABC):
168
+ backend_index: BackendIndex
169
+ backend_name: str
170
+ node_base: str
171
+ use_lazy_shape: bool
172
+
173
+ @method_with_native_function
174
+ def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]:
175
+ func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func
176
+ metadata = self.backend_index.get_kernel(
177
+ f.functional if isinstance(f, NativeFunctionsGroup) else f
178
+ )
179
+ schema = LazyIrSchema(
180
+ func, symint=metadata is not None and metadata.supports_symint()
181
+ )
182
+ return self.gen(schema)
183
+
184
+ # there is no lowering functionality generated unless this IR base class is subclassed and
185
+ # implemented as a backend-specific node
186
+ def lowering_function(self, schema: LazyIrSchema) -> str:
187
+ return ""
188
+
189
+ def create_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
190
+ return ""
191
+
192
+ def can_be_reused_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
193
+ return f"""bool CanBeReused({node_ctor_args}) const {{
194
+ return false;
195
+ }}"""
196
+
197
+ def node_base_ctor_call(self, schema: LazyIrSchema) -> str:
198
+ value_args = schema.filtered_args(values=True, scalars=False)
199
+ # backends can customize the way the node base class constructor is called,
200
+ # as long as all of its arguments can be generated from information available from the schema
201
+ base_ctor_value_args_list = []
202
+ for arg in value_args:
203
+ if isinstance(arg.lazy_type, (BaseCType, VectorCType)):
204
+ base_ctor_value_args_list.append(f"{arg.name}")
205
+ elif isinstance(arg.lazy_type, OptionalCType):
206
+ base_ctor_value_args_list.append(f"{arg.name}.value_or(kNullValue)")
207
+ else:
208
+ raise AssertionError(
209
+ f"Unsupported type ({arg.lazy_type}) - add support if necessary"
210
+ )
211
+ base_ctor_value_args = ", ".join(base_ctor_value_args_list)
212
+
213
+ scalar_args = schema.filtered_args(values=False, scalars=True)
214
+
215
+ # Shape construction.
216
+ # Conditionally build shape depending on specified shape property
217
+ if schema.properties.ShapePrecompute:
218
+ shape_ctor_arg = "std::move(shapes),"
219
+ elif schema.properties.ShapeCompute:
220
+ shape_args = [a.name for a in value_args]
221
+ shape_args.extend(a.name for a in scalar_args)
222
+ shape_ctor_arg = f"compute_shape_{schema.name}({', '.join(shape_args)}),"
223
+ elif schema.properties.ShapeCache:
224
+ shape_args = [f"operand({i})" for i in range(len(value_args))]
225
+ shape_args.extend(a.name for a in scalar_args)
226
+ shape_ctor_arg = f"[&](){{ return compute_shape_{schema.name}({', '.join(shape_args)})[0]; }},"
227
+ else:
228
+ shape_ctor_arg = ""
229
+
230
+ scalar_hashes = ", ".join(f"{a.name}" for a in scalar_args)
231
+
232
+ return f"""{self.node_base}(
233
+ {schema.node_name}::ClassOpKind(),
234
+ OpList{{{base_ctor_value_args}}},
235
+ {shape_ctor_arg}
236
+ /* num_outputs */ {len(schema.returns)},
237
+ torch::lazy::MHash({scalar_hashes}))"""
238
+
239
+ def gen(self, schema: LazyIrSchema) -> List[str]:
240
+ opkind = schema.opkind or aten_symbol(schema)
241
+
242
+ # for now, we just want one IR class decl and soon after also the method defs
243
+ # and we use the functional version not out/inplace.
244
+ all_args = schema.filtered_args()
245
+ value_args = schema.filtered_args(values=True, scalars=False)
246
+ scalar_args = schema.filtered_args(values=False, scalars=True)
247
+
248
+ ctor_args = [f"const {i.lazy_type.cpp_type()}& {i.name}" for i in all_args]
249
+ reuse_ctor_args = ", ".join(ctor_args)
250
+ if self.use_lazy_shape and schema.properties.ShapePrecompute:
251
+ ctor_args.append("std::vector<torch::lazy::Shape>&& shapes")
252
+ node_ctor_args = ", ".join(ctor_args)
253
+
254
+ scalar_initializers = ",\n ".join(
255
+ [
256
+ # This code is just special casing the mapping from string_view -> strings
257
+ f"{a.name}({a.name}.has_value() ? c10::make_optional(std::string(*{a.name})) : c10::nullopt)"
258
+ if a.lazy_type.cpp_type() == "c10::optional<c10::string_view>"
259
+ else f"{a.name}({a.name})"
260
+ for a in scalar_args
261
+ ]
262
+ )
263
+ if len(scalar_initializers):
264
+ scalar_initializers = f",\n {scalar_initializers}"
265
+ scalar_decls = "\n ".join(
266
+ [
267
+ f"std::string {a.name};"
268
+ if a.lazy_type.cpp_type() == "c10::string_view"
269
+ else f"c10::optional<std::string> {a.name};"
270
+ if a.lazy_type.cpp_type() == "c10::optional<c10::string_view>"
271
+ else f"{a.lazy_type.cpp_type()} {a.name};"
272
+ for a in scalar_args
273
+ ]
274
+ )
275
+ optional_values = [
276
+ arg.name
277
+ for arg in schema.filtered_args(values=True, scalars=False)
278
+ if isinstance(arg.lazy_type, OptionalCType)
279
+ ]
280
+ has_optional_decls = "\n ".join(
281
+ [f"bool has_{value}: 1;" for value in optional_values]
282
+ )
283
+ has_optional_defs = "\n ".join(
284
+ [f"has_{value} = !!{value};" for value in optional_values]
285
+ )
286
+ members_to_string = []
287
+ for arg in scalar_args:
288
+ if isinstance(arg.lazy_type, OptionalCType):
289
+ value = f"{arg.name}.value()"
290
+ if arg.is_generator:
291
+ value = '"torch.Generator()"'
292
+ members_to_string.append(
293
+ f"""if ({arg.name}.has_value()) {{
294
+ ss << ", {arg.name}=" << {value};
295
+ }} else {{
296
+ ss << ", {arg.name}=null";
297
+ }}"""
298
+ )
299
+ else:
300
+ members_to_string.append(f'ss << ", {arg.name}=" << {arg.name};')
301
+ members_to_string_str = "\n ".join(members_to_string)
302
+
303
+ return [
304
+ f"""\
305
+ class {schema.node_name} : public {self.node_base} {{
306
+ public:
307
+ static torch::lazy::OpKind ClassOpKind() {{
308
+ return torch::lazy::OpKind({opkind});
309
+ }}
310
+
311
+ {schema.node_name}({node_ctor_args})
312
+ : {self.node_base_ctor_call(schema)}{scalar_initializers}
313
+ {{
314
+ {has_optional_defs}
315
+ }}
316
+
317
+ std::string ToString() const override {{
318
+ std::stringstream ss;
319
+ ss << {self.node_base}::ToString();
320
+ {members_to_string_str}
321
+ return ss.str();
322
+ }}
323
+
324
+ {self.create_function(schema, reuse_ctor_args)}
325
+
326
+ {self.can_be_reused_function(schema, reuse_ctor_args)}
327
+
328
+ {self.lowering_function(schema)}
329
+
330
+ {scalar_decls}
331
+ {has_optional_decls}
332
+
333
+ }};
334
+
335
+ """,
336
+ ]
337
+
338
+
339
+ @dataclass(frozen=True)
340
+ class GenTSLazyIR(GenLazyIR):
341
+ def lowering_function(self, schema: LazyIrSchema) -> str:
342
+ signature = """
343
+ torch::lazy::TSOpVector Lower(
344
+ std::shared_ptr<torch::jit::GraphFunction> function,
345
+ torch::lazy::TSLoweringContext* loctx) const override"""
346
+
347
+ if schema.properties.LowerDeclOnly:
348
+ return f"{signature};"
349
+ elif schema.properties.Lower:
350
+ return f"""{signature} {{
351
+ {ts_lowering_body(schema)}
352
+ }}
353
+ """
354
+ else:
355
+ return ""
356
+
357
+ def create_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
358
+ signature = f"static NodePtr Create({node_ctor_args})"
359
+ if schema.properties.CreateFnDeclOnly:
360
+ return f"{signature};"
361
+ elif not schema.properties.CreateFn:
362
+ return ""
363
+ return f"""{signature} {{
364
+ return ReuseOrMakeNode<{schema.node_name}>(data);
365
+ }}"""
366
+
367
+ def can_be_reused_function(self, schema: LazyIrSchema, node_ctor_args: str) -> str:
368
+ signature = f"bool CanBeReused({node_ctor_args}) const"
369
+ if schema.properties.CanBeReusedDeclOnly:
370
+ return f"{signature};"
371
+ elif not schema.properties.CanBeReused:
372
+ return ""
373
+ value_comparison = []
374
+ for arg in itertools.chain(schema.positional_values, schema.keyword_values):
375
+ if isinstance(arg.lazy_type, OptionalCType):
376
+ value_comparison.append(
377
+ f"nullable_operand(i++) == {arg.name}.value_or(kNullValue)"
378
+ )
379
+ else:
380
+ value_comparison.append(f"operand(i++) == {arg.name}")
381
+ for arg in itertools.chain(schema.positional_scalars, schema.keyword_scalars):
382
+ if isinstance(arg.lazy_type, OptionalCType):
383
+ value_comparison.append(
384
+ f"((!this->{arg.name}&&!{arg.name}) || (this->{arg.name}&&{arg.name} && *(this->{arg.name}) == *{arg.name}))"
385
+ )
386
+ else:
387
+ value_comparison.append(f"this->{arg.name} == {arg.name}")
388
+ value_comparison_str = " &&\n ".join(value_comparison)
389
+
390
+ return f"""{signature} {{
391
+ size_t i = 0;
392
+ return ({value_comparison_str});
393
+ }}"""
394
+
395
+
396
+ @dataclass(frozen=True)
397
+ class GenLazyNativeFuncDefinition:
398
+ class_method_name: str
399
+ backend_index: BackendIndex
400
+ tensor_class: str
401
+ gen_forced_fallback_code: bool
402
+ backend_namespace: str
403
+ get_tensorlist: str
404
+ get_tensor_or_wrap_number: str
405
+ try_get_tensor: str
406
+ metrics_counter: str
407
+ create_tensor: str
408
+ create_from_first_tensor: bool
409
+ create_aten_from_ltc_tensor: str
410
+ tuple_aten_from_ltc_tensors: str
411
+ lazy_tensor_ptr: str
412
+ get_device_fn: str
413
+
414
+ def lazy_tensor_decls(self, func: NativeFunction, schema: LazyIrSchema) -> str:
415
+ value_args = schema.filtered_args(values=True, scalars=False)
416
+ # Generates lazy_{name} variables for LazyTensors wrapping input tensors
417
+ lazy_tensor_decls: List[str] = []
418
+ for arg in value_args:
419
+ if arg.is_wrapped_scalar:
420
+ if isinstance(arg.lazy_type, OptionalCType):
421
+ lazy_tensor_decls.append(
422
+ f"""auto node_{arg.name} = {arg.name} ?
423
+ c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->
424
+ GetIrValueForScalarFromCodegen(*{arg.name}, *common_device)):
425
+ c10::nullopt;"""
426
+ )
427
+ else:
428
+ lazy_tensor_decls.append(
429
+ f"""auto node_{arg.name} = torch::lazy::LazyGraphExecutor::Get()->
430
+ GetIrValueForScalarFromCodegen({arg.name}, *common_device);"""
431
+ )
432
+ elif arg.is_symint_or_list:
433
+ continue # values are extracted in isValueType
434
+ elif isinstance(arg.lazy_type, BaseCType):
435
+ if arg.lazy_type.type is tensorListValueT:
436
+ lazy_tensor_decls.append(
437
+ f"auto lazy_{arg.name}_tensorlist = "
438
+ f"{self.backend_namespace}::{self.get_tensorlist}({arg.name});"
439
+ )
440
+ else:
441
+ lazy_tensor_decls.append(
442
+ f"{self.lazy_tensor_ptr} lazy_{arg.name} = "
443
+ f"{self.backend_namespace}::{self.get_tensor_or_wrap_number}({arg.name}, *common_device);"
444
+ )
445
+ elif isinstance(arg.lazy_type, OptionalCType):
446
+ assert arg.lazy_type.elem == BaseCType(getValueT()), arg.lazy_type.elem
447
+ # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it
448
+ # until we encounter a real world example.
449
+ lazy_tensor_decls.append(
450
+ f"{self.lazy_tensor_ptr} lazy_{arg.name} = "
451
+ f"{self.backend_namespace}::{self.try_get_tensor}({arg.name}.value_or(at::Tensor()));"
452
+ )
453
+ else:
454
+ raise AssertionError(
455
+ f"TODO not sure if there are other valid types to handle here ({arg.lazy_type})"
456
+ )
457
+ return ("\n ").join(lazy_tensor_decls)
458
+
459
+ def force_eager_fallback(
460
+ self,
461
+ func: NativeFunction,
462
+ schema: LazyIrSchema,
463
+ metadata: BackendMetadata,
464
+ sig: Union[DispatcherSignature, NativeSignature],
465
+ ) -> str:
466
+ if self.gen_forced_fallback_code:
467
+ return gen_fallback_code(
468
+ schema, sig, overload_name=func.func.name.overload_name
469
+ )
470
+ return ""
471
+
472
+ def metrics(self, func: NativeFunction, schema: LazyIrSchema) -> str:
473
+ return f"{self.metrics_counter};"
474
+
475
+ def get_device(self, func: NativeFunction, schema: LazyIrSchema) -> str:
476
+ value_args = schema.filtered_args(values=True, scalars=False)
477
+ scalar_args = schema.filtered_args(values=False, scalars=True)
478
+ value_types_names = [f"{a.name}" for a in value_args if not a.is_wrapped_scalar]
479
+ optional_device = OptionalCType(BaseCType(deviceT))
480
+ optional_devices = [
481
+ a.name for a in scalar_args if a.lazy_type == optional_device
482
+ ]
483
+ assert (
484
+ len(value_types_names) > 0 or len(optional_devices) > 0
485
+ ), "Expected at least one Value or Device type"
486
+ get_device_str = (
487
+ f"{self.get_device_fn}({', '.join(value_types_names + optional_devices)})"
488
+ )
489
+ return f"""auto common_device = {get_device_str};
490
+ TORCH_INTERNAL_ASSERT(common_device);
491
+ """
492
+
493
+ def shape_inference(self, func: NativeFunction, schema: LazyIrSchema) -> str:
494
+ metadata = self.backend_index.get_kernel(func)
495
+ assert metadata is not None
496
+ all_args = schema.filtered_args()
497
+ returns_length = len(schema.returns)
498
+ # call the meta kernel if it exists, to compute output shape/dtype for our IR
499
+ # Note [Generated LTC Shape Functions]
500
+ # LTC uses meta tensors from core to do shape inference when possible, and otherwise
501
+ # we generate a shape function declaration that needs to be manually implemented.
502
+ # How do we detect which ops are eligible to use meta tensors?
503
+ # In general we should be able to use meta tensors not just on structured operators,
504
+ # but also on composite operators that are implemented in terms of structured kernels.
505
+ # We don't currently have a way of knowing at codegen time which ops are implemented that way.
506
+ # This is the case for all view and view_copy operators however, so we're going to
507
+ # use them specifically for all of the view_copy ops (instead of manually writing shape rules for all of them).
508
+ is_view_copy_op = "view_copy" in func.tags
509
+ is_structured = func.structured or func.structured_delegate is not None
510
+ if is_structured or is_view_copy_op:
511
+ meta_out = """
512
+ std::vector<torch::lazy::Shape> shapes{torch::lazy::Shape(out_meta.scalar_type(), out_meta.sizes().vec())};"""
513
+ if returns_length > 1:
514
+
515
+ def this_shape(i: int) -> str:
516
+ return f"torch::lazy::Shape(std::get<{i}>(out_meta).scalar_type(), std::get<{i}>(out_meta).sizes().vec())"
517
+
518
+ shapes_str = ",".join([this_shape(i) for i in range(returns_length)])
519
+ meta_out = "std::vector<torch::lazy::Shape> shapes{" + shapes_str + "};"
520
+
521
+ # Convert tensor args to the meta device and call it.
522
+ # (We can't pass in the input tensors directly, because they are "functional wrappers".
523
+ # If any of the meta kernels call a tensor op and redispatch, we don't want to hit the functionalize kernels.)
524
+ # Even at::meta:: functions might redispatch, e.g. if they call into view ops.
525
+ dispatcher_sig = DispatcherSignature.from_schema(func.func)
526
+ meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig)
527
+ meta_call_args = [
528
+ e.expr
529
+ for e in translate(
530
+ meta_call_ctx, dispatcher_sig.arguments(), method=False
531
+ )
532
+ ]
533
+ if is_view_copy_op:
534
+ # view_copy ops always have a CompositeExplicitAutogradNonFunctional kernel
535
+ assert func.has_composite_explicit_autograd_non_functional_kernel
536
+ dispatch_ns = "compositeexplicitautogradnonfunctional"
537
+ else:
538
+ dispatch_ns = "meta"
539
+ aten_name = schema.aten_name
540
+ # TODO: this is trolling
541
+ if func.func.has_symint() and metadata.supports_symint():
542
+ aten_name += "_symint"
543
+ shape_str = f"""\
544
+ {meta_conversion_str}
545
+ auto out_meta = at::{dispatch_ns}::{aten_name}({', '.join(meta_call_args)});
546
+ {meta_out}"""
547
+ else:
548
+ shape_sig = ComputeShapeSignature(
549
+ metadata.kernel, func, symint=metadata.supports_symint()
550
+ )
551
+ shape_str = f"""
552
+ auto shapes = {shape_sig.shape_call};"""
553
+
554
+ shape_str += f"""
555
+ TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});"""
556
+
557
+ # Calculating which dimensions are symbolic
558
+ func_schema_str = "aten::" + str(func.func)
559
+ shape_str += f"""
560
+ if(torch::lazy::symbolicShapeEnabled()){{
561
+ std::vector<torch::jit::IValue> inputs = {{ {', '.join(str(a.name) for a in all_args)} }};
562
+ const char* schema_str = "{func_schema_str}";
563
+ applySymbolicShapesOnLT(schema_str, inputs, shapes);
564
+ }}
565
+ """
566
+ return shape_str
567
+
568
+ def build_ir_node(self, func: NativeFunction, schema: LazyIrSchema) -> str:
569
+ node_ctor_input_str = node_ctor_inputs(schema)
570
+ return f"""torch::lazy::NodePtr node = torch::lazy::ReuseNode<{schema.node_name}>({node_ctor_input_str});
571
+ if (!node) {{
572
+ {self.shape_inference(func, schema)}
573
+ node = torch::lazy::MakeNode<{schema.node_name}>({node_ctor_input_str}, std::move(shapes));
574
+ CacheNode(node);
575
+ }}
576
+ """
577
+
578
+ def create_lazy_tensor(self, first_tensor_name: Optional[str] = None) -> str:
579
+ # xla uses an instance method for tensor creation, for the time being
580
+ if self.create_from_first_tensor:
581
+ # TODO(whc) remove this if XLA switches to using static method for creation
582
+ assert (
583
+ first_tensor_name is not None
584
+ ), "Requires first tensor to create lazy tensor"
585
+ return f"{first_tensor_name}.{self.create_tensor}"
586
+ return f"{self.backend_namespace}::{self.create_tensor}"
587
+
588
+ def return_aten_tensor(self, func: NativeFunction, schema: LazyIrSchema) -> str:
589
+ returns_length = len(schema.returns)
590
+ value_args = schema.filtered_args(values=True, scalars=False)
591
+ value_types_names = [f"{a.name}" for a in value_args if not a.is_wrapped_scalar]
592
+ first_tensor_name = value_types_names[0] if len(value_types_names) > 0 else None
593
+ bridge_str = f"""auto result = {self.create_aten_from_ltc_tensor}(
594
+ {self.create_lazy_tensor(first_tensor_name)}(std::move(node), *common_device));"""
595
+
596
+ if returns_length > 1:
597
+ assert (
598
+ len(value_types_names) > 0
599
+ ), "Code below assumes there is at least one tensor arg"
600
+ bridge_str = f"""std::vector<{self.lazy_tensor_ptr}> lazy_tensors;
601
+ for (int i = 0; i < {returns_length}; i++) {{
602
+ lazy_tensors.push_back({self.create_lazy_tensor(first_tensor_name)}({getValueT()}(node, i), *common_device));
603
+ }}
604
+ auto result = {self.tuple_aten_from_ltc_tensors}<{returns_length}>(lazy_tensors);"""
605
+
606
+ if schema.name.name.inplace or func.func.is_out_fn():
607
+ assert returns_length == 1, (
608
+ "We assumed there was no such case where an op is an in-place variant "
609
+ f"and has tuple outputs, but got tuple of len {returns_length}."
610
+ )
611
+ bridge_str = f"""lazy_{first_tensor_name}->SetInPlaceIrValue(node);
612
+ auto& result = {first_tensor_name};"""
613
+
614
+ bridge_str += """
615
+ return result;"""
616
+ return bridge_str
617
+
618
+ @method_with_native_function
619
+ def __call__(self, func: NativeFunction) -> List[str]:
620
+ sig = kernel_signature(func, self.backend_index)
621
+ metadata = self.backend_index.get_kernel(func)
622
+ assert metadata is not None
623
+ schema = LazyIrSchema(func.func, symint=metadata.supports_symint())
624
+ return [
625
+ f"""\
626
+ {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{
627
+ {self.force_eager_fallback(func, schema, metadata, sig)}
628
+ {self.metrics(func, schema)}
629
+ {self.get_device(func, schema)}
630
+ {self.lazy_tensor_decls(func, schema)}
631
+ {self.build_ir_node(func, schema)}
632
+ {self.return_aten_tensor(func, schema)}
633
+ }}\n
634
+ """
635
+ ]
636
+
637
+
638
+ class ComputeShapeSignature:
639
+ """
640
+ Here we use the base name as the suffix of the signature to avoid generating for in-place variants.
641
+ """
642
+
643
+ def __init__(self, kernel_name: str, f: NativeFunction, *, symint: bool):
644
+ self.__schema = LazyIrSchema(f.func, symint=symint)
645
+ self.__dispatch_args = ", ".join(
646
+ [a.decl() for a in dispatcher.arguments(f.func, symint=symint)]
647
+ )
648
+ self.__call_args = ", ".join(
649
+ [f"{arg.name}" for arg in self.__schema.filtered_args(generator=True)]
650
+ )
651
+ self.__kernel_name = kernel_name
652
+
653
+ def __decl_suffix(self) -> str:
654
+ return f"{self.__kernel_name}({self.__dispatch_args})"
655
+
656
+ def __call_suffix(self) -> str:
657
+ return f"{self.__kernel_name}({self.__call_args})"
658
+
659
+ @property
660
+ def shape_decl(self) -> str:
661
+ return f"TORCH_API std::vector<torch::lazy::Shape> compute_shape_{self.__decl_suffix()}"
662
+
663
+ @property
664
+ def shape_call(self) -> str:
665
+ return f"torch::lazy::compute_shape_{self.__call_suffix()}"
666
+
667
+
668
+ @dataclass(frozen=True)
669
+ class GenLazyShapeInferenceDefinition:
670
+ backend_index: BackendIndex
671
+ tensor_class: str
672
+
673
+ @method_with_native_function
674
+ def __call__(self, f: NativeFunction) -> List[str]:
675
+ sig = kernel_signature(f, self.backend_index)
676
+ metadata = self.backend_index.get_kernel(f)
677
+ assert metadata is not None
678
+
679
+ # See Note [Generated LTC Shape Functions]
680
+ is_view_copy_op = "view_copy" in f.tags
681
+ is_structured = f.structured or f.structured_delegate is not None
682
+ if is_structured or is_view_copy_op:
683
+ return []
684
+ else:
685
+ shape_sig = ComputeShapeSignature(
686
+ metadata.kernel, f, symint=metadata.supports_symint()
687
+ )
688
+ return ["\n".join([f"{shape_sig.shape_decl};"])]
689
+
690
+
691
+ def generate_non_native_lazy_ir_nodes(
692
+ non_native: List[Dict[str, Any]], gen_lazy_ir: GenLazyIR
693
+ ) -> List[str]:
694
+ """Generate the non-native lazy IR node classes"""
695
+ nodes = []
696
+ for op in non_native:
697
+ # Set default properties for Non-Native IRs
698
+ properties = LazyIrProperties("ShapeCache", "CanBeReused", "LowerDeclOnly")
699
+ for p in op.get("properties", []):
700
+ setattr(properties, p, True)
701
+
702
+ # non-native is assumed to want symint bindings if you wrote symint
703
+ schema = LazyIrSchema(FunctionSchema.parse(op["func"]), properties, symint=True)
704
+ schema.opkind = op.get("opkind")
705
+ nodes.append(gen_lazy_ir.gen(schema)[0])
706
+
707
+ return nodes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/lazy_ts_lowering.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchgen.api.lazy import LazyArgument, LazyIrSchema
2
+ from torchgen.api.types import OptionalCType
3
+
4
+
5
def ts_lowering_body(schema: LazyIrSchema) -> str:
    """Render the C++ body of a TorchScript-backend ``Lower`` method.

    Builds the positional ``arguments`` and named ``kwarguments`` vectors
    from *schema*, forwards them to ``LowerTSBuiltin`` and returns the
    resulting op vector.
    """
    # for now, we just want one IR class decl and soon after also the method defs
    # and we use the functional version not out/inplace.

    def get_value(arg: LazyArgument) -> str:
        # Optional lazy values may be absent at runtime; guard with has_<name>.
        if isinstance(arg.lazy_type, OptionalCType):
            return f"has_{arg.name} ? loctx->GetOutputOp(operand(i++)) : nullptr"
        return "loctx->GetOutputOp(operand(i++))"

    positional = []
    for arg in schema.positional_args:
        if arg.is_lazy_value:
            positional.append(get_value(arg))
        else:
            positional.append(f'"{arg.name}", {arg.name}')

    kwarg_values = [
        f'"{arg.name}", {get_value(arg)}' for arg in schema.keyword_values
    ]
    kwarg_scalars = [
        f'"{arg.name}", {arg.name}' for arg in schema.keyword_scalars
    ]
    all_kwargs = kwarg_values + kwarg_scalars

    arguments_block = "\n ".join(
        f"arguments.emplace_back({a});" for a in positional
    )
    kwarguments_block = "\n ".join(
        f"kwarguments.emplace_back({a});" for a in all_kwargs
    )
    return f"""\
std::vector<torch::jit::NamedValue> arguments;
std::vector<torch::jit::NamedValue> kwarguments;
arguments.reserve({len(positional)});
kwarguments.reserve({len(all_kwargs)});
size_t i = 0;
{arguments_block}
{kwarguments_block}
torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments);
TORCH_CHECK_EQ({schema.aten_name}_out.size(), {len(schema.returns)});

return {schema.aten_name}_out;
"""
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/native_functions.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union
2
+
3
+ import torchgen.api.meta as meta
4
+ import torchgen.api.structured as structured
5
+ from torchgen.api.types import kernel_signature
6
+
7
+ from torchgen.context import with_native_function_and_index
8
+ from torchgen.model import BackendIndex, NativeFunction, NativeFunctionsGroup
9
+ from torchgen.utils import mapMaybe
10
+
11
+
12
@with_native_function_and_index
def gen_unstructured(f: NativeFunction, backend_index: BackendIndex) -> Optional[str]:
    """Return the forward declaration for *f*'s unstructured kernel, or
    ``None`` when the backend registers no (non-legacy) kernel for it."""
    sig = kernel_signature(f, backend_index)
    metadata = backend_index.get_kernel(f)
    if metadata is None or "legacy::" in metadata.kernel:
        return None
    # External backends get file-local declarations; in-tree ones are exported.
    linkage = "static" if backend_index.external else "TORCH_API"
    return f"{linkage} {sig.decl(name=metadata.kernel)};"
23
+
24
+
25
@with_native_function_and_index
def gen_structured(g: NativeFunctionsGroup, backend_index: BackendIndex) -> List[str]:
    """Return the struct declaration for *g*'s structured kernel class,
    or an empty list when the backend has no kernel for the group."""
    meta_name = meta.name(g)
    out_args = structured.impl_arguments(g)
    metadata = backend_index.get_kernel(g)
    if metadata is None:
        return []
    # External backends do not export the generated struct.
    export_macro = "" if backend_index.external else "TORCH_API "
    impl_params = ", ".join(arg.decl() for arg in out_args)
    return [
        f"""\
struct {export_macro}structured_{metadata.kernel} : public at::meta::structured_{meta_name} {{
void impl({impl_params});
}};
"""
    ]
40
+
41
+
42
# Generates NativeFunctions.h, a list of forward declarations of all
# actual kernel definitions we keep in aten/src/ATen/native/
@with_native_function_and_index
def compute_native_function_declaration(
    g: Union[NativeFunctionsGroup, NativeFunction], backend_index: BackendIndex
) -> List[str]:
    """Return the native-kernel forward declaration(s) for *g*."""
    metadata = backend_index.get_kernel(g)

    # Plain NativeFunction: at most one unstructured declaration.
    if not isinstance(g, NativeFunctionsGroup):
        decl = gen_unstructured(g, backend_index)
        return [decl] if decl is not None else []

    if metadata is not None and metadata.structured:
        if backend_index.external:
            # Structured hasn't been tested with external backends yet.
            raise AssertionError(
                "Structured external backend functions are not implemented yet."
            )
        return gen_structured(g, backend_index)

    # Unstructured group: declare each function that has a kernel.
    return list(
        mapMaybe(lambda f: gen_unstructured(f, backend_index), g.functions())
    )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/register_dispatch_key.py ADDED
@@ -0,0 +1,989 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import textwrap
3
+ from dataclasses import dataclass
4
+ from typing import List, Literal, Optional, Tuple, Union
5
+
6
+ import torchgen.api.cpp as cpp
7
+ import torchgen.api.meta as meta
8
+ import torchgen.api.structured as structured
9
+ from torchgen.api.translate import translate
10
+ from torchgen.api.types import (
11
+ BaseCType,
12
+ Binding,
13
+ ConstRefCType,
14
+ CppSignature,
15
+ CppSignatureGroup,
16
+ DispatcherSignature,
17
+ Expr,
18
+ kernel_signature,
19
+ MutRefCType,
20
+ NamedCType,
21
+ NativeSignature,
22
+ tensorT,
23
+ )
24
+
25
+ from torchgen.context import method_with_native_function, native_function_manager
26
+ from torchgen.model import (
27
+ Argument,
28
+ BackendIndex,
29
+ DeviceCheckType,
30
+ DispatchKey,
31
+ gets_generated_out_inplace_wrapper,
32
+ is_cuda_dispatch_key,
33
+ NativeFunction,
34
+ NativeFunctionsGroup,
35
+ SchemaKind,
36
+ TensorOptionsArguments,
37
+ )
38
+ from torchgen.selective_build.selector import SelectiveBuilder
39
+ from torchgen.utils import assert_never, mapMaybe, Target
40
+
41
+
42
def gen_registration_headers(
    backend_index: BackendIndex,
    per_operator_headers: bool,
    rocm: bool,
) -> List[str]:
    """Return the #include lines needed by Register<Dispatch>.cpp for this
    backend, honoring per-operator-header and ROCm builds."""
    key = backend_index.dispatch_key

    if per_operator_headers:
        headers = ["#include <ATen/ops/as_strided_native.h>"]
    else:
        headers = ["#include <ATen/NativeFunctions.h>"]

    if key in (DispatchKey.CPU, DispatchKey.Meta):
        extra = ["#include <ATen/EmptyTensor.h>"]
    elif key == DispatchKey.CUDA:
        # ROCm builds pull the hipified variant of the CUDA header.
        extra = (
            ["#include <ATen/hip/EmptyTensor.h>"]
            if rocm
            else ["#include <ATen/cuda/EmptyTensor.h>"]
        )
    elif key == DispatchKey.MPS:
        extra = ["#include <ATen/mps/EmptyTensor.h>"]
    elif per_operator_headers:
        extra = [
            "#include <ATen/ops/empty.h>",
            "#include <ATen/ops/empty_strided.h>",
            "#include <ATen/ops/_copy_from_and_resize.h>",
            "#include <ATen/ops/_copy_from.h>",
        ]
    else:
        extra = ["#include <ATen/Functions.h>"]

    return headers + extra
72
+
73
+
74
def gen_empty_impl_names(
    backend_index: BackendIndex,
) -> Tuple[Optional[str], Optional[str]]:
    """Return the (empty, empty_strided) factory-function names used by this
    backend, or (None, None) for backends with no known allocator."""
    key = backend_index.dispatch_key

    # These keys have dedicated at::detail factories named after the key.
    if key in (
        DispatchKey.Meta,
        DispatchKey.CPU,
        DispatchKey.CUDA,
        DispatchKey.MPS,
    ):
        suffix = str(key).lower()
        return f"at::detail::empty_{suffix}", f"at::detail::empty_strided_{suffix}"

    # These fall back to the dispatched public factories.
    if key in (
        DispatchKey.CompositeExplicitAutogradNonFunctional,
        DispatchKey.QuantizedCPU,
        DispatchKey.QuantizedCUDA,
    ):
        return "at::empty", "at::empty_strided"

    return None, None
98
+
99
+
100
def gen_create_out_helper(backend_index: BackendIndex) -> List[str]:
    """Return the create_out() helper definition for this backend, or []
    when no empty-tensor factory is known for it."""
    empty_impl, empty_strided_impl = gen_empty_impl_names(backend_index)
    if empty_impl is None:
        return []

    # Meta kernels must allocate on the meta device regardless of options.
    if backend_index.dispatch_key == DispatchKey.Meta:
        empty_options = "options.device(at::kMeta)"
    else:
        empty_options = "options"

    return [
        f"""
Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{
if (strides.empty()) {{
return {empty_impl}(sizes, {empty_options});
}} else {{
return {empty_strided_impl}(sizes, strides, {empty_options});
}}
}}
"""
    ]
121
+
122
+
123
def gen_maybe_create_proxy_helper(backend_index: BackendIndex) -> List[str]:
    """Return the maybe_create_proxy() helper, or [] for backends without an
    empty_strided factory."""
    _, empty_strided_impl = gen_empty_impl_names(backend_index)
    if empty_strided_impl is None:
        return []
    return [
        f"""
c10::optional<Tensor> maybe_create_proxy(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{
if (out.strides() != strides) {{
return {empty_strided_impl}(sizes, strides, options);
}}
return c10::nullopt;
}}
"""
    ]
139
+
140
+
141
def gen_resize_out_helper(backend_index: BackendIndex) -> List[str]:
    """Return the resize_out() helper definition, or [] for keys that never
    emit out-kernel wrappers."""
    key = backend_index.dispatch_key
    if key == DispatchKey.CompositeExplicitAutogradNonFunctional:
        # The function isn't used by this key (since only functional ops have a kernel for this key),
        # so we need to not include it to avoid a defined-but-not-used error.
        return []
    helper = """
void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
TORCH_CHECK(options.dtype() == out.dtype(),
"Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
TORCH_CHECK(options.device() == out.device(),
"Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
const bool resized = at::native::resize_output(out, sizes);
// Only restride if a resize occurred; otherwise we ignore the (advisory)
// strides from the meta function and directly use the output tensor's
// preexisting strides
if (resized) {
if (!strides.empty()) {
TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
// TODO: avoid the redispatch here
out.as_strided_(sizes, strides);
} else if (options.memory_format_opt().has_value()) {
out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
}
}
}
"""
    return [helper]
169
+
170
+
171
def gen_check_inplace_helper(backend_index: BackendIndex) -> List[str]:
    """Return the check_inplace() helper definition.

    The emitted C++ is identical for every backend; *backend_index* is
    accepted only for interface symmetry with the other helper generators.
    """
    helper = """
void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) {
// These checks are needed on those operators that:
// 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm')
// 2) have particular typing rules (e.g. 'cumsum' and 'cumprod')
// For other operators (e.g. 'add'), 'TensorIterator' already checks
// these things separately.
TORCH_CHECK(options.dtype() == self.dtype(),
"Bad in-place call: ",
"input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match");
TORCH_CHECK(options.device() == self.device(),
"Bad in-place call: ",
"input tensor device ", self.device(), " and output tensor device ", options.device(), " should match");
TORCH_CHECK(sizes == self.sizes(),
"Bad in-place call: ",
"input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match");
}
"""
    return [helper]
192
+
193
+
194
def gen_registration_helpers(backend_index: BackendIndex) -> List[str]:
    """Concatenate every per-backend helper definition that is emitted at
    the top of Register<Dispatch>.cpp, in a fixed order."""
    helpers: List[str] = []
    for gen in (
        gen_create_out_helper,
        gen_resize_out_helper,
        gen_check_inplace_helper,
        gen_maybe_create_proxy_helper,
    ):
        helpers.extend(gen(backend_index))
    return helpers
201
+
202
+
203
# Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp).
#
# - The primary function of this file is to register all of the
#   implementations for the given dispatch key to the dispatcher,
#   so they are available for use in PyTorch. If dispatch is
#   None, we generate schema (def) registrations and catchall
#   registrations.
# - The secondary function of this file is to generate a wrapper
#   around functions. In CPUType these wrappers do nothing
#   (and should be removed), but in other cases they handle
#   DeviceGuard. A small extra benefit of wrappers is they
#   are not overloaded, so they can be used in the registration
#   API without having to disambiguate which overload you want
#   (as would be the case if you directly registered native::
#   functions).
# - The tertiary function of this file is to generate *static*
#   cpp API bindings which can be used to bypass dispatcher
#   directly to kernels, but with user-friendly cpp-style API
@dataclass(frozen=True)
class RegisterDispatchKey:
    """Callable code generator: maps a NativeFunction(sGroup) to the C++
    snippets for one section (self.target) of Register<Dispatch>.cpp."""

    backend_index: BackendIndex

    # Which section of the output file is being generated.
    target: Literal[
        Target.ANONYMOUS_DEFINITION,
        Target.NAMESPACED_DEFINITION,
        Target.NAMESPACED_DECLARATION,
        Target.REGISTRATION,
    ]

    # Selector object to determine which operators to generate
    # registration code for.
    selector: SelectiveBuilder

    # Whether or not we are actually code-genning for ROCm
    rocm: bool

    # Whether or not to generate symint registrations or not. External users
    # of codegen who don't care about symints can set this to false to get
    # non-SymInt codegen
    symint: bool

    # The class that all unstructured native functions live under. This is used to improve
    # compiler error messages when a kernel writer adds a native function with the wrong signature.
    # This is only used in unstructured kernels, since structured kernels already live in a class.
    # Finally, this field is currently Optional because it is only used by external backends.
    # It would be nice if we can add the same logic to in-tree kernels too, but that requires updating
    # all of the existing kernel signatures scattered across aten/src/ATen/native.
    class_method_name: Optional[str]

    # Only set to true in lightweight dispatch. If lightweight dispatch is enabled we are registering
    # operators into JIT op registry, thus we need to avoid generating code to register into the dispatcher.
    skip_dispatcher_op_registration: bool

    @staticmethod
    def gen_device_check(
        type: DeviceCheckType, args: List[Argument], method_name: str
    ) -> str:
        """Emit C++ that checks all tensor-like *args* live on a common
        device (or a no-op comment when the op opts out of the check).

        NOTE(review): the parameter name ``type`` shadows the builtin; kept
        as-is since renaming would change the keyword-argument interface.
        """
        if type == DeviceCheckType.NoCheck:
            return " // No device check\n"

        device_check = "c10::optional<Device> common_device = nullopt;\n"
        device_check += "(void)common_device; // Suppress unused variable warning\n"
        for arg in args:
            # Only tensor like arguments are eligible
            if arg.type.is_tensor_like():
                device_check += f"""
c10::impl::check_and_update_common_device(common_device, {arg.name}, "{method_name}", "{arg.name}");"""
        return device_check

    @method_with_native_function
    def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]:
        """Dispatch to structured or unstructured generation for *f*."""
        if isinstance(f, NativeFunctionsGroup):
            g: NativeFunctionsGroup = f
            # Note: We call gen_structured() if the operator is marked structured, regardless of the backend.
            # gen_structured() has special logic to handle auto-generated kernels.
            if g.structured:
                return self.gen_structured(g)
            else:
                return list(
                    mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())
                )
        elif isinstance(f, NativeFunction):
            r = self.gen_unstructured(f)
            return [] if r is None else [r]
        else:
            assert_never(f)

    def wrapper_kernel_sig(
        self, f: NativeFunction
    ) -> Union[NativeSignature, DispatcherSignature]:
        """Signature of the generated wrapper function for *f*."""
        # The prefix is just to ensure uniqueness. The Dispatcher API doesn't guarantee unique kernel names.
        return DispatcherSignature.from_schema(
            f.func,
            prefix=f"wrapper_{self.backend_index.dispatch_key}_{f.func.name.overload_name}_",
            symint=self.symint,
        )

    def gen_out_inplace_wrapper(
        self, f: NativeFunction, g: Optional[NativeFunctionsGroup]
    ) -> Optional[str]:
        """Emit an out/inplace wrapper that calls the group's functional
        kernel and copies the result(s) back into the output tensor(s)."""
        if g is None:
            return None
        k = f.func.kind()
        if k is SchemaKind.inplace:
            copy_op = "at::_copy_from"
        elif k is SchemaKind.out:
            copy_op = "at::_copy_from_and_resize"
        else:
            raise AssertionError("gen_out_inplace_wrapper called on a functional op")

        sig = self.wrapper_kernel_sig(f)
        name = sig.name()

        func_res = f"{name}_tmp"
        return_names = cpp.return_names(f)
        if len(return_names) > 1:
            # Multiple returns: copy each tuple element into its out tensor.
            updates = "\n ".join(
                f"{copy_op}(std::get<{i}>({func_res}), {ret_name});"
                for i, ret_name in enumerate(return_names)
            )
            returns = f'{sig.returns_type().cpp_type()}({", ".join(return_names)})'
        elif len(return_names) == 1:
            ret_name = return_names[0]
            updates = f"{copy_op}({func_res}, {ret_name});"
            returns = ret_name
        else:
            # Void return: the single out argument is updated in place.
            assert len(f.func.arguments.out) == 1
            returns = ""
            out_arg = f.func.arguments.out[0]
            if out_arg.type.is_list_like():
                updates = f"""\
for (int64_t i = 0; i < {func_res}.size(); ++i) {{
{copy_op}({func_res}[i], {out_arg.name}[i]);
}}"""
            else:
                updates = f"{copy_op}({func_res}, {out_arg.name});"

        functional_sig = self.wrapper_kernel_sig(g.functional)
        wrapper_name = sig.name()

        return f"""\
{sig.defn(name=wrapper_name)} {{
auto {func_res} = {functional_sig.name()}({", ".join(e.expr for e in translate(sig.arguments(), functional_sig.arguments()))});
{updates}
return {returns};
}}
"""

    def gen_structured(self, g: NativeFunctionsGroup) -> List[str]:
        """Generate code for a structured group, delegating the per-function
        work to StructuredRegisterDispatchKey."""
        metadata = self.backend_index.get_kernel(g)
        if self.backend_index.dispatch_key == DispatchKey.Meta:
            assert not self.backend_index.has_kernel(g.out), (
                "Do not explicitly specify Meta dispatch key on structured "
                "functions, they will be automatically generated for you"
            )
        elif (
            self.backend_index.dispatch_key
            == DispatchKey.CompositeExplicitAutogradNonFunctional
        ):
            assert not self.backend_index.has_kernel(g.out), (
                "Do not explicitly specify CompositeExplicitAutograd dispatch key on structured "
                "functions, they will be automatically generated for you"
            )
        elif metadata is None or not metadata.structured:
            # The group isn't actually structured for this backend; fall back
            # to unstructured generation per function.
            return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions()))
        structured_gen = StructuredRegisterDispatchKey(
            self.backend_index,
            self.target,
            self.selector,
            self.rocm,
            self.symint,
            self.class_method_name,
            self.skip_dispatcher_op_registration,
            g,
        )
        return list(mapMaybe(structured_gen.gen_one, g.functions()))

    def gen_unstructured(
        self, f: NativeFunction, g: Optional[NativeFunctionsGroup] = None
    ) -> Optional[str]:
        """Generate the snippet for an unstructured function *f* for the
        current self.target, or None when nothing should be emitted."""
        with native_function_manager(f):
            inplace_meta = False
            gets_out_inplace_wrapper = False
            if not self.backend_index.has_kernel(f):
                if (
                    self.backend_index.dispatch_key == DispatchKey.Meta
                    and f.func.kind() is SchemaKind.inplace
                    and
                    # Defer to composites for meta implementation
                    not f.has_composite_kernel
                    and
                    # Inplace list operations are not supported
                    len(f.func.returns) == 1
                ):
                    inplace_meta = True
                elif (
                    not self.backend_index.use_out_as_primary
                    and g is not None
                    and gets_generated_out_inplace_wrapper(f, g, self.backend_index)
                ):
                    # We want to generate inplace/out wrappers, that don't have a kernel for the backend.
                    gets_out_inplace_wrapper = True
                else:
                    return None
            if f.manual_kernel_registration:
                return None

            if (
                self.target is Target.REGISTRATION
                and not self.selector.is_native_function_selected(f)
            ):
                return None

            sig = self.wrapper_kernel_sig(f)

            name = sig.name()
            returns_type = sig.returns_type().cpp_type()
            args = sig.arguments()
            args_str = ", ".join(a.defn() for a in args)

            # See Note [Direct dispatch bindings]
            cpp_sig_group = CppSignatureGroup.from_native_function(
                f, method=False, fallback_binding=False
            )

            # TODO: dedupe this with the structured codegen
            if self.target is Target.NAMESPACED_DECLARATION:
                result = ""
                for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
                    result += f"TORCH_API {cpp_sig.decl()};\n"
                return result
            elif self.target is Target.NAMESPACED_DEFINITION:

                def generate_defn(cpp_sig: CppSignature) -> str:
                    # Thin namespaced shim that forwards to the wrapper.
                    return f"""
{cpp_sig.defn()} {{
return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))});
}}
"""

                result = ""
                for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
                    result += generate_defn(cpp_sig)
                return result

            elif self.target is Target.ANONYMOUS_DEFINITION:
                # short circuit for inplace_meta
                if inplace_meta:
                    assert f.func.arguments.self_arg is not None
                    self_arg_name = f.func.arguments.self_arg.argument.name
                    # TODO: handle in place on tensor list
                    return f"""
{returns_type} {name}({args_str}) {{
TORCH_CHECK_NOT_IMPLEMENTED({self_arg_name}.is_meta(),
"Cannot inplace into non-meta tensor with meta tensor argument");
return {self_arg_name};
}}
"""

                # short circuit for generated inplace/out wrappers
                if gets_out_inplace_wrapper:
                    return self.gen_out_inplace_wrapper(f, g)

                metadata = self.backend_index.get_kernel(f)
                if metadata is None:
                    return None
                if self.class_method_name is None:
                    impl_name = f"{metadata.cpp_namespace}::{metadata.kernel}"
                else:
                    impl_name = f"{metadata.cpp_namespace}::{self.class_method_name}::{metadata.kernel}"

                kernel_sig = kernel_signature(f, self.backend_index)

                args_exprs_str = ", ".join(
                    e.expr
                    for e in translate(
                        sig.arguments(), kernel_sig.arguments(), method=False
                    )
                )

                device_check = " // No device check\n"
                # Backends that require device guards presumably also require device checks.
                if self.backend_index.device_guard:
                    device_check_args = itertools.chain(
                        f.func.arguments.out, f.func.arguments.flat_positional
                    )
                    device_check = RegisterDispatchKey.gen_device_check(
                        f.device_check, list(device_check_args), name
                    )

                device_guard = "// DeviceGuard omitted"  # default
                if f.device_guard and self.backend_index.device_guard:
                    has_tensor_options = any(
                        isinstance(a, TensorOptionsArguments)
                        for a in f.func.arguments.non_out
                    )
                    if has_tensor_options:
                        # kernel is creating a tensor
                        device_guard = """
const DeviceGuard device_guard(device_or_default(device));"""

                        # CUDA requires special handling
                        if is_cuda_dispatch_key(self.backend_index.dispatch_key):
                            device_guard = (
                                f"globalContext().lazyInitCUDA();\n{device_guard}"
                            )
                    else:
                        # kernel is operating on existing tensors

                        # There is precedence for which argument we use to do
                        # device guard. This describes the precedence order.
                        self_arg = (
                            [f.func.arguments.self_arg.argument]
                            if f.func.arguments.self_arg is not None
                            else []
                        )
                        candidate_args = itertools.chain(
                            self_arg,
                            f.func.arguments.out,
                            f.func.arguments.flat_positional,
                        )

                        # Only tensor like arguments are eligible
                        device_of = next(
                            (
                                f"{a.name}"
                                for a in candidate_args
                                if a.type.is_tensor_like()
                            ),
                            None,
                        )
                        if device_of is not None:
                            device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));"

                return f"""\
namespace {{

{returns_type} {name}({args_str}) {{
{device_check}

{device_guard}
return {impl_name}({args_exprs_str});
}}

}} // anonymous namespace
"""

            elif self.target is Target.REGISTRATION:
                if f.manual_kernel_registration or self.skip_dispatcher_op_registration:
                    return None
                else:
                    payload = f"TORCH_FN({name})"
                    return f'm.impl("{f.func.name}",\n{payload});\n'
            else:
                assert_never(self.target)
558
+
559
+
560
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
561
+ #
562
+ # STRUCTURED
563
+ #
564
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
565
+
566
+
567
+ @dataclass(frozen=True)
568
+ class StructuredRegisterDispatchKey(RegisterDispatchKey):
569
+ g: NativeFunctionsGroup
570
+
571
    def gen_class_set_output_functions(
        self, k: SchemaKind, parent_class: str, generate_super: bool
    ) -> str:
        """Render both set_output overrides for the generated structured
        kernel class.

        Only the "strided" variant may substitute a proxy output tensor
        (maybe_create_proxy=True); "raw_strided" never does.
        """
        if generate_super:
            set_output_super = f"{parent_class}::set_output_raw_strided(output_idx, sizes, strides, options, names);"
        else:
            set_output_super = ""

        def gen_set_output_function(name: str, maybe_create_proxy: bool) -> str:
            # Shared template for the two overrides; the body comes from
            # gen_class_set_output_body and depends on the schema kind.
            return f"""
void set_output_{name}(
int64_t output_idx, IntArrayRef sizes, IntArrayRef strides,
TensorOptions options, DimnameList names
) override {{
{textwrap.indent(self.gen_class_set_output_body(k, maybe_create_proxy), " ")}
if (!names.empty()) {{
namedinference::propagate_names(outputs_[output_idx], names);
}}
// super must happen after, so that downstream can use maybe_get_output
// to retrieve the output
{textwrap.indent(set_output_super, " ")}
}}
"""

        return f"""
{gen_set_output_function("strided", maybe_create_proxy=True)}
{gen_set_output_function("raw_strided", maybe_create_proxy=False)}
"""
599
+
600
    def gen_class_set_output_body(self, k: SchemaKind, maybe_create_proxy: bool) -> str:
        """Render the body of a set_output override for schema kind *k*:
        allocate (functional), validate in place (inplace), or resize (out)."""
        # Device-guarded backends must pin the guard to the output's device
        # before touching it; multi-device outputs are unsupported.
        if self.backend_index.dispatch_key in [
            DispatchKey.CUDA,
            DispatchKey.MPS,
            DispatchKey.CompositeExplicitAutogradNonFunctional,
        ]:
            maybe_set_guard = """
auto current_device = guard_.current_device();
if (C10_UNLIKELY(current_device.has_value())) {
TORCH_INTERNAL_ASSERT(*current_device == options.device(),
"structured kernels don't support multi-device outputs");
} else {
guard_.reset_device(options.device());
}
"""
            maybe_set_guard_line = maybe_set_guard + "\n"
        else:
            maybe_set_guard_line = maybe_set_guard = ""

        if maybe_create_proxy:
            # Wrong-strided user outputs are replaced by a proxy tensor that
            # the kernel writes into; see maybe_create_proxy helper.
            create_proxy = """
auto maybe_proxy = maybe_create_proxy(out, sizes, strides, options);
if (C10_UNLIKELY(maybe_proxy.has_value())) {
proxy_outputs_[output_idx] = std::move(maybe_proxy).value();
}
"""
        else:
            create_proxy = ""

        if k is SchemaKind.functional:
            assert self.backend_index.dispatch_key in (
                DispatchKey.Meta,
                DispatchKey.CPU,
                DispatchKey.CUDA,
                DispatchKey.MPS,
                DispatchKey.CompositeExplicitAutogradNonFunctional,
            )
            return f"""{maybe_set_guard_line}
outputs_[output_idx] = create_out(sizes, strides, options);"""
        elif k is SchemaKind.inplace:
            return f"""{maybe_set_guard_line}
const auto& out = outputs_[output_idx].get();
check_inplace(out, sizes, options);
{create_proxy}"""
        elif k is SchemaKind.out:
            return f"""{maybe_set_guard_line}
const auto& out = outputs_[output_idx].get();
resize_out(out, sizes, strides, options);
{create_proxy}"""
        elif k is SchemaKind.mutable or k is SchemaKind.scratch:
            raise AssertionError(
                f"{k} structured operators are currently not supported"
            )
        else:
            assert_never(k)
655
+
656
+ # returns the definition of a ctor, as well as how to construct
657
+ # this class to a variable named op
658
+ def gen_class_ctor(self, k: SchemaKind, class_name: str, returns: int) -> str:
659
+ if k is SchemaKind.functional:
660
+ return ""
661
+ elif k is SchemaKind.inplace:
662
+ # TODO: Make sure out argument is guaranteed to be self
663
+ return f"{class_name}(Tensor& self) : outputs_{{std::ref(self)}} {{}}"
664
+ elif k is SchemaKind.out:
665
+ out_args = ", ".join(f"Tensor& out{i}" for i in range(returns))
666
+ out_refs = ", ".join(f"std::ref(out{i})" for i in range(returns))
667
+ return f"{class_name}({out_args}) : outputs_{{ {out_refs} }} {{}}"
668
+ elif k is SchemaKind.mutable or k is SchemaKind.scratch:
669
+ raise AssertionError(
670
+ f"{k} structured operators are currently not supported"
671
+ )
672
+ else:
673
+ assert_never(k)
674
+
675
    def gen_class(
        self,
        f: NativeFunction,
        k: SchemaKind,
        *,
        class_name: str,
        parent_class: str,
        generate_super: bool,
    ) -> str:
        """Render the whole structured-kernel helper class for *f*: ctor,
        set_output overrides, maybe_get_output, and the output/guard fields."""
        # NOTE(review): mutable/scratch kinds leave output_type/output_value/
        # proxy_field unbound; gen_class_ctor raises for those kinds first
        # (see the type: ignore markers below).
        if k is SchemaKind.functional:
            output_type = "Tensor"
            output_value = "outputs_[output_idx]"
            proxy_field = ""
        elif k is SchemaKind.inplace:
            output_type = "std::reference_wrapper<Tensor>"
            output_value = "proxy_outputs_[output_idx].has_value() ? *proxy_outputs_[output_idx] : outputs_[output_idx].get()"
            proxy_field = f"std::array<c10::optional<Tensor>, {len(f.func.returns)}> proxy_outputs_;"
        elif k is SchemaKind.out:
            output_type = "std::reference_wrapper<Tensor>"
            output_value = "proxy_outputs_[output_idx].has_value() ? *proxy_outputs_[output_idx] : outputs_[output_idx].get()"
            proxy_field = f"std::array<c10::optional<Tensor>, {len(f.func.returns)}> proxy_outputs_;"

        # Pick the device-guard member matching the backend.
        if self.backend_index.dispatch_key == DispatchKey.CUDA:
            if self.rocm:
                guard_field = "c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;"
            else:
                guard_field = "c10::cuda::OptionalCUDAGuard guard_;"
        elif (
            self.backend_index.dispatch_key
            == DispatchKey.CompositeExplicitAutogradNonFunctional
        ):
            guard_field = "c10::OptionalDeviceGuard guard_;"
        elif self.backend_index.dispatch_key == DispatchKey.MPS:
            # TODO: Move to OptionalMPSGuard.
            guard_field = "c10::OptionalDeviceGuard guard_;"
        else:
            guard_field = ""

        indent = " " * 4
        class_ctor_str = self.gen_class_ctor(k, class_name, len(f.func.returns))
        lines = (
            f"struct {class_name} final : public {parent_class} {{",
            f"{textwrap.indent(class_ctor_str, indent)}",
            f"{textwrap.indent(self.gen_class_set_output_functions(k, parent_class, generate_super), indent)}",
            " const Tensor& maybe_get_output(int64_t output_idx) override {",
            f" return {output_value};\n",  # type: ignore[possibly-undefined]  # TODO: audit
            " }",
            f" std::array<{output_type}, {len(f.func.returns)}> outputs_;",  # type: ignore[possibly-undefined]  # TODO: audit
            f"{textwrap.indent(proxy_field, indent)}",  # type: ignore[possibly-undefined]  # TODO: audit
            f"{textwrap.indent(guard_field, indent)}",
            "};",
        )
        # Empty strings (e.g. no ctor, no guard) are dropped from the output.
        return "\n".join(line for line in lines if line)
728
+
729
+ @method_with_native_function
730
+ def gen_one(self, f: NativeFunction) -> Optional[str]:
731
+ assert not f.manual_kernel_registration
732
+
733
+ if (
734
+ self.target is Target.REGISTRATION
735
+ and not self.selector.is_native_function_selected(f)
736
+ ):
737
+ return None
738
+
739
+ # TODO: Now, there is something interesting going on here. In the code below,
740
+ # we generate CompositeExplicitAutogradNonFunctional implementations of functional and inplace
741
+ # based on the out implementation. But in fact, out is definable by
742
+ # functional too (just not very efficiently), and this is honestly the
743
+ # MORE likely situation for a backend implementor. How do we pick?
744
+ # Well, taking a page from Haskell type classes and default methods,
745
+ # we could conceivably register a circular definition (out in terms
746
+ # of functional, and functional in terms of out) and just require
747
+ # someone to implement one or the other. We'd have to do a little bit
748
+ # of work to not register one of these "weak" definitions unless there
749
+ # is a strong definition somewhere in the DAG! So it's not implemented yet.
750
+ if (
751
+ self.backend_index.dispatch_key
752
+ == DispatchKey.CompositeExplicitAutogradNonFunctional
753
+ and f.func.kind() is SchemaKind.out
754
+ ):
755
+ # Never generate a default implementation for out, that's what you
756
+ # have to define as a backend implementor
757
+ return None
758
+
759
+ # Note [Direct dispatch bindings]
760
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
761
+ # Signature of the non-dispatched function we'll expose in a header
762
+ # (e.g., at::cpu::add). We don't generate methods (TODO: do this
763
+ # when CPUTensor class is a thing); nor do we generate fallback
764
+ # bindings for manual_cpp_binding functions.
765
+ cpp_sig_group = CppSignatureGroup.from_native_function(
766
+ f, method=False, fallback_binding=False
767
+ )
768
+
769
+ # Signature of the wrapper function we'll register to the dispatcher
770
+ kern = self.backend_index.get_kernel(f)
771
+ sig = NativeSignature(
772
+ f.func,
773
+ prefix=f"wrapper_{self.backend_index.dispatch_key}_",
774
+ symint=kern is not None and kern.supports_symint(),
775
+ )
776
+
777
+ if self.target is Target.NAMESPACED_DECLARATION:
778
+ result = ""
779
+ for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
780
+ result += f"TORCH_API {cpp_sig.decl()};\n"
781
+ return result
782
+
783
+ elif self.target is Target.NAMESPACED_DEFINITION:
784
+
785
+ def generate_defn(cpp_sig: CppSignature) -> str:
786
+ return f"""
787
+ {cpp_sig.defn()} {{
788
+ return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))});
789
+ }}
790
+ """
791
+
792
+ result = ""
793
+ for cpp_sig in cpp_sig_group.signatures(symint=self.symint):
794
+ result += generate_defn(cpp_sig)
795
+ return result
796
+
797
+ elif self.target is Target.ANONYMOUS_DEFINITION:
798
+ k = f.func.kind()
799
+
800
+ # Construct the body of the wrapper function with signature sig
801
+ sig_body = []
802
+ # We'll use context to keep track of any variables we've brought
803
+ # into scope while generating code
804
+ context: List[Union[Binding, Expr]] = list(sig.arguments())
805
+
806
+ # Initialize the class corresponding to this structured
807
+ # operator; feeding it the output argument(s) if it is known
808
+ if self.backend_index.dispatch_key is DispatchKey.Meta:
809
+ class_name = f"structured_{meta.name(self.g)}_meta_{k.name}"
810
+ parent_class = f"at::meta::structured_{meta.name(self.g)}"
811
+ elif (
812
+ self.backend_index.dispatch_key
813
+ is DispatchKey.CompositeExplicitAutogradNonFunctional
814
+ ):
815
+ # TODO: dedup this branch
816
+ class_name = f"structured_{meta.name(self.g)}_default_backend_{k.name}"
817
+ parent_class = f"at::meta::structured_{meta.name(self.g)}"
818
+ else:
819
+ metadata = self.backend_index.get_kernel(self.g)
820
+ assert metadata is not None
821
+ class_name = f"structured_{metadata.kernel}_{k.name}"
822
+ parent_class = f"{metadata.cpp_namespace}::structured_{metadata.kernel}"
823
+
824
+ if self.backend_index.device_guard:
825
+ device_check_args = itertools.chain(
826
+ f.func.arguments.out, f.func.arguments.flat_positional
827
+ )
828
+ sig_body.append(
829
+ RegisterDispatchKey.gen_device_check(
830
+ f.device_check, list(device_check_args), sig.name()
831
+ )
832
+ )
833
+
834
+ if k is SchemaKind.functional:
835
+ sig_body.append(f"{class_name} op;")
836
+ elif k is SchemaKind.inplace:
837
+ sig_body.append(f"{class_name} op(self);")
838
+ elif k is SchemaKind.out:
839
+ out_args_str = ", ".join(a.name for a in f.func.arguments.out)
840
+ sig_body.append(f"{class_name} op({out_args_str});")
841
+
842
+ # Translate the input native arguments into structured
843
+ # arguments for the meta call
844
+ meta_exprs = ", ".join(
845
+ e.expr
846
+ for e in translate(
847
+ context, structured.meta_arguments(self.g), method=False
848
+ )
849
+ )
850
+
851
+ if self.g.out.precomputed:
852
+ # If this function group has precomputed elements, the meta function
853
+ # returns a struct containing them which must be saved so that it
854
+ # can be unpacked when generating code to call the impl.
855
+ sig_body.append(f"auto precompute = op.meta({meta_exprs});")
856
+
857
+ # Put all of the contents of the precompute struct into the context
858
+ # so that translate will be able to return the correct args for the
859
+ # call to the impl.
860
+ precomputed_values = [
861
+ *self.g.out.precomputed.replace.values(),
862
+ self.g.out.precomputed.add,
863
+ ]
864
+ for precomputed_elems in precomputed_values:
865
+ for arg in precomputed_elems:
866
+ context.append(
867
+ Expr(
868
+ expr=f"precompute.{arg.name}",
869
+ type=structured.argument_type(arg, binds=arg.name),
870
+ )
871
+ )
872
+
873
+ # Add a use of the precompute struct so FB internal compilers don't
874
+ # complain that there is an unused variable.
875
+ sig_body.append("(void)precompute;")
876
+ else:
877
+ sig_body.append(f"op.meta({meta_exprs});")
878
+
879
+ # After running meta, op.outputs_ is guaranteed to be valid;
880
+ # add it to the context
881
+ out_args = structured.out_arguments(self.g)
882
+ for i, out_arg in enumerate(out_args):
883
+ assert ConstRefCType(BaseCType(tensorT)) == out_arg.nctype.type
884
+
885
+ if k is SchemaKind.out:
886
+ expr = f"op.maybe_get_output({i})"
887
+ else:
888
+ expr = f"op.outputs_[{i}]"
889
+
890
+ context.append(
891
+ Expr(
892
+ expr=expr,
893
+ # TODO: Stop hardcoding that the output type is a Tensor. Note
894
+ # that for the codegen here this is fine because outputs_ is
895
+ # hardcoded to be tensor already
896
+ type=NamedCType(
897
+ out_arg.nctype.name, MutRefCType(BaseCType(tensorT))
898
+ ),
899
+ )
900
+ )
901
+
902
+ # With the expanded context, do the impl call (if not a meta
903
+ # function)
904
+ if (
905
+ self.backend_index.dispatch_key
906
+ == DispatchKey.CompositeExplicitAutogradNonFunctional
907
+ ):
908
+ # TODO: https://github.com/pytorch/pytorch/issues/53023
909
+ out_sig_group = CppSignatureGroup.from_native_function(
910
+ self.g.out, method=False, fallback_binding=f.manual_cpp_binding
911
+ )
912
+ out_sig = out_sig_group.most_faithful_signature()
913
+ api_name = out_sig.name()
914
+ out_exprs = ", ".join(
915
+ e.expr
916
+ for e in translate(context, out_sig.arguments(), method=False)
917
+ )
918
+ # TODO: I think this means structured won't work with method
919
+ # only functions (but maybe you're saved by faithful? iunno.)
920
+ # NB: Originally I wrote this as an at::redispatch call, but
921
+ # I got in trouble because that meant I needed a DispatchKeySet
922
+ # in the wrapper function, which meant I needed a DispatchKeySet
923
+ # in the DispatchKeyFunctions declarations, but the defined API
924
+ # there does NOT permit a dispatch key set. I think you can
925
+ # probably unwind this by calling some function to do the TLS
926
+ # fetch and get the DispatchKeySet when you don't have it, but
927
+ # I didn't do it for this version
928
+ sig_body.append(f"at::{api_name}({out_exprs});")
929
+ elif self.backend_index.dispatch_key != DispatchKey.Meta:
930
+ impl_exprs = ", ".join(
931
+ e.expr
932
+ for e in translate(
933
+ context, structured.impl_arguments(self.g), method=False
934
+ )
935
+ )
936
+ sig_body.append(f"op.impl({impl_exprs});")
937
+
938
+ # Go over each output, and check if there is a proxy created for it.
939
+ # If so, copy it over to the original output.
940
+ if k is SchemaKind.out or k is SchemaKind.inplace:
941
+ for i in range(len(f.func.returns)):
942
+ sig_body.append(
943
+ f"if (op.proxy_outputs_[{i}].has_value()) op.outputs_[{i}].get().copy_(*op.proxy_outputs_[{i}]);"
944
+ )
945
+
946
+ # Destructively return the final tensors
947
+ # TODO: Do this in translate instead
948
+ if k is SchemaKind.functional:
949
+ if len(f.func.returns) == 1:
950
+ ret_expr = "std::move(op.outputs_[0])" # small optimization
951
+ else:
952
+ moved = ", ".join(
953
+ f"std::move(op.outputs_[{i}])"
954
+ for i in range(len(f.func.returns))
955
+ )
956
+ ret_expr = f"std::make_tuple({moved})"
957
+ elif k is SchemaKind.inplace:
958
+ ret_expr = "self"
959
+ elif k is SchemaKind.out:
960
+ if len(f.func.returns) == 1:
961
+ ret_expr = f.func.arguments.out[0].name
962
+ else:
963
+ refs = ", ".join(a.name for a in f.func.arguments.out)
964
+ ret_expr = f"std::forward_as_tuple({refs})"
965
+ sig_body.append(f"return {ret_expr};") # type: ignore[possibly-undefined] # TODO: audit
966
+
967
+ sig_body_str = "\n".join(sig_body)
968
+
969
+ # For an overview of what this template code looks like, see
970
+ # https://github.com/pytorch/rfcs/pull/9
971
+ return f"""\
972
+ {self.gen_class(
973
+ f, k,
974
+ class_name=class_name,
975
+ parent_class=parent_class,
976
+ generate_super=self.g.out.structured_inherits is not None
977
+ )}
978
+
979
+ {sig.defn()} {{
980
+ {sig_body_str}
981
+ }}
982
+ """
983
+
984
+ elif self.target is Target.REGISTRATION:
985
+ return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));'
986
+ else:
987
+ assert_never(self.target)
988
+ # Silence mypy's "Missing return statement" error
989
+ return None
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/dest/ufunc.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
3
+
4
+ import torchgen.api.ufunc as ufunc
5
+ from torchgen.api.translate import translate
6
+ from torchgen.api.types import (
7
+ BaseCType,
8
+ Binding,
9
+ CType,
10
+ Expr,
11
+ NamedCType,
12
+ opmath_t,
13
+ scalar_t,
14
+ StructuredImplSignature,
15
+ VectorizedCType,
16
+ )
17
+ from torchgen.api.ufunc import UfunctorBindings
18
+ from torchgen.context import with_native_function
19
+ from torchgen.model import (
20
+ Argument,
21
+ BaseTy,
22
+ BaseType,
23
+ DispatchKey,
24
+ NativeFunctionsGroup,
25
+ ScalarType,
26
+ UfuncKey,
27
+ )
28
+ from torchgen.utils import OrderedSet
29
+
30
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
31
+ #
32
+ # CUDA STUFF
33
+ #
34
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
35
+
36
+ # NB: not bothering to generate dispatch stub forward declaration in header,
37
+ # we can just paste it whereever necessary
38
+
39
+ # TODO: use BackendIndex
40
+ # dispatch_key: DispatchKey # only CPU/CUDA right now
41
+
42
+
43
+ # Represents functors for implementing CUDA ufuncs.
44
+ # Functors are templated by scalar_t because when USERS instantiate functors
45
+ # they are templated. A functor looks something like this:
46
+ #
47
+ # template <typename scalar_t>
48
+ # struct CUDAFunctorOnSelf_add {
49
+ # using opmath_t = at::opmath_type<scalar_t>;
50
+ # opmath_t other_;
51
+ # opmath_t alpha_;
52
+ # CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha)
53
+ # : other_(other), alpha_(alpha) {}
54
+ # __device__ scalar_t operator()(scalar_t self) {
55
+ # return ufunc::add(static_cast<opmath_t>(self), other_, alpha_);
56
+ # }
57
+ # };
58
+ #
59
+ @dataclass(frozen=True)
60
+ class UfunctorSignature:
61
+ g: NativeFunctionsGroup
62
+ scalar_tensor_idx: Optional[int]
63
+ name: str
64
+
65
+ def arguments(self) -> UfunctorBindings:
66
+ return ufunc.ufunctor_arguments(
67
+ self.g, scalar_tensor_idx=self.scalar_tensor_idx, scalar_t=scalar_t
68
+ )
69
+
70
+ def fields(self) -> List[Binding]:
71
+ # fields are renamed to have a trailing underscore, as is conventional
72
+ return [b.rename(f"{b.name}_") for b in self.arguments().ctor]
73
+
74
+ def returns_type(self) -> CType:
75
+ # TODO: don't hardcode; return type will be inferred based on tags on
76
+ # the native function
77
+ return BaseCType(scalar_t)
78
+
79
+ def decl_fields(self) -> str:
80
+ return "\n".join(f"{f.type} {f.name};" for f in self.fields())
81
+
82
+ def inline_defn_ctor(self) -> str:
83
+ args_str = ", ".join(a.decl() for a in self.arguments().ctor)
84
+ # NB: hypothetically could do this with translate but the
85
+ # transition here is very regular
86
+ init_str = ", ".join(f"{a.name}_({a.name})" for a in self.arguments().ctor)
87
+ return f"{self.name}({args_str}) : {init_str} {{}}"
88
+
89
+ def decl_apply(self) -> str:
90
+ args_str = ", ".join(a.decl() for a in self.arguments().apply)
91
+ return f"{self.returns_type().cpp_type()} operator()({args_str}) const"
92
+
93
+
94
+ @dataclass(frozen=True)
95
+ class UfuncSignature:
96
+ g: NativeFunctionsGroup
97
+ name: str
98
+ compute_t: CType
99
+
100
+ def arguments(self) -> List[Binding]:
101
+ return ufunc.ufunc_arguments(self.g, compute_t=self.compute_t)
102
+
103
+ def call(self, ctx: Sequence[Union[Binding, Expr]]) -> str:
104
+ return f"{self.name}({', '.join(a.expr for a in translate(ctx, self.arguments()))})"
105
+
106
+
107
+ # steps:
108
+ # 1. take the functional signature
109
+ # 2. use api.ufunc to convert it to template signature. this establishes
110
+ # the type of the template function
111
+ # 3. use api.ufunc (II) to generate a split struct / operator() signature.
112
+ # this establish context in which we call the template signature
113
+ #
114
+ # StructuredImplSignature context
115
+ # ~> functor constructor sig
116
+ #
117
+ # Functor constructor context
118
+ # ~> functor fields sig
119
+ #
120
+ # Functor apply context (functor fields + functor apply sig)
121
+ # ~> template sig
122
+ #
123
+
124
+
125
+ def eligible_for_binary_scalar_specialization(g: NativeFunctionsGroup) -> bool:
126
+ num_tensors = sum(
127
+ 1 for a in g.functional.func.arguments.flat_non_out if a.type.is_tensor_like()
128
+ )
129
+ return num_tensors == 2
130
+
131
+
132
+ def compute_ufunc_cuda_functors(
133
+ g: NativeFunctionsGroup,
134
+ ) -> Tuple[Dict[ScalarType, Dict[UfuncKey, UfunctorSignature]], str]:
135
+ # First, build the functors.
136
+ ufunctor_sigs: Dict[ScalarType, Dict[UfuncKey, UfunctorSignature]] = {}
137
+ ufunctors: List[str] = []
138
+ loops = g.out.ufunc_inner_loop
139
+ scalar_tensor_idx_lookup = {
140
+ UfuncKey.CUDAFunctorOnSelf: 1,
141
+ UfuncKey.CUDAFunctorOnOther: 0,
142
+ UfuncKey.CUDAFunctor: None,
143
+ }
144
+ if eligible_for_binary_scalar_specialization(g):
145
+ keys = [
146
+ UfuncKey.CUDAFunctorOnSelf,
147
+ UfuncKey.CUDAFunctorOnOther,
148
+ UfuncKey.CUDAFunctor,
149
+ ]
150
+ else:
151
+ keys = [UfuncKey.CUDAFunctor]
152
+ for k in [UfuncKey.CUDAFunctorOnSelf, UfuncKey.CUDAFunctorOnOther]:
153
+ assert k not in loops, f"cannot use {k} on non-binary function"
154
+ for k in keys:
155
+ # If the key was directly defined, skip functor codegen; we assume the
156
+ # user already done it for us
157
+ if k in loops:
158
+ ufunctor_sig = UfunctorSignature(
159
+ g, scalar_tensor_idx=scalar_tensor_idx_lookup[k], name=loops[k].name
160
+ )
161
+ for dtype in loops[k].supported_dtypes:
162
+ ufunctor_sigs.setdefault(dtype, {})[k] = ufunctor_sig
163
+ continue
164
+
165
+ # Note [ScalarOnly and Generic must match names for CUDA]
166
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
167
+ # Otherwise, look in ANY of the generic entries. For simplicity of
168
+ # codegen, both ScalarOnly and Generic are defined, the ufunc name
169
+ # must match (if they didn't match, we'd have to generate distinct
170
+ # functors per dtype, which is awful, so we're not going to do it unless
171
+ # someone really forces us to)
172
+ ufunc_name = None
173
+ supported_dtypes: OrderedSet[ScalarType] = OrderedSet()
174
+ for lk in [UfuncKey.ScalarOnly, UfuncKey.Generic]:
175
+ if lk not in loops:
176
+ continue
177
+ if ufunc_name is None:
178
+ ufunc_name = loops[lk].name
179
+ else:
180
+ # See Note [ScalarOnly and Generic must match names for CUDA]
181
+ assert (
182
+ ufunc_name == loops[lk].name
183
+ ), "ScalarOnly and Generic must have same ufunc name"
184
+ supported_dtypes |= loops[lk].supported_dtypes
185
+ assert ufunc_name is not None
186
+
187
+ name = f"{k}_{ufunc_name}"
188
+ ufunctor_sig = UfunctorSignature(
189
+ g, scalar_tensor_idx=scalar_tensor_idx_lookup[k], name=name
190
+ )
191
+ for dtype in supported_dtypes:
192
+ ufunctor_sigs.setdefault(dtype, {})[k] = ufunctor_sig
193
+
194
+ ufunc_sig = UfuncSignature(
195
+ g, name=f"ufunc::{ufunc_name}", compute_t=BaseCType(opmath_t)
196
+ )
197
+ apply_ctx = ufunctor_sig.fields() + ufunctor_sig.arguments().apply
198
+ ufunctors.append(
199
+ f"""
200
+ template <typename scalar_t>
201
+ struct {ufunctor_sig.name} {{
202
+ using opmath_t = at::opmath_type<scalar_t>;
203
+ {ufunctor_sig.decl_fields()}
204
+ {ufunctor_sig.inline_defn_ctor()}
205
+ __device__ {ufunctor_sig.decl_apply()} {{
206
+ return {ufunc_sig.call(apply_ctx)};
207
+ }}
208
+ }};
209
+ """
210
+ )
211
+
212
+ return ufunctor_sigs, "\n".join(ufunctors)
213
+
214
+
215
+ @dataclass(frozen=True)
216
+ class BinaryScalarSpecializationConfig:
217
+ scalar_idx: int
218
+ ctor_tensor: str
219
+ ufunc_key: UfuncKey
220
+
221
+
222
+ BinaryScalarSpecializationConfigs = [
223
+ BinaryScalarSpecializationConfig(
224
+ scalar_idx=0,
225
+ ctor_tensor="self",
226
+ ufunc_key=UfuncKey.CUDAFunctorOnOther,
227
+ ),
228
+ BinaryScalarSpecializationConfig(
229
+ scalar_idx=1,
230
+ ctor_tensor="other",
231
+ ufunc_key=UfuncKey.CUDAFunctorOnSelf,
232
+ ),
233
+ ]
234
+
235
+
236
+ def compute_ufunc_cuda_dtype_body(
237
+ g: NativeFunctionsGroup,
238
+ dtype: ScalarType,
239
+ inner_loops: Dict[UfuncKey, UfunctorSignature],
240
+ parent_ctx: Sequence[Binding],
241
+ ) -> str:
242
+ body = "using opmath_t = at::opmath_type<scalar_t>;"
243
+ body += "if (false) {}\n" # for ease of codegen
244
+ for config in BinaryScalarSpecializationConfigs:
245
+ if config.ufunc_key not in inner_loops:
246
+ continue
247
+ ufunctor_sig = inner_loops[config.ufunc_key]
248
+ scalar_idx = config.scalar_idx + 1
249
+ # Make a copy and at the same time widen the type (not permissible
250
+ # without copy; we don't want to mutate the input argument anyway)
251
+ ctx: List[Union[Expr, Binding]] = list(parent_ctx)
252
+ ctx.append(
253
+ Expr(
254
+ expr=f"iter.scalar_value<opmath_t>({scalar_idx})",
255
+ type=NamedCType(config.ctor_tensor, BaseCType(opmath_t)),
256
+ )
257
+ )
258
+ ufunctor_ctor_exprs_str = ", ".join(
259
+ a.expr for a in translate(ctx, ufunctor_sig.arguments().ctor)
260
+ )
261
+
262
+ # NB: ufunctor must be allocated before iter.remove_operand is called,
263
+ # as it relies on iter
264
+ body += f"""\
265
+ else if (iter.is_cpu_scalar({scalar_idx})) {{
266
+ {ufunctor_sig.name}<scalar_t> ufunctor({ufunctor_ctor_exprs_str});
267
+ iter.remove_operand({scalar_idx});
268
+ gpu_kernel(iter, ufunctor);
269
+ }}"""
270
+
271
+ ufunctor_sig = inner_loops[UfuncKey.CUDAFunctor]
272
+ ufunctor_ctor_exprs_str = ", ".join(
273
+ a.expr for a in translate(parent_ctx, ufunctor_sig.arguments().ctor)
274
+ )
275
+ body += f"""
276
+ else {{
277
+ gpu_kernel(iter, {ufunctor_sig.name}<scalar_t>({ufunctor_ctor_exprs_str}));
278
+ }}
279
+ """
280
+ return body
281
+
282
+
283
+ @with_native_function
284
+ def compute_ufunc_cuda(g: NativeFunctionsGroup) -> str:
285
+ # First, build the functors, indexing them by dtype
286
+ ufunctor_sigs, ufunctors = compute_ufunc_cuda_functors(g)
287
+
288
+ # Next, build the conditionals
289
+ sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.CUDA))
290
+ dtype_cases = []
291
+ for dtype, inner_ufunc_sigs in ufunctor_sigs.items():
292
+ dtype_cases.append(
293
+ f"""
294
+ AT_DISPATCH_CASE(at::ScalarType::{dtype},
295
+ [&]() {{
296
+ {compute_ufunc_cuda_dtype_body(g, dtype, inner_ufunc_sigs, sig.arguments())}
297
+ }}
298
+ )
299
+ """
300
+ )
301
+
302
+ dtype_cases_str = "\n".join(dtype_cases)
303
+
304
+ stub_sig = StubSignature(g)
305
+
306
+ return f"""
307
+ {ufunctors}
308
+
309
+ {stub_sig.type_defn()};
310
+ {stub_sig.dispatch_decl()};
311
+
312
+ {stub_sig.kernel_defn()} {{
313
+ AT_DISPATCH_SWITCH(iter.common_dtype(), "{sig.name}",
314
+ {dtype_cases_str}
315
+ );
316
+ }}
317
+ REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name});
318
+
319
+ {sig.defn()} {{
320
+ {stub_sig.direct_call(sig.arguments())};
321
+ }}
322
+ """
323
+
324
+
325
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
326
+ #
327
+ # CPU STUFF
328
+ #
329
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
330
+
331
+
332
+ @dataclass(frozen=True)
333
+ class StubSignature:
334
+ g: NativeFunctionsGroup
335
+
336
+ @property
337
+ def name(self) -> str:
338
+ return f"{str(self.g.functional.func.name.name)}_stub"
339
+
340
+ @property
341
+ def kernel_name(self) -> str:
342
+ return f"{str(self.g.functional.func.name.name)}_kernel"
343
+
344
+ @property
345
+ def type_name(self) -> str:
346
+ return f"{str(self.g.functional.func.name.name)}_fn"
347
+
348
+ def arguments(self) -> List[Binding]:
349
+ return ufunc.stub_arguments(self.g)
350
+
351
+ def type(self) -> str:
352
+ cpp_args = self.arguments()
353
+ return f"void(*)(TensorIteratorBase&, {', '.join(a.type for a in cpp_args)})"
354
+
355
+ def dispatch_decl(self) -> str:
356
+ return f"DECLARE_DISPATCH({self.type_name}, {self.name})"
357
+
358
+ def dispatch_defn(self) -> str:
359
+ return f"DEFINE_DISPATCH({self.name})"
360
+
361
+ def kernel_defn(self) -> str:
362
+ return f"void {self.kernel_name}(TensorIteratorBase& iter, {', '.join(a.defn() for a in self.arguments())})"
363
+
364
+ def type_defn(self) -> str:
365
+ return f"using {self.type_name} = {self.type()}"
366
+
367
+ # must be called from context where this is TensorIteratorBase*
368
+ def call(self, ctx: Sequence[Binding]) -> str:
369
+ return f"{self.name}(device_type(), *this, {', '.join(a.expr for a in translate(ctx, self.arguments()))})"
370
+
371
+ # used in CUDA to skip the unnecessary dynamic dispatch
372
+ def direct_call(self, ctx: Sequence[Binding]) -> str:
373
+ return f"{self.kernel_name}(*this, {', '.join(a.expr for a in translate(ctx, self.arguments()))})"
374
+
375
+
376
+ @with_native_function
377
+ def compute_ufunc_cpu(g: NativeFunctionsGroup) -> str:
378
+ stub_sig = StubSignature(g)
379
+ sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.CPU))
380
+
381
+ return f"""
382
+ {stub_sig.type_defn()};
383
+ {stub_sig.dispatch_decl()};
384
+ {stub_sig.dispatch_defn()};
385
+
386
+ {sig.defn()} {{
387
+ {stub_sig.call(sig.arguments())};
388
+ }}
389
+ """
390
+
391
+
392
+ def compute_ufunc_cpu_dtype_body(
393
+ g: NativeFunctionsGroup,
394
+ dtype: ScalarType,
395
+ inner_loops: Dict[UfuncKey, UfuncSignature],
396
+ parent_ctx: Sequence[Binding],
397
+ ) -> str:
398
+ assert UfuncKey.CPUScalar in inner_loops, f"{dtype}, {inner_loops.keys()}"
399
+ assert inner_loops.keys() <= {UfuncKey.CPUScalar, UfuncKey.CPUVector}
400
+ scalar_loop = inner_loops[UfuncKey.CPUScalar]
401
+ vec_loop = None
402
+ if UfuncKey.CPUVector in inner_loops:
403
+ vec_loop = inner_loops[UfuncKey.CPUVector]
404
+
405
+ # NB: We DON'T use translate here, because translate is
406
+ # incapable of CSE'ing the scalar accesses in case it is also
407
+ # used by Vectorized; also, the unpacking here is very simple
408
+ # and only affects Scalar; everything else is implicitly captured
409
+ # by the lambda
410
+
411
+ # Setup scalar in scope
412
+ body = []
413
+ ctx = []
414
+ for b in parent_ctx:
415
+ if isinstance(b.argument, Argument) and b.argument.type != BaseType(
416
+ BaseTy.Scalar
417
+ ):
418
+ continue
419
+ body.append(f"auto _s_{b.name} = {b.name}.to<scalar_t>();")
420
+ ctx.append(Expr(f"_s_{b.name}", NamedCType(b.nctype.name, BaseCType(scalar_t))))
421
+ if vec_loop is not None:
422
+ for b in parent_ctx:
423
+ if isinstance(b.argument, Argument) and b.argument.type != BaseType(
424
+ BaseTy.Scalar
425
+ ):
426
+ continue
427
+ body.append(
428
+ f"auto _v_{b.name} = at::vec::Vectorized<scalar_t>(_s_{b.name});"
429
+ )
430
+ ctx.append(
431
+ Expr(
432
+ f"_v_{b.name}",
433
+ NamedCType(b.nctype.name, VectorizedCType(BaseCType(scalar_t))),
434
+ )
435
+ )
436
+
437
+ # Setup lambda signature
438
+ # NB: simplified version of ufunctor_arguments
439
+ scalar_bindings = []
440
+ vec_bindings = []
441
+ for a in g.functional.func.arguments.flat_non_out:
442
+ if not a.type.is_tensor_like():
443
+ continue
444
+ assert a.type == BaseType(BaseTy.Tensor)
445
+ scalar_bindings.append(
446
+ Binding(
447
+ name=a.name,
448
+ nctype=NamedCType(a.name, BaseCType(scalar_t)),
449
+ argument=a,
450
+ )
451
+ )
452
+ if vec_loop is not None:
453
+ vec_bindings.append(
454
+ Binding(
455
+ name=a.name,
456
+ nctype=NamedCType(a.name, VectorizedCType(BaseCType(scalar_t))),
457
+ argument=a,
458
+ )
459
+ )
460
+
461
+ def with_ctx(b: Sequence[Binding]) -> List[Union[Expr, Binding]]:
462
+ r: List[Union[Expr, Binding]] = []
463
+ r.extend(ctx)
464
+ r.extend(b)
465
+ return r
466
+
467
+ body_str = "\n".join(body)
468
+ if vec_loop is not None:
469
+ return f"""
470
+ {body_str}
471
+ cpu_kernel_vec(iter,
472
+ [=]({', '.join(b.decl() for b in scalar_bindings)}) {{ return {scalar_loop.call(with_ctx(scalar_bindings))}; }},
473
+ [=]({', '.join(b.decl() for b in vec_bindings)}) {{ return {vec_loop.call(with_ctx(vec_bindings))}; }}
474
+ );
475
+ """
476
+ else:
477
+ return f"""
478
+ {body_str}
479
+ cpu_kernel(iter,
480
+ [=]({', '.join(b.decl() for b in scalar_bindings)}) {{ return {scalar_loop.call(with_ctx(scalar_bindings))}; }}
481
+ );
482
+ """
483
+
484
+
485
+ @with_native_function
486
+ def compute_ufunc_cpu_kernel(g: NativeFunctionsGroup) -> str:
487
+ stub_sig = StubSignature(g)
488
+
489
+ # Reindex the ufunc by dtypes; processing generic/scalaronly as well
490
+ loops = g.out.ufunc_inner_loop
491
+ ufunc_sigs: Dict[ScalarType, Dict[UfuncKey, UfuncSignature]] = {}
492
+ for k in [UfuncKey.CPUScalar, UfuncKey.CPUVector]:
493
+ lks = []
494
+ # ORDER MATTERS: this specifies overriding precedence
495
+ if k in loops: # should happen rarely
496
+ lks.append(k)
497
+ if UfuncKey.ScalarOnly in loops and k is UfuncKey.CPUScalar:
498
+ lks.append(UfuncKey.ScalarOnly)
499
+ if UfuncKey.Generic in loops:
500
+ lks.append(UfuncKey.Generic)
501
+ # TODO: don't hardcode ufunc:: namespace here, should be centralized smh
502
+ for lk in lks:
503
+ for dtype in loops[lk].supported_dtypes:
504
+ compute_t: CType
505
+ if k is UfuncKey.CPUScalar:
506
+ compute_t = BaseCType(scalar_t)
507
+ elif k is UfuncKey.CPUVector:
508
+ compute_t = VectorizedCType(BaseCType(scalar_t))
509
+ else:
510
+ raise AssertionError()
511
+ inner_ufunc_sigs = ufunc_sigs.setdefault(dtype, {})
512
+ if k not in inner_ufunc_sigs:
513
+ inner_ufunc_sigs[k] = UfuncSignature(
514
+ g, name=f"ufunc::{loops[lk].name}", compute_t=compute_t
515
+ )
516
+
517
+ # Build the conditionals
518
+ dtype_cases = []
519
+ for dtype, inner_ufunc_sigs in ufunc_sigs.items():
520
+ dtype_cases.append(
521
+ f"""
522
+ AT_DISPATCH_CASE(at::ScalarType::{dtype},
523
+ [&]() {{
524
+ {compute_ufunc_cpu_dtype_body(g, dtype, inner_ufunc_sigs, stub_sig.arguments())}
525
+ }}
526
+ )
527
+ """
528
+ )
529
+
530
+ dtype_cases_str = "\n".join(dtype_cases)
531
+ return f"""
532
+ namespace {{
533
+
534
+ {stub_sig.kernel_defn()} {{
535
+ AT_DISPATCH_SWITCH(iter.common_dtype(), "{stub_sig.name}",
536
+ {dtype_cases_str}
537
+ );
538
+ }}
539
+
540
+ }} // anonymous namespace
541
+
542
+ {stub_sig.type_defn()};
543
+ {stub_sig.dispatch_decl()};
544
+ REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name});
545
+ """
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (220 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/__pycache__/parse.cpython-311.pyc ADDED
Binary file (7.11 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/executorch/api/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (224 Bytes). View file