diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d3d1e7b3d74b9917ce1864a3ccead9f9872f588
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71824cc2e8d5612f0eb2937d4accf83c862a5a90
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8846ae14b6651773d26126e576c7693edf8dc92
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ff08c0f508ad7077eb6ed1990898840c952b3a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from typing import Any
+
+
+class Timeout(TimeoutError):  # noqa: N818
+    """Raised when the lock could not be acquired in *timeout* seconds."""
+
+    def __init__(self, lock_file: str) -> None:
+        super().__init__()
+        self._lock_file = lock_file
+
+    def __reduce__(self) -> str | tuple[Any, ...]:
+        return self.__class__, (self._lock_file,)  # Properly pickle the exception
+
+    def __str__(self) -> str:
+        return f"The file lock '{self._lock_file}' could not be acquired."
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.lock_file!r})"
+
+    @property
+    def lock_file(self) -> str:
+        """:return: The path of the file lock."""
+        return self._lock_file
+
+
+__all__ = [
+    "Timeout",
+]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py
new file mode 100644
index 0000000000000000000000000000000000000000..28c67f74cc82b8f55e47afd6a71972cc1fb95eb6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import os
+import sys
+from contextlib import suppress
+from errno import EACCES, EEXIST
+from pathlib import Path
+
+from ._api import BaseFileLock
+from ._util import ensure_directory_exists, raise_on_not_writable_file
+
+
+class SoftFileLock(BaseFileLock):
+    """Simply watches the existence of the lock file."""
+
+    def _acquire(self) -> None:
+        raise_on_not_writable_file(self.lock_file)
+        ensure_directory_exists(self.lock_file)
+        # first check for exists and read-only mode as the open will mask this case as EEXIST
+        flags = (
+            os.O_WRONLY  # open for writing only
+            | os.O_CREAT
+            | os.O_EXCL  # together with above raise EEXIST if the file specified by filename exists
+            | os.O_TRUNC  # truncate the file to zero byte
+        )
+        try:
+            file_handler = os.open(self.lock_file, flags, self._context.mode)
+        except OSError as exception:  # re-raise unless expected exception
+            if not (
+                exception.errno == EEXIST  # lock already exist
+                or (exception.errno == EACCES and sys.platform == "win32")  # has no access to this lock
+            ):  # pragma: win32 no cover
+                raise
+        else:
+            self._context.lock_file_fd = file_handler
+
+    def _release(self) -> None:
+        assert self._context.lock_file_fd is not None  # noqa: S101
+        os.close(self._context.lock_file_fd)  # the lock file is definitely not None
+        self._context.lock_file_fd = None
+        with suppress(OSError):  # the file is already deleted and that's what we want
+            Path(self.lock_file).unlink()
+
+
+__all__ = [
+    "SoftFileLock",
+]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc9fc1550b3b64cc4ff85291e33b6cb0a745af97
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py
@@ -0,0 +1,16 @@
+# file generated by setuptools_scm
+# don't change, don't track in version control
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '3.13.1'
+__version_tuple__ = version_tuple = (3, 13, 1)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2270130fdf3c2a11c116488da9c93859844df79
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2dca1351ab88b1aec13d16c1b9620b53e30f773
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..505a4694d4a018f04bf02cba50e7439b32bc64bb
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..565cde39ec49ebce7deb8a65ae1005bd9d862fb3
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3711e6bdfacb381a0cd04b6aaa854f565acf272
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36df0352ed9ee1ea5c902438ab75b2a3b640106b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py
new file mode 100644
index 0000000000000000000000000000000000000000..851ab81ee581e74cac41c64c83ef0af75826d6b0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py
@@ -0,0 +1,587 @@
+from hashlib import md5
+from itertools import product
+
+import pytest
+
+from fsspec.implementations.local import make_path_posix
+from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
+
+
+class AbstractGetTests:
+    def test_get_file_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1a
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+        assert local_fs.isdir(target)
+
+        target_file2 = local_join(target, "file2")
+        target_subfile1 = local_join(target, "subfile1")
+
+        # Copy from source directory
+        fs.get(fs_join(source, "file2"), target)
+        assert local_fs.isfile(target_file2)
+
+        # Copy from sub directory
+        fs.get(fs_join(source, "subdir", "subfile1"), target)
+        assert local_fs.isfile(target_subfile1)
+
+        # Remove copied files
+        local_fs.rm([target_file2, target_subfile1])
+        assert not local_fs.exists(target_file2)
+        assert not local_fs.exists(target_subfile1)
+
+        # Repeat with trailing slash on target
+        fs.get(fs_join(source, "file2"), target + "/")
+        assert local_fs.isdir(target)
+        assert local_fs.isfile(target_file2)
+
+        fs.get(fs_join(source, "subdir", "subfile1"), target + "/")
+        assert local_fs.isfile(target_subfile1)
+
+    def test_get_file_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1b
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        fs.get(
+            fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/")
+        )  # Note trailing slash
+
+        assert local_fs.isdir(target)
+        assert local_fs.isdir(local_join(target, "newdir"))
+        assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+
+    def test_get_file_to_file_in_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1c
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile"))
+        assert local_fs.isfile(local_join(target, "newfile"))
+
+    def test_get_file_to_file_in_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1d
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        fs.get(
+            fs_join(source, "subdir", "subfile1"),
+            local_join(target, "newdir", "newfile"),
+        )
+        assert local_fs.isdir(local_join(target, "newdir"))
+        assert local_fs.isfile(local_join(target, "newdir", "newfile"))
+
+    def test_get_directory_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1e
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+        assert local_fs.isdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")
+            if source_slash:
+                s += "/"
+            t = target + "/" if target_slash else target
+
+            # Without recursive does nothing
+            fs.get(s, t)
+            assert local_fs.ls(target) == []
+
+            # With recursive
+            fs.get(s, t, recursive=True)
+            if source_slash:
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert local_fs.isdir(local_join(target, "nesteddir"))
+                assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                        local_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert local_fs.isdir(local_join(target, "subdir"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
+                assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
+                assert local_fs.isfile(
+                    local_join(target, "subdir", "nesteddir", "nestedfile")
+                )
+
+                local_fs.rm(local_join(target, "subdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+            # Limit recursive by maxdepth
+            fs.get(s, t, recursive=True, maxdepth=1)
+            if source_slash:
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert not local_fs.exists(local_join(target, "nesteddir"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert local_fs.isdir(local_join(target, "subdir"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
+                assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))
+
+                local_fs.rm(local_join(target, "subdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+    def test_get_directory_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1f
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")
+            if source_slash:
+                s += "/"
+            t = local_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive does nothing
+            fs.get(s, t)
+            assert local_fs.ls(target) == []
+
+            # With recursive
+            fs.get(s, t, recursive=True)
+            assert local_fs.isdir(local_join(target, "newdir"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+            assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
+            assert local_fs.isfile(
+                local_join(target, "newdir", "nesteddir", "nestedfile")
+            )
+            assert not local_fs.exists(local_join(target, "subdir"))
+
+            local_fs.rm(local_join(target, "newdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+            # Limit recursive by maxdepth
+            fs.get(s, t, recursive=True, maxdepth=1)
+            assert local_fs.isdir(local_join(target, "newdir"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+            assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
+            assert not local_fs.exists(local_join(target, "subdir"))
+
+            local_fs.rm(local_join(target, "newdir"), recursive=True)
+            assert not local_fs.exists(local_join(target, "newdir"))
+
+    def test_get_glob_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1g
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            # Without recursive
+            fs.get(fs_join(source, "subdir", "*"), t)
+            assert local_fs.isfile(local_join(target, "subfile1"))
+            assert local_fs.isfile(local_join(target, "subfile2"))
+            assert not local_fs.isdir(local_join(target, "nesteddir"))
+            assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
+            assert not local_fs.exists(local_join(target, "subdir"))
+
+            local_fs.rm(
+                [
+                    local_join(target, "subfile1"),
+                    local_join(target, "subfile2"),
+                ],
+                recursive=True,
+            )
+            assert local_fs.ls(target) == []
+
+            # With recursive
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert local_fs.isdir(local_join(target, "nesteddir"))
+                assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                        local_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+                assert local_fs.ls(target) == []
+
+                # Limit recursive by maxdepth
+                fs.get(
+                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
+                )
+                assert local_fs.isfile(local_join(target, "subfile1"))
+                assert local_fs.isfile(local_join(target, "subfile2"))
+                assert not local_fs.exists(local_join(target, "nesteddir"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+
+                local_fs.rm(
+                    [
+                        local_join(target, "subfile1"),
+                        local_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+                assert local_fs.ls(target) == []
+
+    def test_get_glob_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1h
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive
+            fs.get(fs_join(source, "subdir", "*"), t)
+            assert local_fs.isdir(local_join(target, "newdir"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+            assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
+            assert not local_fs.exists(
+                local_join(target, "newdir", "nesteddir", "nestedfile")
+            )
+            assert not local_fs.exists(local_join(target, "subdir"))
+            assert not local_fs.exists(local_join(target, "newdir", "subdir"))
+
+            local_fs.rm(local_join(target, "newdir"), recursive=True)
+            assert local_fs.ls(target) == []
+
+            # With recursive
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
+                assert local_fs.isdir(local_join(target, "newdir"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+                assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
+                assert local_fs.isfile(
+                    local_join(target, "newdir", "nesteddir", "nestedfile")
+                )
+                assert not local_fs.exists(local_join(target, "subdir"))
+                assert not local_fs.exists(local_join(target, "newdir", "subdir"))
+
+                local_fs.rm(local_join(target, "newdir"), recursive=True)
+                assert not local_fs.exists(local_join(target, "newdir"))
+
+                # Limit recursive by maxdepth
+                fs.get(
+                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
+                )
+                assert local_fs.isdir(local_join(target, "newdir"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+                assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
+                assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
+                assert not local_fs.exists(local_join(target, "subdir"))
+                assert not local_fs.exists(local_join(target, "newdir", "subdir"))
+
+                local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
+                assert not local_fs.exists(local_join(target, "newdir"))
+
+    @pytest.mark.parametrize(
+        GLOB_EDGE_CASES_TESTS["argnames"],
+        GLOB_EDGE_CASES_TESTS["argvalues"],
+    )
+    def test_get_glob_edge_cases(
+        self,
+        path,
+        recursive,
+        maxdepth,
+        expected,
+        fs,
+        fs_join,
+        fs_glob_edge_cases_files,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 1g
+        source = fs_glob_edge_cases_files
+
+        target = local_target
+
+        for new_dir, target_slash in product([True, False], [True, False]):
+            local_fs.mkdir(target)
+
+            t = local_join(target, "newdir") if new_dir else target
+            t = t + "/" if target_slash else t
+
+            fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
+
+            output = local_fs.find(target)
+            if new_dir:
+                prefixed_expected = [
+                    make_path_posix(local_join(target, "newdir", p)) for p in expected
+                ]
+            else:
+                prefixed_expected = [
+                    make_path_posix(local_join(target, p)) for p in expected
+                ]
+            assert sorted(output) == sorted(prefixed_expected)
+
+            try:
+                local_fs.rm(target, recursive=True)
+            except FileNotFoundError:
+                pass
+
+    def test_get_list_of_files_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 2a
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        source_files = [
+            fs_join(source, "file1"),
+            fs_join(source, "file2"),
+            fs_join(source, "subdir", "subfile1"),
+        ]
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            fs.get(source_files, t)
+            assert local_fs.isfile(local_join(target, "file1"))
+            assert local_fs.isfile(local_join(target, "file2"))
+            assert local_fs.isfile(local_join(target, "subfile1"))
+
+            local_fs.rm(
+                [
+                    local_join(target, "file1"),
+                    local_join(target, "file2"),
+                    local_join(target, "subfile1"),
+                ],
+                recursive=True,
+            )
+            assert local_fs.ls(target) == []
+
+    def test_get_list_of_files_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_bulk_operations_scenario_0,
+        local_fs,
+        local_join,
+        local_target,
+    ):
+        # Copy scenario 2b
+        source = fs_bulk_operations_scenario_0
+
+        target = local_target
+        local_fs.mkdir(target)
+
+        source_files = [
+            fs_join(source, "file1"),
+            fs_join(source, "file2"),
+            fs_join(source, "subdir", "subfile1"),
+        ]
+
+        fs.get(source_files, local_join(target, "newdir") + "/")  # Note trailing slash
+        assert local_fs.isdir(local_join(target, "newdir"))
+        assert local_fs.isfile(local_join(target, "newdir", "file1"))
+        assert local_fs.isfile(local_join(target, "newdir", "file2"))
+        assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
+
+    def test_get_directory_recursive(
+        self, fs, fs_join, fs_path, local_fs, local_join, local_target
+    ):
+        # https://github.com/fsspec/filesystem_spec/issues/1062
+        # Recursive cp/get/put of source directory into non-existent target directory.
+        src = fs_join(fs_path, "src")
+        src_file = fs_join(src, "file")
+        fs.mkdir(src)
+        fs.touch(src_file)
+
+        target = local_target
+
+        # get without slash
+        assert not local_fs.exists(target)
+        for loop in range(2):
+            fs.get(src, target, recursive=True)
+            assert local_fs.isdir(target)
+
+            if loop == 0:
+                assert local_fs.isfile(local_join(target, "file"))
+                assert not local_fs.exists(local_join(target, "src"))
+            else:
+                assert local_fs.isfile(local_join(target, "file"))
+                assert local_fs.isdir(local_join(target, "src"))
+                assert local_fs.isfile(local_join(target, "src", "file"))
+
+        local_fs.rm(target, recursive=True)
+
+        # get with slash
+        assert not local_fs.exists(target)
+        for loop in range(2):
+            fs.get(src + "/", target, recursive=True)
+            assert local_fs.isdir(target)
+            assert local_fs.isfile(local_join(target, "file"))
+            assert not local_fs.exists(local_join(target, "src"))
+
+    def test_get_directory_without_files_with_same_name_prefix(
+        self,
+        fs,
+        fs_join,
+        local_fs,
+        local_join,
+        local_target,
+        fs_dir_and_file_with_same_name_prefix,
+    ):
+        # Create the test dirs
+        source = fs_dir_and_file_with_same_name_prefix
+        target = local_target
+
+        # Test without glob
+        fs.get(fs_join(source, "subdir"), target, recursive=True)
+
+        assert local_fs.isfile(local_join(target, "subfile.txt"))
+        assert not local_fs.isfile(local_join(target, "subdir.txt"))
+
+        local_fs.rm([local_join(target, "subfile.txt")])
+        assert local_fs.ls(target) == []
+
+        # Test with glob
+        fs.get(fs_join(source, "subdir*"), target, recursive=True)
+
+        assert local_fs.isdir(local_join(target, "subdir"))
+        assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
+        assert local_fs.isfile(local_join(target, "subdir.txt"))
+
+    def test_get_with_source_and_destination_as_list(
+        self,
+        fs,
+        fs_join,
+        local_fs,
+        local_join,
+        local_target,
+        fs_10_files_with_hashed_names,
+    ):
+        # Create the test dir
+        source = fs_10_files_with_hashed_names
+        target = local_target
+
+        # Create list of files for source and destination
+        source_files = []
+        destination_files = []
+        for i in range(10):
+            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
+            source_files.append(fs_join(source, f"{hashed_i}.txt"))
+            destination_files.append(
+                make_path_posix(local_join(target, f"{hashed_i}.txt"))
+            )
+
+        # Copy and assert order was kept
+        fs.get(rpath=source_files, lpath=destination_files)
+
+        for i in range(10):
+            file_content = local_fs.cat(destination_files[i]).decode("utf-8")
+            assert file_content == str(i)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc349977f0384d9fc86126498be5c6ad99a21d3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py
@@ -0,0 +1,591 @@
+from hashlib import md5
+from itertools import product
+
+import pytest
+
+from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
+
+
+class AbstractPutTests:
+    def test_put_file_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1a
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            fs.touch(fs_join(target, "dummy"))
+        assert fs.isdir(target)
+
+        target_file2 = fs_join(target, "file2")
+        target_subfile1 = fs_join(target, "subfile1")
+
+        # Copy from source directory
+        fs.put(local_join(source, "file2"), target)
+        assert fs.isfile(target_file2)
+
+        # Copy from sub directory
+        fs.put(local_join(source, "subdir", "subfile1"), target)
+        assert fs.isfile(target_subfile1)
+
+        # Remove copied files
+        fs.rm([target_file2, target_subfile1])
+        assert not fs.exists(target_file2)
+        assert not fs.exists(target_subfile1)
+
+        # Repeat with trailing slash on target
+        fs.put(local_join(source, "file2"), target + "/")
+        assert fs.isdir(target)
+        assert fs.isfile(target_file2)
+
+        fs.put(local_join(source, "subdir", "subfile1"), target + "/")
+        assert fs.isfile(target_subfile1)
+
+    def test_put_file_to_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 1b
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        fs.put(
+            local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
+        )  # Note trailing slash
+        assert fs.isdir(target)
+        assert fs.isdir(fs_join(target, "newdir"))
+        assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+
+    def test_put_file_to_file_in_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        supports_empty_directories,
+        local_bulk_operations_scenario_0,
+    ):
+        # Copy scenario 1c
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            fs.touch(fs_join(target, "dummy"))
+        assert fs.isdir(target)
+
+        fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
+        assert fs.isfile(fs_join(target, "newfile"))
+
+    def test_put_file_to_file_in_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 1d
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        fs.put(
+            local_join(source, "subdir", "subfile1"),
+            fs_join(target, "newdir", "newfile"),
+        )
+        assert fs.isdir(fs_join(target, "newdir"))
+        assert fs.isfile(fs_join(target, "newdir", "newfile"))
+
+    def test_put_directory_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1e
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")
+            if source_slash:
+                s += "/"
+            t = target + "/" if target_slash else target
+
+            # Without recursive does nothing
+            fs.put(s, t)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # With recursive
+            fs.put(s, t, recursive=True)
+            if source_slash:
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert fs.isdir(fs_join(target, "nesteddir"))
+                assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                        fs_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert fs.isdir(fs_join(target, "subdir"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile2"))
+                assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
+                assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
+
+                fs.rm(fs_join(target, "subdir"), recursive=True)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # Limit recursive by maxdepth
+            fs.put(s, t, recursive=True, maxdepth=1)
+            if source_slash:
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert not fs.exists(fs_join(target, "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+            else:
+                assert fs.isdir(fs_join(target, "subdir"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "subdir", "subfile2"))
+                assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
+
+                fs.rm(fs_join(target, "subdir"), recursive=True)
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+    def test_put_directory_to_new_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        # Copy scenario 1f
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        for source_slash, target_slash in zip([False, True], [False, True]):
+            s = fs_join(source, "subdir")
+            if source_slash:
+                s += "/"
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive does nothing
+            fs.put(s, t)
+            if supports_empty_directories:
+                assert fs.ls(target) == []
+            else:
+                with pytest.raises(FileNotFoundError):
+                    fs.ls(target)
+
+            # With recursive
+            fs.put(s, t, recursive=True)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
+            assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+            # Limit recursive by maxdepth
+            fs.put(s, t, recursive=True, maxdepth=1)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+    def test_put_glob_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        supports_empty_directories,
+        local_bulk_operations_scenario_0,
+    ):
+        # Copy scenario 1g
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            # Without recursive
+            fs.put(local_join(source, "subdir", "*"), t)
+            assert fs.isfile(fs_join(target, "subfile1"))
+            assert fs.isfile(fs_join(target, "subfile2"))
+            assert not fs.isdir(fs_join(target, "nesteddir"))
+            assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+
+            fs.rm(
+                [
+                    fs_join(target, "subfile1"),
+                    fs_join(target, "subfile2"),
+                ],
+                recursive=True,
+            )
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+            # With recursive
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert fs.isdir(fs_join(target, "nesteddir"))
+                assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                        fs_join(target, "nesteddir"),
+                    ],
+                    recursive=True,
+                )
+                assert fs.ls(target, detail=False) == (
+                    [] if supports_empty_directories else [dummy]
+                )
+
+                # Limit recursive by maxdepth
+                fs.put(
+                    local_join(source, "subdir", glob),
+                    t,
+                    recursive=recursive,
+                    maxdepth=1,
+                )
+                assert fs.isfile(fs_join(target, "subfile1"))
+                assert fs.isfile(fs_join(target, "subfile2"))
+                assert not fs.exists(fs_join(target, "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+
+                fs.rm(
+                    [
+                        fs_join(target, "subfile1"),
+                        fs_join(target, "subfile2"),
+                    ],
+                    recursive=True,
+                )
+                assert fs.ls(target, detail=False) == (
+                    [] if supports_empty_directories else [dummy]
+                )
+
+    def test_put_glob_to_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 1h
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        for target_slash in [False, True]:
+            t = fs_join(target, "newdir")
+            if target_slash:
+                t += "/"
+
+            # Without recursive
+            fs.put(local_join(source, "subdir", "*"), t)
+            assert fs.isdir(fs_join(target, "newdir"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+            assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+            assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+            assert not fs.exists(fs_join(target, "subdir"))
+            assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+            fs.rm(fs_join(target, "newdir"), recursive=True)
+            assert not fs.exists(fs_join(target, "newdir"))
+
+            # With recursive
+            for glob, recursive in zip(["*", "**"], [True, False]):
+                fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
+                assert fs.isdir(fs_join(target, "newdir"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+                assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
+                assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
+                assert not fs.exists(fs_join(target, "subdir"))
+                assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+                fs.rm(fs_join(target, "newdir"), recursive=True)
+                assert not fs.exists(fs_join(target, "newdir"))
+
+                # Limit recursive by maxdepth
+                fs.put(
+                    local_join(source, "subdir", glob),
+                    t,
+                    recursive=recursive,
+                    maxdepth=1,
+                )
+                assert fs.isdir(fs_join(target, "newdir"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+                assert fs.isfile(fs_join(target, "newdir", "subfile2"))
+                assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
+                assert not fs.exists(fs_join(target, "subdir"))
+                assert not fs.exists(fs_join(target, "newdir", "subdir"))
+
+                fs.rm(fs_join(target, "newdir"), recursive=True)
+                assert not fs.exists(fs_join(target, "newdir"))
+
+    @pytest.mark.parametrize(
+        GLOB_EDGE_CASES_TESTS["argnames"],
+        GLOB_EDGE_CASES_TESTS["argvalues"],
+    )
+    def test_put_glob_edge_cases(
+        self,
+        path,
+        recursive,
+        maxdepth,
+        expected,
+        fs,
+        fs_join,
+        fs_target,
+        local_glob_edge_cases_files,
+        local_join,
+        fs_sanitize_path,
+    ):
+        # Copy scenario 1g
+        source = local_glob_edge_cases_files
+
+        target = fs_target
+
+        for new_dir, target_slash in product([True, False], [True, False]):
+            fs.mkdir(target)
+
+            t = fs_join(target, "newdir") if new_dir else target
+            t = t + "/" if target_slash else t
+
+            fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
+
+            output = fs.find(target)
+            if new_dir:
+                prefixed_expected = [
+                    fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
+                ]
+            else:
+                prefixed_expected = [
+                    fs_sanitize_path(fs_join(target, p)) for p in expected
+                ]
+            assert sorted(output) == sorted(prefixed_expected)
+
+            try:
+                fs.rm(target, recursive=True)
+            except FileNotFoundError:
+                pass
+
+    def test_put_list_of_files_to_existing_directory(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        local_bulk_operations_scenario_0,
+        supports_empty_directories,
+    ):
+        # Copy scenario 2a
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+        if not supports_empty_directories:
+            # Force target directory to exist by adding a dummy file
+            dummy = fs_join(target, "dummy")
+            fs.touch(dummy)
+        assert fs.isdir(target)
+
+        source_files = [
+            local_join(source, "file1"),
+            local_join(source, "file2"),
+            local_join(source, "subdir", "subfile1"),
+        ]
+
+        for target_slash in [False, True]:
+            t = target + "/" if target_slash else target
+
+            fs.put(source_files, t)
+            assert fs.isfile(fs_join(target, "file1"))
+            assert fs.isfile(fs_join(target, "file2"))
+            assert fs.isfile(fs_join(target, "subfile1"))
+
+            fs.rm(
+                [
+                    fs_join(target, "file1"),
+                    fs_join(target, "file2"),
+                    fs_join(target, "subfile1"),
+                ],
+                recursive=True,
+            )
+            assert fs.ls(target, detail=False) == (
+                [] if supports_empty_directories else [dummy]
+            )
+
+    def test_put_list_of_files_to_new_directory(
+        self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
+    ):
+        # Copy scenario 2b
+        source = local_bulk_operations_scenario_0
+
+        target = fs_target
+        fs.mkdir(target)
+
+        source_files = [
+            local_join(source, "file1"),
+            local_join(source, "file2"),
+            local_join(source, "subdir", "subfile1"),
+        ]
+
+        fs.put(source_files, fs_join(target, "newdir") + "/")  # Note trailing slash
+        assert fs.isdir(fs_join(target, "newdir"))
+        assert fs.isfile(fs_join(target, "newdir", "file1"))
+        assert fs.isfile(fs_join(target, "newdir", "file2"))
+        assert fs.isfile(fs_join(target, "newdir", "subfile1"))
+
+    def test_put_directory_recursive(
+        self, fs, fs_join, fs_target, local_fs, local_join, local_path
+    ):
+        # https://github.com/fsspec/filesystem_spec/issues/1062
+        # Recursive cp/get/put of source directory into non-existent target directory.
+        src = local_join(local_path, "src")
+        src_file = local_join(src, "file")
+        local_fs.mkdir(src)
+        local_fs.touch(src_file)
+
+        target = fs_target
+
+        # put without slash
+        assert not fs.exists(target)
+        for loop in range(2):
+            fs.put(src, target, recursive=True)
+            assert fs.isdir(target)
+
+            if loop == 0:
+                assert fs.isfile(fs_join(target, "file"))
+                assert not fs.exists(fs_join(target, "src"))
+            else:
+                assert fs.isfile(fs_join(target, "file"))
+                assert fs.isdir(fs_join(target, "src"))
+                assert fs.isfile(fs_join(target, "src", "file"))
+
+        fs.rm(target, recursive=True)
+
+        # put with slash
+        assert not fs.exists(target)
+        for loop in range(2):
+            fs.put(src + "/", target, recursive=True)
+            assert fs.isdir(target)
+            assert fs.isfile(fs_join(target, "file"))
+            assert not fs.exists(fs_join(target, "src"))
+
+    def test_put_directory_without_files_with_same_name_prefix(
+        self,
+        fs,
+        fs_join,
+        fs_target,
+        local_join,
+        local_dir_and_file_with_same_name_prefix,
+        supports_empty_directories,
+    ):
+        # Create the test dirs
+        source = local_dir_and_file_with_same_name_prefix
+        target = fs_target
+
+        # Test without glob
+        fs.put(local_join(source, "subdir"), fs_target, recursive=True)
+
+        assert fs.isfile(fs_join(fs_target, "subfile.txt"))
+        assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
+
+        fs.rm([fs_join(target, "subfile.txt")])
+        if supports_empty_directories:
+            assert fs.ls(target) == []
+        else:
+            assert not fs.exists(target)
+
+        # Test with glob
+        fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
+
+        assert fs.isdir(fs_join(fs_target, "subdir"))
+        assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
+        assert fs.isfile(fs_join(fs_target, "subdir.txt"))
+
+    def test_copy_with_source_and_destination_as_list(
+        self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
+    ):
+        # Create the test dir
+        source = local_10_files_with_hashed_names
+        target = fs_target
+
+        # Create list of files for source and destination
+        source_files = []
+        destination_files = []
+        for i in range(10):
+            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
+            source_files.append(local_join(source, f"{hashed_i}.txt"))
+            destination_files.append(fs_join(target, f"{hashed_i}.txt"))
+
+        # Copy and assert order was kept
+        fs.put(lpath=source_files, rpath=destination_files)
+
+        for i in range(10):
+            file_content = fs.cat(destination_files[i]).decode("utf-8")
+            assert file_content == str(i)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aff35e592d80e510b08ff18a4c3571202b653d6e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+
+from torch._functorch.deprecated import (
+    combine_state_for_ensemble,
+    functionalize,
+    grad,
+    grad_and_value,
+    hessian,
+    jacfwd,
+    jacrev,
+    jvp,
+    make_functional,
+    make_functional_with_buffers,
+    vjp,
+    vmap,
+)
+
+# utilities. Maybe these should go in their own namespace in the future?
+from torch._functorch.make_functional import (
+    FunctionalModule,
+    FunctionalModuleWithBuffers,
+)
+
+# Top-level APIs. Please think carefully before adding something to the
+# top-level namespace:
+# - private helper functions should go into torch._functorch
+# - very experimental things should go into functorch.experimental
+# - compilation related things should go into functorch.compile
+
+# Was never documented
+from torch._functorch.python_key import make_fx
+
+__version__ = torch.__version__
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..236bad2830b7beabcb777696e7e30459429613a0
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44827bc40e40795ca52391a98242140f6d4aa366
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..e24fc6142820013002f6cbc1d6f85e7e132aade8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py
@@ -0,0 +1,8 @@
+from torch import cond  # noqa: F401
+from torch._higher_order_ops.cond import UnsupportedAliasMutationException  # noqa: F401
+
+from torch._higher_order_ops.map import (  # noqa: F401
+    _stack_pytree,
+    _unstack_pytree,
+    map,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..bea7df4573aff2fa5b0d0029ce9d40a7ebe2de46
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAPROFILERTYPEDEFS_H
+#define CUDAPROFILERTYPEDEFS_H
+
+#include <cudaProfiler.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaProfiler.h
+ */
+#define PFN_cuProfilerInitialize  PFN_cuProfilerInitialize_v4000
+#define PFN_cuProfilerStart  PFN_cuProfilerStart_v4000
+#define PFN_cuProfilerStop  PFN_cuProfilerStop_v4000
+
+
+/**
+ * Type definitions for functions defined in cudaProfiler.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h
new file mode 100644
index 0000000000000000000000000000000000000000..97de57ae494d62ae176fc02ad3c0c3f4d43e1526
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAVDPAU_H
+#define CUDAVDPAU_H
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \defgroup CUDA_VDPAU VDPAU Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the VDPAU interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the CUDA device associated with a VDPAU device
+ *
+ * Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if
+ * applicable.
+ *
+ * \param pDevice           - Device associated with vdpDevice
+ * \param vdpDevice         - A VdpDevice handle
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaVDPAUGetDevice
+ */
+CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Create a CUDA context for interoperability with VDPAU
+ *
+ * Creates a new CUDA context, initializes VDPAU interoperability, and
+ * associates the CUDA context with the calling thread. It must be called
+ * before performing any other VDPAU interoperability operations. It may fail
+ * if the needed VDPAU driver facilities are not available. For usage of the
+ * \p flags parameter, see ::cuCtxCreate().
+ *
+ * \param pCtx              - Returned CUDA context
+ * \param flags             - Options for CUDA context creation
+ * \param device            - Device on which to create the context
+ * \param vdpDevice         - The VdpDevice to interop with
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice
+ */
+CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Registers a VDPAU VdpVideoSurface object
+ *
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The VdpVideoSurface is presented as an array of subresources that may be
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
+ *
+ * \htmlonly
+ * <table>
+ * <tr><th>VdpChromaType                               </th><th>arrayIndex</th><th>Size     </th><th>Format</th><th>Content            </th></tr>
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ * <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ * <tr>                                                     <td>2         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ * <tr>                                                     <td>3         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ * <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ * <tr>                                                     <td>2         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ * <tr>                                                     <td>3         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ * </table>
+ * \endhtmlonly
+ *
+ * \latexonly
+ * \begin{tabular}{|l|l|l|l|l|}
+ * \hline
+ * VdpChromaType          & arrayIndex & Size      & Format & Content             \\
+ * \hline
+ * VDP\_CHROMA\_TYPE\_420 & 0          & w x h/2   & R8     & Top-field luma      \\
+ *                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+ *                        & 2          & w/2 x h/4 & R8G8   & Top-field chroma    \\
+ *                        & 3          & w/2 x h/4 & R8G8   & Bottom-field chroma \\
+ * \hline
+ * VDP\_CHROMA\_TYPE\_422 & 0          & w x h/2   & R8     & Top-field luma      \\
+ *                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+ *                        & 2          & w/2 x h/2 & R8G8   & Top-field chroma    \\
+ *                        & 3          & w/2 x h/2 & R8G8   & Bottom-field chroma \\
+ * \hline
+ * \end{tabular}
+ * \endlatexonly
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param vdpSurface    - The VdpVideoSurface to be registered
+ * \param flags         - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice,
+ * ::cudaGraphicsVDPAURegisterVideoSurface
+ */
+CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
+
+/**
+ * \brief Registers a VDPAU VdpOutputSurface object
+ *
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The VdpOutputSurface is presented as an array of subresources that may be
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
+ *
+ * \htmlonly
+ * <table>
+ * <tr><th>VdpRGBAFormat              </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content       </th></tr>
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8   </td><td>0         </td><td>w x h</td><td>ARGB8  </td><td>Entire surface</td></tr>
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0         </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
+ * </table>
+ * \endhtmlonly
+ *
+ * \latexonly
+ * \begin{tabular}{|l|l|l|l|l|}
+ * \hline
+ * VdpRGBAFormat                  & arrayIndex & Size  & Format  & Content        \\
+ * \hline
+ * VDP\_RGBA\_FORMAT\_B8G8R8A8    & 0          & w x h & ARGB8   & Entire surface \\
+ * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0          & w x h & A2BGR10 & Entire surface \\
+ * \hline
+ * \end{tabular}
+ * \endlatexonly
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param vdpSurface    - The VdpOutputSurface to be registered
+ * \param flags         - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice,
+ * ::cudaGraphicsVDPAURegisterOutputSurface
+ */
+CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/** @} */ /* END CUDA_VDPAU */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuVDPAUCtxCreate
+
+    CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..12b74c94deef2bdea5bd14c9247814427308870b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__)
+#define __SM_20_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+#ifdef __CUDA_ARCH__
+extern "C"
+{
+extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
+}
+#endif /* __CUDA_ARCH__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_20_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac4aa9bfc6b8d5d4d240e05a2fd557889f30c47f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_20_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
+{
+  return __fAtomicAdd(address, val);
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..da1e823a24171ed1ca9414955c6c68159a4411f5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h
@@ -0,0 +1,116 @@
+/*
+
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+
+ *
+
+ * NOTICE TO LICENSEE:
+
+ *
+
+ * This source code and/or documentation ("Licensed Deliverables") are
+
+ * subject to NVIDIA intellectual property rights under U.S. and
+
+ * international Copyright laws.
+
+ *
+
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+
+ * conditions of a form of NVIDIA software license agreement by and
+
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+
+ * the contrary in the License Agreement, reproduction or disclosure
+
+ * of the Licensed Deliverables to any third party without the express
+
+ * written consent of NVIDIA is prohibited.
+
+ *
+
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+
+ * OF THESE LICENSED DELIVERABLES.
+
+ *
+
+ * U.S. Government End Users.  These Licensed Deliverables are a
+
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+
+ * 1995), consisting of "commercial computer software" and "commercial
+
+ * computer software documentation" as such terms are used in 48
+
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+
+ * U.S. Government End Users acquire the Licensed Deliverables with
+
+ * only those rights set forth herein.
+
+ *
+
+ * Any use of the Licensed Deliverables in individual and commercial
+
+ * software must include, in the user documentation and internal
+
+ * comments to the code, the above Disclaimer and U.S. Government End
+
+ * Users Notice.
+
+ */
+
+
+
+#if !defined(__SM_35_INTRINSICS_H__)
+
+#define __SM_35_INTRINSICS_H__
+
+
+
+/**********************************************************************************
+
+* All sm_35 intrinsics are supported by sm_32 so simply include its header file   *
+
+**********************************************************************************/
+
+#include "sm_32_intrinsics.h"
+
+
+
+#endif /* !__SM_35_INTRINSICS_H__ */
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab69cf38045a7c44dae67e7149d49ac4c6148747
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_HPP__)
+#define __VECTOR_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
+{
+  char1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
+{
+  uchar1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
+{
+  char2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
+{
+  uchar2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
+{
+  char3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
+{
+  uchar3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
+{
+  char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
+{
+  uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
+{
+  short1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
+{
+  ushort1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
+{
+  short2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
+{
+  ushort2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
+{ 
+  short3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
+{
+  ushort3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
+{
+  short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+  ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
+{
+  int1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
+{
+  uint1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
+{
+  int2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
+{
+  uint2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
+{
+  int3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
+{
+  uint3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
+{
+  int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
+{
+  uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
+{
+  long1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
+{
+  ulong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
+{
+  long2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
+{
+  ulong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
+{
+  long3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
+{
+  ulong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
+{
+  long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
+{
+  ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
+{
+  float1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
+{
+  float2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
+{
+  float3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
+{
+  float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
+{
+  longlong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
+{
+  ulonglong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
+{
+  longlong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
+{
+  ulonglong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
+{
+  longlong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
+{
+  ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
+{
+  longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
+{
+  ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
+{
+  double1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
+{
+  double2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
+{
+  double3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
+{
+  double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#endif /* !__VECTOR_FUNCTIONS_HPP__ */
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50238b16212e1b5a84f2de7739346a07399bc09f
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4af83dc8e1c77ff0c084e00a6d7cbb158c4a16f6
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f1d6c07ffbce6289c4dba773ee73a52bcc99059
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h
@@ -0,0 +1,540 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cudnn_adv_train : cuDNN's advanced and experimental features.
+
+*/
+
+#if !defined(CUDNN_ADV_TRAIN_H_)
+#define CUDNN_ADV_TRAIN_H_
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops_infer.h"
+#include "cudnn_ops_train.h"
+#include "cudnn_adv_infer.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_ADV_TRAIN_MAJOR 8
+#define CUDNN_ADV_TRAIN_MINOR 7
+#define CUDNN_ADV_TRAIN_PATCH 0
+
+#if (CUDNN_ADV_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_TRAIN_MINOR != CUDNN_MINOR) || \
+    (CUDNN_ADV_TRAIN_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN ADV TRAIN!!!
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef enum {
+    CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
+    CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
+} cudnnWgradMode_t;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardTraining(cudnnHandle_t handle,
+                        const cudnnRNNDescriptor_t rnnDesc,
+                        const int seqLength,
+                        const cudnnTensorDescriptor_t *xDesc,
+                        const void *x,
+                        const cudnnTensorDescriptor_t hxDesc,
+                        const void *hx,
+                        const cudnnTensorDescriptor_t cxDesc,
+                        const void *cx,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnTensorDescriptor_t *yDesc,
+                        void *y,
+                        const cudnnTensorDescriptor_t hyDesc,
+                        void *hy,
+                        const cudnnTensorDescriptor_t cyDesc,
+                        void *cy,
+                        void *workSpace,
+                        size_t workSpaceSizeInBytes,
+                        void *reserveSpace,
+                        size_t reserveSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardData(cudnnHandle_t handle,
+                     const cudnnRNNDescriptor_t rnnDesc,
+                     const int seqLength,
+                     const cudnnTensorDescriptor_t *yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t *dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dhyDesc,
+                     const void *dhy,
+                     const cudnnTensorDescriptor_t dcyDesc,
+                     const void *dcy,
+                     const cudnnFilterDescriptor_t wDesc,
+                     const void *w,
+                     const cudnnTensorDescriptor_t hxDesc,
+                     const void *hx,
+                     const cudnnTensorDescriptor_t cxDesc,
+                     const void *cx,
+                     const cudnnTensorDescriptor_t *dxDesc,
+                     void *dx,
+                     const cudnnTensorDescriptor_t dhxDesc,
+                     void *dhx,
+                     const cudnnTensorDescriptor_t dcxDesc,
+                     void *dcx,
+                     void *workSpace,
+                     size_t workSpaceSizeInBytes,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardData_v8(cudnnHandle_t handle,
+                        cudnnRNNDescriptor_t rnnDesc,
+                        const int32_t devSeqLengths[],
+                        cudnnRNNDataDescriptor_t yDesc,
+                        const void *y,
+                        const void *dy,
+                        cudnnRNNDataDescriptor_t xDesc,
+                        void *dx,
+                        cudnnTensorDescriptor_t hDesc,
+                        const void *hx,
+                        const void *dhy,
+                        void *dhx,
+                        cudnnTensorDescriptor_t cDesc,
+                        const void *cx,
+                        const void *dcy,
+                        void *dcx,
+                        size_t weightSpaceSize,
+                        const void *weightSpace,
+                        size_t workSpaceSize,
+                        void *workSpace,
+                        size_t reserveSpaceSize,
+                        void *reserveSpace);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeights(cudnnHandle_t handle,
+                        const cudnnRNNDescriptor_t rnnDesc,
+                        const int seqLength,
+                        const cudnnTensorDescriptor_t *xDesc,
+                        const void *x,
+                        const cudnnTensorDescriptor_t hxDesc,
+                        const void *hx,
+                        const cudnnTensorDescriptor_t *yDesc,
+                        const void *y,
+                        const void *workSpace,
+                        size_t workSpaceSizeInBytes,
+                        const cudnnFilterDescriptor_t dwDesc,
+                        void *dw,
+                        const void *reserveSpace,
+                        size_t reserveSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
+                           cudnnRNNDescriptor_t rnnDesc,
+                           cudnnWgradMode_t addGrad,
+                           const int32_t devSeqLengths[],
+                           cudnnRNNDataDescriptor_t xDesc,
+                           const void *x,
+                           cudnnTensorDescriptor_t hDesc,
+                           const void *hx,
+                           cudnnRNNDataDescriptor_t yDesc,
+                           const void *y,
+                           size_t weightSpaceSize,
+                           void *dweightSpace,
+                           size_t workSpaceSize,
+                           void *workSpace,
+                           size_t reserveSpaceSize,
+                           void *reserveSpace);
+
+/* RNN EX API */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardTrainingEx(cudnnHandle_t handle,
+                          const cudnnRNNDescriptor_t rnnDesc,
+                          const cudnnRNNDataDescriptor_t xDesc,
+                          const void *x,
+                          const cudnnTensorDescriptor_t hxDesc,
+                          const void *hx,
+                          const cudnnTensorDescriptor_t cxDesc,
+                          const void *cx,
+                          const cudnnFilterDescriptor_t wDesc,
+                          const void *w,
+                          const cudnnRNNDataDescriptor_t yDesc,
+                          void *y,
+                          const cudnnTensorDescriptor_t hyDesc,
+                          void *hy,
+                          const cudnnTensorDescriptor_t cyDesc,
+                          void *cy,
+                          const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
+                          const void *keys,                     /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
+                          void *cAttn,                          /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
+                          void *iAttn,                          /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
+                          void *queries,                        /* reserved, should pass NULL */
+                          void *workSpace,
+                          size_t workSpaceSizeInBytes,
+                          void *reserveSpace,
+                          size_t reserveSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardDataEx(cudnnHandle_t handle,
+                       const cudnnRNNDescriptor_t rnnDesc,
+                       const cudnnRNNDataDescriptor_t yDesc,
+                       const void *y,
+                       const cudnnRNNDataDescriptor_t dyDesc,
+                       const void *dy,
+                       const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
+                       const void *dcAttn,                    /* reserved, should pass NULL */
+                       const cudnnTensorDescriptor_t dhyDesc,
+                       const void *dhy,
+                       const cudnnTensorDescriptor_t dcyDesc,
+                       const void *dcy,
+                       const cudnnFilterDescriptor_t wDesc,
+                       const void *w,
+                       const cudnnTensorDescriptor_t hxDesc,
+                       const void *hx,
+                       const cudnnTensorDescriptor_t cxDesc,
+                       const void *cx,
+                       const cudnnRNNDataDescriptor_t dxDesc,
+                       void *dx,
+                       const cudnnTensorDescriptor_t dhxDesc,
+                       void *dhx,
+                       const cudnnTensorDescriptor_t dcxDesc,
+                       void *dcx,
+                       const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
+                       void *dkeys,                           /* reserved, should pass NULL */
+                       void *workSpace,
+                       size_t workSpaceSizeInBytes,
+                       void *reserveSpace,
+                       size_t reserveSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeightsEx(cudnnHandle_t handle,
+                          const cudnnRNNDescriptor_t rnnDesc,
+                          const cudnnRNNDataDescriptor_t xDesc,
+                          const void *x,
+                          const cudnnTensorDescriptor_t hxDesc,
+                          const void *hx,
+                          const cudnnRNNDataDescriptor_t yDesc,
+                          const void *y,
+                          void *workSpace,
+                          size_t workSpaceSizeInBytes,
+                          const cudnnFilterDescriptor_t dwDesc,
+                          void *dw,
+                          void *reserveSpace,
+                          size_t reserveSpaceSizeInBytes);
+
+/* RNN FIND API */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnRNNDescriptor_t rnnDesc,
+                                       const int seqLength,
+                                       const cudnnTensorDescriptor_t *xDesc,
+                                       const void *x,
+                                       const cudnnTensorDescriptor_t hxDesc,
+                                       const void *hx,
+                                       const cudnnTensorDescriptor_t cxDesc,
+                                       const void *cx,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnTensorDescriptor_t *yDesc,
+                                       void *y,
+                                       const cudnnTensorDescriptor_t hyDesc,
+                                       void *hy,
+                                       const cudnnTensorDescriptor_t cyDesc,
+                                       void *cy,
+                                       const float findIntensity,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnAlgorithmPerformance_t *perfResults,
+                                       void *workspace,
+                                       size_t workSpaceSizeInBytes,
+                                       void *reserveSpace,
+                                       size_t reserveSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                    const cudnnRNNDescriptor_t rnnDesc,
+                                    const int seqLength,
+                                    const cudnnTensorDescriptor_t *yDesc,
+                                    const void *y,
+                                    const cudnnTensorDescriptor_t *dyDesc,
+                                    const void *dy,
+                                    const cudnnTensorDescriptor_t dhyDesc,
+                                    const void *dhy,
+                                    const cudnnTensorDescriptor_t dcyDesc,
+                                    const void *dcy,
+                                    const cudnnFilterDescriptor_t wDesc,
+                                    const void *w,
+                                    const cudnnTensorDescriptor_t hxDesc,
+                                    const void *hx,
+                                    const cudnnTensorDescriptor_t cxDesc,
+                                    const void *cx,
+                                    const cudnnTensorDescriptor_t *dxDesc,
+                                    void *dx,
+                                    const cudnnTensorDescriptor_t dhxDesc,
+                                    void *dhx,
+                                    const cudnnTensorDescriptor_t dcxDesc,
+                                    void *dcx,
+                                    const float findIntensity,
+                                    const int requestedAlgoCount,
+                                    int *returnedAlgoCount,
+                                    cudnnAlgorithmPerformance_t *perfResults,
+                                    void *workspace,
+                                    size_t workSpaceSizeInBytes,
+                                    void *reserveSpace,
+                                    size_t reserveSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnRNNDescriptor_t rnnDesc,
+                                       const int seqLength,
+                                       const cudnnTensorDescriptor_t *xDesc,
+                                       const void *x,
+                                       const cudnnTensorDescriptor_t hxDesc,
+                                       const void *hx,
+                                       const cudnnTensorDescriptor_t *yDesc,
+                                       const void *y,
+                                       const float findIntensity,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnAlgorithmPerformance_t *perfResults,
+                                       const void *workspace,
+                                       size_t workSpaceSizeInBytes,
+                                       const cudnnFilterDescriptor_t dwDesc,
+                                       void *dw,
+                                       const void *reserveSpace,
+                                       size_t reserveSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
+                               const cudnnAttnDescriptor_t attnDesc,
+                               const int loWinIdx[],
+                               const int hiWinIdx[],
+                               const int devSeqLengthsDQDO[],
+                               const int devSeqLengthsDKDV[],
+                               const cudnnSeqDataDescriptor_t doDesc,
+                               const void *dout,
+                               const cudnnSeqDataDescriptor_t dqDesc,
+                               void *dqueries,
+                               const void *queries,
+                               const cudnnSeqDataDescriptor_t dkDesc,
+                               void *dkeys,
+                               const void *keys,
+                               const cudnnSeqDataDescriptor_t dvDesc,
+                               void *dvalues,
+                               const void *values,
+                               size_t weightSizeInBytes,
+                               const void *weights,
+                               size_t workSpaceSizeInBytes,
+                               void *workSpace,
+                               size_t reserveSpaceSizeInBytes,
+                               void *reserveSpace);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
+                                  const cudnnAttnDescriptor_t attnDesc,
+                                  cudnnWgradMode_t addGrad,
+                                  const cudnnSeqDataDescriptor_t qDesc,
+                                  const void *queries,
+                                  const cudnnSeqDataDescriptor_t kDesc,
+                                  const void *keys,
+                                  const cudnnSeqDataDescriptor_t vDesc,
+                                  const void *values,
+                                  const cudnnSeqDataDescriptor_t doDesc,
+                                  const void *dout,
+                                  size_t weightSizeInBytes,
+                                  const void *weights,
+                                  void *dweights,
+                                  size_t workSpaceSizeInBytes,
+                                  void *workSpace,
+                                  size_t reserveSpaceSizeInBytes,
+                                  void *reserveSpace);
+
+/*
+* CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
+*/
+/* Input normalization mode for loss function */
+typedef enum {
+    CUDNN_LOSS_NORMALIZATION_NONE    = 0,
+    CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
+} cudnnLossNormalizationMode_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
+                            cudnnDataType_t compType,
+                            cudnnLossNormalizationMode_t normMode,
+                            cudnnNanPropagation_t gradMode);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t compType,
+                             cudnnLossNormalizationMode_t normMode,
+                             cudnnNanPropagation_t gradMode,
+                             int maxLabelLength);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
+                            cudnnDataType_t *compType,
+                            cudnnLossNormalizationMode_t *normMode,
+                            cudnnNanPropagation_t *gradMode);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t *compType,
+                             cudnnLossNormalizationMode_t *normMode,
+                             cudnnNanPropagation_t *gradMode,
+                             int *maxLabelLength);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
+
+/* return the ctc costs and gradients, given the probabilities and labels */
+cudnnStatus_t CUDNNWINAPI
+cudnnCTCLoss(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t
+        probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
+                          mini batch size, A is the alphabet size)  */
+    const void *probs, /* probabilities after softmax, in GPU memory */
+    const int hostLabels[],                      /* labels, in CPU memory */
+    const int hostLabelLengths[],                /* the length of each label, in CPU memory */
+    const int hostInputLengths[],                /* the lengths of timing steps in each batch, in CPU memory */
+    void *costs,                                 /* the returned costs of CTC, in GPU memory */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+    void *gradients,         /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    void *workspace,              /* pointer to the workspace, in GPU memory */
+    size_t workSpaceSizeInBytes); /* size of the workspace */
+
+/* return the ctc costs and gradients, given the probabilities and labels */
+cudnnStatus_t CUDNNWINAPI
+cudnnCTCLoss_v8(
+    cudnnHandle_t handle,
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    const cudnnTensorDescriptor_t
+        probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
+                          mini batch size, A is the alphabet size)  */
+    const void *probs, /* probabilities after softmax, in GPU memory */
+    const int labels[],                          /* labels, in GPU memory */
+    const int labelLengths[],                    /* the length of each label, in GPU memory */
+    const int inputLengths[],                    /* the lengths of timing steps in each batch, in GPU memory */
+    void *costs,                                 /* the returned costs of CTC, in GPU memory */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+    void *gradients,             /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+    size_t workSpaceSizeInBytes, /* size of the workspace */
+    void *workspace);            /* pointer to the workspace, in GPU memory */
+
+/* return the workspace size needed for ctc */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossWorkspaceSize(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
+                                                timing steps, N is the mini batch size, A is the alphabet size) */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
+                                                    dimensions are T,N,A. To compute costs
+                                                    only, set it to NULL */
+    const int *labels,                           /* labels, in CPU memory */
+    const int *labelLengths,                     /* the length of each label, in CPU memory */
+    const int *inputLengths,                     /* the lengths of timing steps in each batch, in CPU memory */
+    cudnnCTCLossAlgo_t algo,                     /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    size_t *sizeInBytes); /* pointer to the returned workspace size */
+
+/* return the workspace size needed for ctc */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossWorkspaceSize_v8(
+    cudnnHandle_t handle,
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
+                                                timing steps, N is the mini batch size, A is the alphabet size) */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
+                                                    dimensions are T,N,A. To compute costs
+                                                    only, set it to NULL */
+    size_t *sizeInBytes);                        /* pointer to the returned workspace size */
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnAdvTrainVersionCheck(void);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_ADV_TRAIN_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfebee101195e52a789815eaf94ea0f581072ac4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h
@@ -0,0 +1,600 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDNN_BACKEND_H_
+#define _CUDNN_BACKEND_H_
+
+/*
+ * The content in this header file is under development to be included in cudnn.h in the future
+ * Production code should have all include of this header file remove.
+ */
+
+#include "cudnn_ops_infer.h"
+#include "cudnn_cnn_infer.h"
+
+/* NOTE: definition in extern "C" to be copied later to public header */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef void *cudnnBackendDescriptor_t;
+
+typedef struct cudnnFractionStruct {
+    int64_t numerator;
+    int64_t denominator;
+} cudnnFraction_t;
+
+typedef enum {
+    CUDNN_POINTWISE_ADD        = 0,
+    CUDNN_POINTWISE_ADD_SQUARE = 5,
+    CUDNN_POINTWISE_DIV        = 6,
+    CUDNN_POINTWISE_MAX        = 3,
+    CUDNN_POINTWISE_MIN        = 2,
+    CUDNN_POINTWISE_MOD        = 7,
+    CUDNN_POINTWISE_MUL        = 1,
+    CUDNN_POINTWISE_POW        = 8,
+    CUDNN_POINTWISE_SUB        = 9,
+
+    CUDNN_POINTWISE_ABS      = 10,
+    CUDNN_POINTWISE_CEIL     = 11,
+    CUDNN_POINTWISE_COS      = 12,
+    CUDNN_POINTWISE_EXP      = 13,
+    CUDNN_POINTWISE_FLOOR    = 14,
+    CUDNN_POINTWISE_LOG      = 15,
+    CUDNN_POINTWISE_NEG      = 16,
+    CUDNN_POINTWISE_RSQRT    = 17,
+    CUDNN_POINTWISE_SIN      = 18,
+    CUDNN_POINTWISE_SQRT     = 4,
+    CUDNN_POINTWISE_TAN      = 19,
+    CUDNN_POINTWISE_ERF      = 20,
+    CUDNN_POINTWISE_IDENTITY = 21,
+
+    CUDNN_POINTWISE_RELU_FWD             = 100,
+    CUDNN_POINTWISE_TANH_FWD             = 101,
+    CUDNN_POINTWISE_SIGMOID_FWD          = 102,
+    CUDNN_POINTWISE_ELU_FWD              = 103,
+    CUDNN_POINTWISE_GELU_FWD             = 104,
+    CUDNN_POINTWISE_SOFTPLUS_FWD         = 105,
+    CUDNN_POINTWISE_SWISH_FWD            = 106,
+    CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
+
+    CUDNN_POINTWISE_RELU_BWD             = 200,
+    CUDNN_POINTWISE_TANH_BWD             = 201,
+    CUDNN_POINTWISE_SIGMOID_BWD          = 202,
+    CUDNN_POINTWISE_ELU_BWD              = 203,
+    CUDNN_POINTWISE_GELU_BWD             = 204,
+    CUDNN_POINTWISE_SOFTPLUS_BWD         = 205,
+    CUDNN_POINTWISE_SWISH_BWD            = 206,
+    CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
+
+    CUDNN_POINTWISE_CMP_EQ  = 300,
+    CUDNN_POINTWISE_CMP_NEQ = 301,
+    CUDNN_POINTWISE_CMP_GT  = 302,
+    CUDNN_POINTWISE_CMP_GE  = 303,
+    CUDNN_POINTWISE_CMP_LT  = 304,
+    CUDNN_POINTWISE_CMP_LE  = 305,
+
+    CUDNN_POINTWISE_LOGICAL_AND = 400,
+    CUDNN_POINTWISE_LOGICAL_OR  = 401,
+    CUDNN_POINTWISE_LOGICAL_NOT = 402,
+
+    CUDNN_POINTWISE_GEN_INDEX = 501,
+
+    CUDNN_POINTWISE_BINARY_SELECT = 601,
+} cudnnPointwiseMode_t;
+
+typedef enum {
+    CUDNN_RESAMPLE_NEAREST                 = 0,
+    CUDNN_RESAMPLE_BILINEAR                = 1,
+    CUDNN_RESAMPLE_AVGPOOL                 = 2,
+    CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
+    CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
+    CUDNN_RESAMPLE_MAXPOOL                 = 3,
+} cudnnResampleMode_t;
+
+typedef enum {
+    CUDNN_SIGNAL_SET  = 0,
+    CUDNN_SIGNAL_WAIT = 1,
+} cudnnSignalMode_t;
+
+typedef enum {
+    CUDNN_GENSTATS_SUM_SQSUM = 0,
+} cudnnGenStatsMode_t;
+
+typedef enum {
+    CUDNN_BN_FINALIZE_STATISTICS_TRAINING  = 0,
+    CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
+} cudnnBnFinalizeStatsMode_t;
+
+typedef enum {
+    CUDNN_RNG_DISTRIBUTION_BERNOULLI,
+    CUDNN_RNG_DISTRIBUTION_UNIFORM,
+    CUDNN_RNG_DISTRIBUTION_NORMAL,
+} cudnnRngDistribution_t;
+
+typedef enum {
+    CUDNN_ATTR_POINTWISE_MODE                  = 0,
+    CUDNN_ATTR_POINTWISE_MATH_PREC             = 1,
+    CUDNN_ATTR_POINTWISE_NAN_PROPAGATION       = 2,
+    CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP       = 3,
+    CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP       = 4,
+    CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
+    CUDNN_ATTR_POINTWISE_ELU_ALPHA             = 6,
+    CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA         = 7,
+    CUDNN_ATTR_POINTWISE_SWISH_BETA            = 8,
+    CUDNN_ATTR_POINTWISE_AXIS                  = 9,
+
+    CUDNN_ATTR_CONVOLUTION_COMP_TYPE      = 100,
+    CUDNN_ATTR_CONVOLUTION_CONV_MODE      = 101,
+    CUDNN_ATTR_CONVOLUTION_DILATIONS      = 102,
+    CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
+    CUDNN_ATTR_CONVOLUTION_POST_PADDINGS  = 104,
+    CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS   = 105,
+    CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS   = 106,
+
+    CUDNN_ATTR_ENGINEHEUR_MODE            = 200,
+    CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
+    CUDNN_ATTR_ENGINEHEUR_RESULTS         = 202,
+
+    CUDNN_ATTR_ENGINECFG_ENGINE            = 300,
+    CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
+    CUDNN_ATTR_ENGINECFG_KNOB_CHOICES      = 302,
+
+    CUDNN_ATTR_EXECUTION_PLAN_HANDLE                     = 400,
+    CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG              = 401,
+    CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE             = 402,
+    CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
+    CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
+    CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION        = 405,
+
+    CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID            = 500,
+    CUDNN_ATTR_INTERMEDIATE_INFO_SIZE                 = 501,
+    CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS  = 502,
+    CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
+
+    CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE  = 600,
+    CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
+
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA        = 700,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA         = 701,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC    = 702,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W            = 703,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X            = 704,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y            = 705,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA       = 706,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA        = 707,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC   = 708,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W           = 709,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX          = 710,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY          = 711,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA     = 712,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA      = 713,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW        = 715,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X         = 716,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY        = 717,
+
+    CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
+    CUDNN_ATTR_OPERATION_POINTWISE_XDESC         = 751,
+    CUDNN_ATTR_OPERATION_POINTWISE_BDESC         = 752,
+    CUDNN_ATTR_OPERATION_POINTWISE_YDESC         = 753,
+    CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1        = 754,
+    CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2        = 755,
+    CUDNN_ATTR_OPERATION_POINTWISE_DXDESC        = 756,
+    CUDNN_ATTR_OPERATION_POINTWISE_DYDESC        = 757,
+    CUDNN_ATTR_OPERATION_POINTWISE_TDESC         = 758,
+
+    CUDNN_ATTR_OPERATION_GENSTATS_MODE      = 770,
+    CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
+    CUDNN_ATTR_OPERATION_GENSTATS_XDESC     = 772,
+    CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC   = 773,
+    CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
+
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE                = 780,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC                 = 781,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC                = 782,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC             = 783,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC                = 784,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC                 = 785,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC    = 786,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC     = 787,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC  = 789,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC           = 790,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC        = 791,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC             = 792,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC              = 793,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC          = 794,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC              = 795,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC   = 796,
+
+    CUDNN_ATTR_OPERATIONGRAPH_HANDLE              = 800,
+    CUDNN_ATTR_OPERATIONGRAPH_OPS                 = 801,
+    CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
+
+    CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT       = 900,
+    CUDNN_ATTR_TENSOR_DATA_TYPE            = 901,
+    CUDNN_ATTR_TENSOR_DIMENSIONS           = 902,
+    CUDNN_ATTR_TENSOR_STRIDES              = 903,
+    CUDNN_ATTR_TENSOR_VECTOR_COUNT         = 904,
+    CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
+    CUDNN_ATTR_TENSOR_UNIQUE_ID            = 906,
+    CUDNN_ATTR_TENSOR_IS_VIRTUAL           = 907,
+    CUDNN_ATTR_TENSOR_IS_BY_VALUE          = 908,
+    CUDNN_ATTR_TENSOR_REORDERING_MODE      = 909,
+
+    CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS    = 1000,
+    CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
+    CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
+    CUDNN_ATTR_VARIANT_PACK_WORKSPACE     = 1003,
+
+    CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
+    CUDNN_ATTR_LAYOUT_INFO_TYPES      = 1101,
+
+    CUDNN_ATTR_KNOB_INFO_TYPE          = 1200,
+    CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
+    CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
+    CUDNN_ATTR_KNOB_INFO_STRIDE        = 1203,
+
+    CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
+    CUDNN_ATTR_ENGINE_GLOBAL_INDEX    = 1301,
+    CUDNN_ATTR_ENGINE_KNOB_INFO       = 1302,
+    CUDNN_ATTR_ENGINE_NUMERICAL_NOTE  = 1303,
+    CUDNN_ATTR_ENGINE_LAYOUT_INFO     = 1304,
+    CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE   = 1305,
+
+    CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
+
+    CUDNN_ATTR_OPERATION_MATMUL_ADESC                           = 1520,
+    CUDNN_ATTR_OPERATION_MATMUL_BDESC                           = 1521,
+    CUDNN_ATTR_OPERATION_MATMUL_CDESC                           = 1522,
+    CUDNN_ATTR_OPERATION_MATMUL_DESC                            = 1523,
+    CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524,
+    CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC            = 1525,
+    CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC            = 1526,
+    CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC            = 1527,
+
+    CUDNN_ATTR_REDUCTION_OPERATOR  = 1600,
+    CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
+
+    CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
+    CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
+    CUDNN_ATTR_OPERATION_REDUCTION_DESC  = 1612,
+
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC        = 1620,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC        = 1621,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC      = 1622,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC    = 1623,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC           = 1624,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC          = 1625,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC   = 1626,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC    = 1627,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC  = 1629,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS          = 1630,
+
+    CUDNN_ATTR_RESAMPLE_MODE            = 1700,
+    CUDNN_ATTR_RESAMPLE_COMP_TYPE       = 1701,
+    CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS    = 1702,
+    CUDNN_ATTR_RESAMPLE_POST_PADDINGS   = 1703,
+    CUDNN_ATTR_RESAMPLE_PRE_PADDINGS    = 1704,
+    CUDNN_ATTR_RESAMPLE_STRIDES         = 1705,
+    CUDNN_ATTR_RESAMPLE_WINDOW_DIMS     = 1706,
+    CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
+    CUDNN_ATTR_RESAMPLE_PADDING_MODE    = 1708,
+
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC   = 1710,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC   = 1711,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA   = 1713,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA    = 1714,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC    = 1716,
+
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC  = 1720,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC  = 1721,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA   = 1723,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA    = 1724,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC    = 1725,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC   = 1726,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC   = 1727,
+
+    CUDNN_ATTR_OPERATION_CONCAT_AXIS          = 1800,
+    CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS   = 1801,
+    CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
+    CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC   = 1803,
+
+    CUDNN_ATTR_OPERATION_SIGNAL_MODE     = 1900,
+    CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
+    CUDNN_ATTR_OPERATION_SIGNAL_VALUE    = 1902,
+    CUDNN_ATTR_OPERATION_SIGNAL_XDESC    = 1903,
+    CUDNN_ATTR_OPERATION_SIGNAL_YDESC    = 1904,
+
+    CUDNN_ATTR_OPERATION_NORM_FWD_MODE                     = 2000,
+    CUDNN_ATTR_OPERATION_NORM_FWD_PHASE                    = 2001,
+    CUDNN_ATTR_OPERATION_NORM_FWD_XDESC                    = 2002,
+    CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC                = 2003,
+    CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC        = 2004,
+    CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC               = 2005,
+    CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC                = 2006,
+    CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC             = 2007,
+    CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC      = 2008,
+    CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC  = 2009,
+    CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC   = 2010,
+    CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
+    CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC  = 2012,
+    CUDNN_ATTR_OPERATION_NORM_FWD_YDESC                    = 2013,
+    CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS          = 2014,
+
+    CUDNN_ATTR_OPERATION_NORM_BWD_MODE              = 2100,
+    CUDNN_ATTR_OPERATION_NORM_BWD_XDESC             = 2101,
+    CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC         = 2102,
+    CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC            = 2104,
+    CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC        = 2105,
+    CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC      = 2106,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC       = 2107,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC        = 2108,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC            = 2109,
+    CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS   = 2110,
+
+    CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
+    CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
+
+    CUDNN_ATTR_RNG_DISTRIBUTION                   = 2300,
+    CUDNN_ATTR_RNG_NORMAL_DIST_MEAN               = 2301,
+    CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
+    CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM           = 2303,
+    CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM           = 2304,
+    CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY     = 2305,
+
+    CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
+    CUDNN_ATTR_OPERATION_RNG_SEED  = 2311,
+    CUDNN_ATTR_OPERATION_RNG_DESC  = 2312,
+
+} cudnnBackendAttributeName_t;
+
+typedef enum {
+    CUDNN_TYPE_HANDLE = 0,
+    CUDNN_TYPE_DATA_TYPE,
+    CUDNN_TYPE_BOOLEAN,
+    CUDNN_TYPE_INT64,
+    CUDNN_TYPE_FLOAT,
+    CUDNN_TYPE_DOUBLE,
+    CUDNN_TYPE_VOID_PTR,
+    CUDNN_TYPE_CONVOLUTION_MODE,
+    CUDNN_TYPE_HEUR_MODE,
+    CUDNN_TYPE_KNOB_TYPE,
+    CUDNN_TYPE_NAN_PROPOGATION,
+    CUDNN_TYPE_NUMERICAL_NOTE,
+    CUDNN_TYPE_LAYOUT_TYPE,
+    CUDNN_TYPE_ATTRIB_NAME,
+    CUDNN_TYPE_POINTWISE_MODE,
+    CUDNN_TYPE_BACKEND_DESCRIPTOR,
+    CUDNN_TYPE_GENSTATS_MODE,
+    CUDNN_TYPE_BN_FINALIZE_STATS_MODE,
+    CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
+    CUDNN_TYPE_BEHAVIOR_NOTE,
+    CUDNN_TYPE_TENSOR_REORDERING_MODE,
+    CUDNN_TYPE_RESAMPLE_MODE,
+    CUDNN_TYPE_PADDING_MODE,
+    CUDNN_TYPE_INT32,
+    CUDNN_TYPE_CHAR,
+    CUDNN_TYPE_SIGNAL_MODE,
+    CUDNN_TYPE_FRACTION,
+    CUDNN_TYPE_NORM_MODE,
+    CUDNN_TYPE_NORM_FWD_PHASE,
+    CUDNN_TYPE_RNG_DISTRIBUTION
+} cudnnBackendAttributeType_t;
+
+typedef enum {
+    CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
+    CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
+    CUDNN_BACKEND_ENGINE_DESCRIPTOR,
+    CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
+    CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
+    CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
+    CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
+    CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
+    CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
+    CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
+    CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
+    CUDNN_BACKEND_TENSOR_DESCRIPTOR,
+    CUDNN_BACKEND_MATMUL_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
+    CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
+    CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
+    CUDNN_BACKEND_RNG_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR
+} cudnnBackendDescriptorType_t;
+
+typedef enum {
+    CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
+    CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
+    CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
+    CUDNN_NUMERICAL_NOTE_FFT,
+    CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
+    CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
+} cudnnBackendNumericalNote_t;
+
+typedef enum {
+    CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION             = 0,
+    CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
+    CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER   = 2,
+    CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
+} cudnnBackendBehaviorNote_t;
+
+typedef enum {
+    CUDNN_KNOB_TYPE_SPLIT_K          = 0,
+    CUDNN_KNOB_TYPE_SWIZZLE          = 1,
+    CUDNN_KNOB_TYPE_TILE_SIZE        = 2,
+    CUDNN_KNOB_TYPE_USE_TEX          = 3,
+    CUDNN_KNOB_TYPE_EDGE             = 4,
+    CUDNN_KNOB_TYPE_KBLOCK           = 5,
+    CUDNN_KNOB_TYPE_LDGA             = 6,
+    CUDNN_KNOB_TYPE_LDGB             = 7,
+    CUDNN_KNOB_TYPE_CHUNK_K          = 8,
+    CUDNN_KNOB_TYPE_SPLIT_H          = 9,
+    CUDNN_KNOB_TYPE_WINO_TILE        = 10,
+    CUDNN_KNOB_TYPE_MULTIPLY         = 11,
+    CUDNN_KNOB_TYPE_SPLIT_K_BUF      = 12,
+    CUDNN_KNOB_TYPE_TILEK            = 13,
+    CUDNN_KNOB_TYPE_STAGES           = 14,
+    CUDNN_KNOB_TYPE_REDUCTION_MODE   = 15,
+    CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16,
+    CUDNN_KNOB_TYPE_SPLIT_K_SLC      = 17,
+    CUDNN_KNOB_TYPE_IDX_MODE         = 18,
+    CUDNN_KNOB_TYPE_SLICED           = 19,
+    CUDNN_KNOB_TYPE_SPLIT_RS         = 20,
+    CUDNN_KNOB_TYPE_SINGLEBUFFER     = 21,
+    CUDNN_KNOB_TYPE_LDGC             = 22,
+    CUDNN_KNOB_TYPE_SPECFILT         = 23,
+    CUDNN_KNOB_TYPE_KERNEL_CFG       = 24,
+    CUDNN_KNOB_TYPE_WORKSPACE        = 25,
+    CUDNN_KNOB_TYPE_TILE_CGA         = 26,
+    CUDNN_KNOB_TYPE_TILE_CGA_M       = 27,
+    CUDNN_KNOB_TYPE_TILE_CGA_N       = 28,
+
+    CUDNN_KNOB_TYPE_COUNTS,
+} cudnnBackendKnobType_t;
+
+typedef enum {
+    CUDNN_LAYOUT_TYPE_PREFERRED_NCHW   = 0,
+    CUDNN_LAYOUT_TYPE_PREFERRED_NHWC   = 1,
+    CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
+    CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
+    CUDNN_LAYOUT_TYPE_COUNT            = 4,
+} cudnnBackendLayoutType_t;
+
+typedef enum {
+    CUDNN_HEUR_MODE_INSTANT  = 0,
+    CUDNN_HEUR_MODE_B        = 1,
+    CUDNN_HEUR_MODE_FALLBACK = 2,
+    CUDNN_HEUR_MODE_A        = 3,
+    CUDNN_HEUR_MODES_COUNT   = 4,
+} cudnnBackendHeurMode_t;
+
+typedef enum {
+    CUDNN_TENSOR_REORDERING_NONE    = 0,
+    CUDNN_TENSOR_REORDERING_INT8x32 = 1,
+} cudnnBackendTensorReordering_t;
+
+typedef enum {
+    CUDNN_ZERO_PAD     = 0,
+    CUDNN_NEG_INF_PAD  = 1,
+    CUDNN_EDGE_VAL_PAD = 2,
+} cudnnPaddingMode_t;
+
+typedef enum {
+    CUDNN_LAYER_NORM    = 0,
+    CUDNN_INSTANCE_NORM = 1,
+    CUDNN_BATCH_NORM    = 2,
+    CUDNN_GROUP_NORM    = 3,
+} cudnnBackendNormMode_t;
+
+typedef enum {
+    CUDNN_NORM_FWD_INFERENCE = 0,
+    CUDNN_NORM_FWD_TRAINING  = 1,
+} cudnnBackendNormFwdPhase_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
+                         cudnnBackendAttributeName_t attributeName,
+                         cudnnBackendAttributeType_t attributeType,
+                         int64_t elementCount,
+                         const void *arrayOfElements);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
+                         cudnnBackendAttributeName_t attributeName,
+                         cudnnBackendAttributeType_t attributeType,
+                         int64_t requestedElementCount,
+                         int64_t *elementCount,
+                         void *arrayOfElements);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _CUDNN_BACKEND_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h
new file mode 100644
index 0000000000000000000000000000000000000000..88335dfb027c5193c7cfc888ed84dc352dc5de13
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h
@@ -0,0 +1,1183 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_ops_infer : cuDNN's basic definitions and inference operations.
+ */
+
+#if !defined(CUDNN_OPS_INFER_H_)
+#define CUDNN_OPS_INFER_H_
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_OPS_INFER_MAJOR 8
+#define CUDNN_OPS_INFER_MINOR 7
+#define CUDNN_OPS_INFER_PATCH 0
+
+#if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \
+    (CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN OPS INFER!!!
+#endif
+
+#ifndef CUDNNWINAPI
+#ifdef _WIN32
+#define CUDNNWINAPI __stdcall
+#else
+#define CUDNNWINAPI
+#endif
+#endif
+
+/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
+#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
+/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
+#define CUDNN_DEPRECATED __attribute__((deprecated))
+#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
+/* Microsoft Visual C++ */
+#define CUDNN_DEPRECATED __declspec(deprecated)
+#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
+/* C++14 compilers */
+#define CUDNN_DEPRECATED [[deprecated]]
+#else
+/* No support for the deprecated attribute */
+#define CUDNN_DEPRECATED
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct cudnnContext;
+typedef struct cudnnContext *cudnnHandle_t;
+
+size_t CUDNNWINAPI
+cudnnGetVersion(void);
+
+size_t CUDNNWINAPI
+cudnnGetMaxDeviceVersion(void);
+
+/* Returns CUDA Runtime version statically linked against cudnn */
+size_t CUDNNWINAPI
+cudnnGetCudartVersion(void);
+
+/*
+ * CUDNN return codes
+ */
+typedef enum {
+    CUDNN_STATUS_SUCCESS                      = 0,
+    CUDNN_STATUS_NOT_INITIALIZED              = 1,
+    CUDNN_STATUS_ALLOC_FAILED                 = 2,
+    CUDNN_STATUS_BAD_PARAM                    = 3,
+    CUDNN_STATUS_INTERNAL_ERROR               = 4,
+    CUDNN_STATUS_INVALID_VALUE                = 5,
+    CUDNN_STATUS_ARCH_MISMATCH                = 6,
+    CUDNN_STATUS_MAPPING_ERROR                = 7,
+    CUDNN_STATUS_EXECUTION_FAILED             = 8,
+    CUDNN_STATUS_NOT_SUPPORTED                = 9,
+    CUDNN_STATUS_LICENSE_ERROR                = 10,
+    CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
+    CUDNN_STATUS_RUNTIME_IN_PROGRESS          = 12,
+    CUDNN_STATUS_RUNTIME_FP_OVERFLOW          = 13,
+    CUDNN_STATUS_VERSION_MISMATCH             = 14,
+} cudnnStatus_t;
+
+/* human-readable error messages */
+const char *CUDNNWINAPI
+cudnnGetErrorString(cudnnStatus_t status);
+
+/* Forward definition in this version only */
+typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t;
+
+typedef enum {
+    CUDNN_ERRQUERY_RAWCODE     = 0,
+    CUDNN_ERRQUERY_NONBLOCKING = 1,
+    CUDNN_ERRQUERY_BLOCKING    = 2,
+} cudnnErrQueryMode_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
+
+#ifndef __LIBRARY_TYPES_H__
+
+typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;
+
+#endif
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetProperty(libraryPropertyType type, int *value);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreate(cudnnHandle_t *handle);
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroy(cudnnHandle_t handle);
+cudnnStatus_t CUDNNWINAPI
+cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
+cudnnStatus_t CUDNNWINAPI
+cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
+
+/* Data structures to represent Image/Filter and the Neural Network Layer */
+typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
+typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t;
+typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t;
+typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
+typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t;
+typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
+typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t;
+typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t;
+typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
+typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t;
+/*
+ * CUDNN data type
+ */
+typedef enum {
+    CUDNN_DATA_FLOAT              = 0,
+    CUDNN_DATA_DOUBLE             = 1,
+    CUDNN_DATA_HALF               = 2,
+    CUDNN_DATA_INT8               = 3,
+    CUDNN_DATA_INT32              = 4,
+    CUDNN_DATA_INT8x4             = 5,
+    CUDNN_DATA_UINT8              = 6,
+    CUDNN_DATA_UINT8x4            = 7,
+    CUDNN_DATA_INT8x32            = 8,
+    CUDNN_DATA_BFLOAT16           = 9,
+    CUDNN_DATA_INT64              = 10,
+    CUDNN_DATA_BOOLEAN            = 11,
+    CUDNN_DATA_FP8_E4M3           = 12,
+    CUDNN_DATA_FP8_E5M2           = 13,
+    CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
+} cudnnDataType_t;
+
+/*
+ * CUDNN math type
+ */
+typedef enum {
+    CUDNN_DEFAULT_MATH                    = 0,
+    CUDNN_TENSOR_OP_MATH                  = 1,
+    CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
+    CUDNN_FMA_MATH                        = 3,
+} cudnnMathType_t;
+
+/*
+ * CUDNN propagate Nan
+ */
+typedef enum {
+    CUDNN_NOT_PROPAGATE_NAN = 0,
+    CUDNN_PROPAGATE_NAN     = 1,
+} cudnnNanPropagation_t;
+
+/*
+ * CUDNN Determinism
+ */
+typedef enum {
+    CUDNN_NON_DETERMINISTIC = 0,
+    CUDNN_DETERMINISTIC     = 1,
+} cudnnDeterminism_t;
+
+/* Maximum supported number of tensor dimensions */
+#define CUDNN_DIM_MAX 8
+
+/* Create an instance of a generic Tensor descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
+
+typedef enum {
+    CUDNN_TENSOR_NCHW        = 0, /* row major (wStride = 1, hStride = w) */
+    CUDNN_TENSOR_NHWC        = 1, /* feature maps interleaved ( cStride = 1 )*/
+    CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
+} cudnnTensorFormat_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnTensorFormat_t format,
+                           cudnnDataType_t dataType, /* image data type */
+                           int n,                    /* number of inputs (batch size) */
+                           int c,                    /* number of input feature maps */
+                           int h,                    /* height of input section */
+                           int w);                   /* width of input section */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnDataType_t dataType, /* image data type */
+                             int n,                    /* number of inputs (batch size) */
+                             int c,                    /* number of input feature maps */
+                             int h,                    /* height of input section */
+                             int w,                    /* width of input section */
+                             int nStride,
+                             int cStride,
+                             int hStride,
+                             int wStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           int *n,                    /* number of inputs (batch size) */
+                           int *c,                    /* number of input feature maps  */
+                           int *h,                    /* height of input section */
+                           int *w,                    /* width of input section */
+                           int *nStride,
+                           int *cStride,
+                           int *hStride,
+                           int *wStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t dataType,
+                           int nbDims,
+                           const int dimA[],
+                           const int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnTensorFormat_t format,
+                             cudnnDataType_t dataType,
+                             int nbDims,
+                             const int dimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType,
+                           int *nbDims,
+                           int dimA[],
+                           int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
+
+/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
+
+   1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
+   input_stride :  c x h x h_stride
+   feature_stride : h x h_stride
+   h_stride  :  >= w  ( h_stride = w if no padding)
+   w_stride  : 1
+
+
+   2)Example of all images in row major with features maps interleaved
+   input_stride :  c x h x h_stride
+   feature_stride : 1
+   h_stride  :  w x c
+   w_stride  : c
+
+   3)Example of all images in column major order one batch of features after the other (with optional padding on column)
+   input_stride :  c x w x w_stride
+   feature_stride : w x w_stride
+   h_stride  :  1
+   w_stride  :  >= h
+
+*/
+
+/* Destroy an instance of Tensor4d descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
+
+/* Fold/unfold transforms */
+typedef enum {
+    CUDNN_TRANSFORM_FOLD   = 0U,
+    CUDNN_TRANSFORM_UNFOLD = 1U,
+} cudnnFoldingDirection_t;
+
+/** Create a destination descriptor for cudnnTransformTensor */
+cudnnStatus_t CUDNNWINAPI
+cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
+                       const cudnnTensorDescriptor_t srcDesc,
+                       cudnnTensorDescriptor_t destDesc,
+                       size_t *destSizeInBytes);
+
+/** Create an empty tensor transform descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
+
+/** Initialize a previously created tensor transform descriptor. */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
+                                  const uint32_t nbDims,
+                                  const cudnnTensorFormat_t destFormat,
+                                  const int32_t padBeforeA[],
+                                  const int32_t padAfterA[],
+                                  const uint32_t foldA[],
+                                  const cudnnFoldingDirection_t direction);
+
+/**
+ * Retrieves the values stored in a previously initialized tensor transform
+ * descriptor.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
+                                  uint32_t nbDimsRequested,
+                                  cudnnTensorFormat_t *destFormat,
+                                  int32_t padBeforeA[],
+                                  int32_t padAfterA[],
+                                  uint32_t foldA[],
+                                  cudnnFoldingDirection_t *direction);
+
+/**
+ * Destroys a previously created tensor transform descriptor.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
+
+/* Tensor layout conversion helper (y = alpha * x + beta * y) */
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensor(cudnnHandle_t handle,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t yDesc,
+                     void *y);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensorEx(cudnnHandle_t handle,
+                       const cudnnTensorTransformDescriptor_t transDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t srcDesc,
+                       const void *srcData,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t destDesc,
+                       void *destData);
+
+/* Tensor Bias addition : C = alpha * A + beta * C  */
+cudnnStatus_t CUDNNWINAPI
+cudnnAddTensor(cudnnHandle_t handle,
+               const void *alpha,
+               const cudnnTensorDescriptor_t aDesc,
+               const void *A,
+               const void *beta,
+               const cudnnTensorDescriptor_t cDesc,
+               void *C);
+
+/*
+ * CUDNN OpTensor op type
+ */
+typedef enum {
+    CUDNN_OP_TENSOR_ADD  = 0,
+    CUDNN_OP_TENSOR_MUL  = 1,
+    CUDNN_OP_TENSOR_MIN  = 2,
+    CUDNN_OP_TENSOR_MAX  = 3,
+    CUDNN_OP_TENSOR_SQRT = 4,
+    CUDNN_OP_TENSOR_NOT  = 5,
+} cudnnOpTensorOp_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t opTensorOp,
+                           cudnnDataType_t opTensorCompType,
+                           cudnnNanPropagation_t opTensorNanOpt);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t *opTensorOp,
+                           cudnnDataType_t *opTensorCompType,
+                           cudnnNanPropagation_t *opTensorNanOpt);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
+
+/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
+/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
+cudnnStatus_t CUDNNWINAPI
+cudnnOpTensor(cudnnHandle_t handle,
+              const cudnnOpTensorDescriptor_t opTensorDesc,
+              const void *alpha1,
+              const cudnnTensorDescriptor_t aDesc,
+              const void *A,
+              const void *alpha2,
+              const cudnnTensorDescriptor_t bDesc,
+              const void *B,
+              const void *beta,
+              const cudnnTensorDescriptor_t cDesc,
+              void *C);
+
+/*
+ * CUDNN ReduceTensor op type
+ */
+typedef enum {
+    CUDNN_REDUCE_TENSOR_ADD          = 0,
+    CUDNN_REDUCE_TENSOR_MUL          = 1,
+    CUDNN_REDUCE_TENSOR_MIN          = 2,
+    CUDNN_REDUCE_TENSOR_MAX          = 3,
+    CUDNN_REDUCE_TENSOR_AMAX         = 4,
+    CUDNN_REDUCE_TENSOR_AVG          = 5,
+    CUDNN_REDUCE_TENSOR_NORM1        = 6,
+    CUDNN_REDUCE_TENSOR_NORM2        = 7,
+    CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
+} cudnnReduceTensorOp_t;
+
+/*
+ * CUDNN ReduceTensor indices type
+ */
+typedef enum {
+    CUDNN_REDUCE_TENSOR_NO_INDICES        = 0,
+    CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
+} cudnnReduceTensorIndices_t;
+
+/*
+ * CUDNN tensor indices type size (all unsigned)
+ * Currently not supported, default is 32 bit unsigned.
+ */
+typedef enum {
+    CUDNN_32BIT_INDICES = 0,
+    CUDNN_64BIT_INDICES = 1,
+    CUDNN_16BIT_INDICES = 2,
+    CUDNN_8BIT_INDICES  = 3,
+} cudnnIndicesType_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t reduceTensorOp,
+                               cudnnDataType_t reduceTensorCompType,
+                               cudnnNanPropagation_t reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t reduceTensorIndices,
+                               cudnnIndicesType_t reduceTensorIndicesType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t *reduceTensorOp,
+                               cudnnDataType_t *reduceTensorCompType,
+                               cudnnNanPropagation_t *reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t *reduceTensorIndices,
+                               cudnnIndicesType_t *reduceTensorIndicesType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
+
+/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
+ * output tensors */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionIndicesSize(cudnnHandle_t handle,
+                             const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                             const cudnnTensorDescriptor_t aDesc,
+                             const cudnnTensorDescriptor_t cDesc,
+                             size_t *sizeInBytes);
+
+/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
+ * tensors */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
+                               const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               const cudnnTensorDescriptor_t aDesc,
+                               const cudnnTensorDescriptor_t cDesc,
+                               size_t *sizeInBytes);
+
+/* Tensor operation : C = reduce op( alpha * A ) + beta * C */
+/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
+/* The indices space is ignored for reduce ops other than min or max. */
+cudnnStatus_t CUDNNWINAPI
+cudnnReduceTensor(cudnnHandle_t handle,
+                  const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                  void *indices,
+                  size_t indicesSizeInBytes,
+                  void *workspace,
+                  size_t workspaceSizeInBytes,
+                  const void *alpha,
+                  const cudnnTensorDescriptor_t aDesc,
+                  const void *A,
+                  const void *beta,
+                  const cudnnTensorDescriptor_t cDesc,
+                  void *C);
+
+/* Set all values of a tensor to a given value : y[i] = value[0] */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
+
+/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
+cudnnStatus_t CUDNNWINAPI
+cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
+
+/* Create an instance of FilterStruct */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int k,  /* number of output feature maps */
+                           int c,  /* number of input feature maps */
+                           int h,  /* height of each input filter */
+                           int w); /* width of  each input filter */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *k,  /* number of output feature maps */
+                           int *c,  /* number of input feature maps */
+                           int *h,  /* height of each input filter */
+                           int *w); /* width of  each input filter */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int nbDims,
+                           const int filterDimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *nbDims,
+                           int filterDimA[]);
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformFilter(cudnnHandle_t handle,
+                     const cudnnTensorTransformDescriptor_t transDesc,
+                     const void *alpha,
+                     const cudnnFilterDescriptor_t srcDesc,
+                     const void *srcData,
+                     const void *beta,
+                     const cudnnFilterDescriptor_t destDesc,
+                     void *destData);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
+
+/*
+ *  softmax algorithm
+ */
+typedef enum {
+    CUDNN_SOFTMAX_FAST     = 0, /* straightforward implementation */
+    CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
+    CUDNN_SOFTMAX_LOG      = 2
+} cudnnSoftmaxAlgorithm_t;
+
+typedef enum {
+    CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
+    CUDNN_SOFTMAX_MODE_CHANNEL  = 1  /* compute the softmax over all C for each H, W, N */
+} cudnnSoftmaxMode_t;
+
+/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform forward softmax */
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxForward(cudnnHandle_t handle,
+                    cudnnSoftmaxAlgorithm_t algo,
+                    cudnnSoftmaxMode_t mode,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y);
+
+/*
+ *  pooling mode
+ */
+typedef enum {
+    CUDNN_POOLING_MAX                           = 0,
+    CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
+    CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
+    CUDNN_POOLING_MAX_DETERMINISTIC             = 3
+} cudnnPoolingMode_t;
+
+/* Create an instance of pooling descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t mode,
+                            cudnnNanPropagation_t maxpoolingNanOpt,
+                            int windowHeight,
+                            int windowWidth,
+                            int verticalPadding,
+                            int horizontalPadding,
+                            int verticalStride,
+                            int horizontalStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *windowHeight,
+                            int *windowWidth,
+                            int *verticalPadding,
+                            int *horizontalPadding,
+                            int *verticalStride,
+                            int *horizontalStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            const cudnnPoolingMode_t mode,
+                            const cudnnNanPropagation_t maxpoolingNanOpt,
+                            int nbDims,
+                            const int windowDimA[],
+                            const int paddingA[],
+                            const int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            int nbDimsRequested,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *nbDims,
+                            int windowDimA[],
+                            int paddingA[],
+                            int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int nbDims,
+                                  int outputTensorDimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int *n,
+                                  int *c,
+                                  int *h,
+                                  int *w);
+
+/* Destroy an instance of pooling descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
+
+/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform forward pooling */
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingForward(cudnnHandle_t handle,
+                    const cudnnPoolingDescriptor_t poolingDesc,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y);
+
+/*
+ * activation mode
+ */
+typedef enum {
+    CUDNN_ACTIVATION_SIGMOID      = 0,
+    CUDNN_ACTIVATION_RELU         = 1,
+    CUDNN_ACTIVATION_TANH         = 2,
+    CUDNN_ACTIVATION_CLIPPED_RELU = 3,
+    CUDNN_ACTIVATION_ELU          = 4,
+    CUDNN_ACTIVATION_IDENTITY     = 5,
+    CUDNN_ACTIVATION_SWISH        = 6
+} cudnnActivationMode_t;
+
+/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t mode,
+                             cudnnNanPropagation_t reluNanOpt,
+                             double coef); /* ceiling for clipped RELU, alpha for ELU */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t *mode,
+                             cudnnNanPropagation_t *reluNanOpt,
+                             double *coef); /* ceiling for clipped RELU, alpha for ELU */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
+
+/* Function to perform forward activation  */
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationForward(cudnnHandle_t handle,
+                       cudnnActivationDescriptor_t activationDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t xDesc,
+                       const void *x,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t yDesc,
+                       void *y);
+
+/*
+ * Create an instance of LRN (Local Response Normalization) descriptor
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
+
+#define CUDNN_LRN_MIN_N 1       /* minimum allowed lrnN */
+#define CUDNN_LRN_MAX_N 16      /* maximum allowed lrnN */
+#define CUDNN_LRN_MIN_K 1e-5    /* minimum allowed lrnK */
+#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
+
+/* LRN layer mode */
+typedef enum {
+    CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
+} cudnnLRNMode_t;
+
+/*
+ * Uses a window [center-lookBehind, center+lookAhead], where
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
+ * Values of double parameters cast to tensor data type.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
+/*
+ * Retrieve the settings currently stored in an LRN layer descriptor
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
+
+/* Destroy an instance of LRN descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
+
+/* LRN functions: output = alpha * normalize(x) + beta * old_y */
+
+/* LRN cross-channel forward computation. Double parameters cast to tensor data type */
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelForward(cudnnHandle_t handle,
+                            cudnnLRNDescriptor_t normDesc,
+                            cudnnLRNMode_t lrnMode,
+                            const void *alpha,
+                            const cudnnTensorDescriptor_t xDesc,
+                            const void *x,
+                            const void *beta,
+                            const cudnnTensorDescriptor_t yDesc,
+                            void *y);
+
+typedef enum {
+    CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
+} cudnnDivNormMode_t;
+
+/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
+                                  cudnnLRNDescriptor_t normDesc,
+                                  cudnnDivNormMode_t mode,
+                                  const void *alpha,
+                                  const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
+                                  const void *x,
+                                  const void *means, /* if NULL, means are assumed to be zero */
+                                  void *temp,
+                                  void *temp2,
+                                  const void *beta,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *y);
+
+typedef enum {
+    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
+    CUDNN_BATCHNORM_PER_ACTIVATION = 0,
+
+    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
+    CUDNN_BATCHNORM_SPATIAL = 1,
+
+    /*
+     * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
+     * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
+     */
+    CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
+} cudnnBatchNormMode_t;
+
+#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
+
+/*
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
+                              const cudnnTensorDescriptor_t xDesc,
+                              cudnnBatchNormMode_t mode);
+
+typedef enum {
+    CUDNN_BATCHNORM_OPS_BN                = 0, /* do batch normalization only */
+    CUDNN_BATCHNORM_OPS_BN_ACTIVATION     = 1, /* do batchNorm, then activation */
+    CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
+} cudnnBatchNormOps_t;
+
+/*
+ * Performs Batch Normalization during Inference:
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
+ * above for notes on function arguments.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
+                                        cudnnBatchNormMode_t mode,
+                                        const void *alpha, /* alpha[0] = result blend factor */
+                                        const void *beta,  /* beta[0] = dest layer blend factor */
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const void *x, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        void *y, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                        const void *bnScale,
+                                        const void *bnBias,
+                                        const void *estimatedMean,
+                                        const void *estimatedVariance,
+                                        double epsilon);
+
+typedef enum {
+    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
+    CUDNN_NORM_PER_ACTIVATION = 0,
+
+    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
+    CUDNN_NORM_PER_CHANNEL = 1,
+} cudnnNormMode_t;
+
+typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t;
+
+/*
+ * Derives a tensor descriptor from layer data descriptor for Normalization
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
+                                cudnnTensorDescriptor_t derivedNormMeanVarDesc,
+                                const cudnnTensorDescriptor_t xDesc,
+                                cudnnNormMode_t mode,
+                                int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+typedef enum {
+    CUDNN_NORM_OPS_NORM                = 0, /* do normalization only */
+    CUDNN_NORM_OPS_NORM_ACTIVATION     = 1, /* do Norm, then activation */
+    CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
+} cudnnNormOps_t;
+
+/*
+ * Performs Normalization during Inference:
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
+ * above for notes on function arguments.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationForwardInference(cudnnHandle_t handle,
+                                   cudnnNormMode_t mode,
+                                   cudnnNormOps_t normOps,
+                                   cudnnNormAlgo_t algo,
+                                   const void *alpha, /* alpha[0] = result blend factor */
+                                   const void *beta,  /* beta[0] = dest layer blend factor */
+                                   const cudnnTensorDescriptor_t xDesc,
+                                   const void *x, /* NxCxHxW */
+                                   const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                   const void *normScale,
+                                   const void *normBias,
+                                   const cudnnTensorDescriptor_t normMeanVarDesc,
+                                   const void *estimatedMean,
+                                   const void *estimatedVariance,
+                                   const cudnnTensorDescriptor_t zDesc,
+                                   const void *z,
+                                   cudnnActivationDescriptor_t activationDesc,
+                                   const cudnnTensorDescriptor_t yDesc,
+                                   void *y, /* NxCxHxW */
+                                   double epsilon,
+                                   int groupCnt); /* Place hold for future work*/
+
+/* APIs for spatial transformer network*/
+typedef enum {
+    CUDNN_SAMPLER_BILINEAR = 0,
+} cudnnSamplerType_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
+                                       cudnnSamplerType_t samplerType,
+                                       cudnnDataType_t dataType,
+                                       const int nbDims,
+                                       const int dimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
+                                   const cudnnSpatialTransformerDescriptor_t stDesc,
+                                   const void *theta,
+                                   void *grid);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
+                             cudnnSpatialTransformerDescriptor_t stDesc,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *grid,
+                             const void *beta,
+                             cudnnTensorDescriptor_t yDesc,
+                             void *y);
+
+typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
+
+/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
+
+/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float dropout,
+                          void *states,
+                          size_t stateSizeInBytes,
+                          unsigned long long seed);
+
+/* Restores the dropout descriptor to a previously saved-off state */
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                              cudnnHandle_t handle,
+                              float dropout,
+                              void *states,
+                              size_t stateSizeInBytes,
+                              unsigned long long seed);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float *dropout,
+                          void **states,
+                          unsigned long long *seed);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutForward(cudnnHandle_t handle,
+                    const cudnnDropoutDescriptor_t dropoutDesc,
+                    const cudnnTensorDescriptor_t xdesc,
+                    const void *x,
+                    const cudnnTensorDescriptor_t ydesc,
+                    void *y,
+                    void *reserveSpace,
+                    size_t reserveSpaceSizeInBytes);
+
+/* TODO: remove */
+
+typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t;
+typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t;
+
+/* TODO: move these enums out to the appropriate submodule */
+typedef enum {
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM         = 0,
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
+    CUDNN_CONVOLUTION_FWD_ALGO_GEMM                  = 2,
+    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT                = 3,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT                   = 4,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING            = 5,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD              = 6,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED     = 7,
+    CUDNN_CONVOLUTION_FWD_ALGO_COUNT                 = 8
+} cudnnConvolutionFwdAlgo_t;
+
+typedef enum {
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0                 = 0, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 = 1,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT               = 2,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3                 = 3, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD          = 4, /* not implemented */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING        = 6,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT             = 7
+} cudnnConvolutionBwdFilterAlgo_t;
+
+typedef enum {
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0                 = 0, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1                 = 1,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT               = 2,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING        = 3,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD          = 4,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT             = 6
+} cudnnConvolutionBwdDataAlgo_t;
+
+typedef enum {
+    CUDNN_RNN_ALGO_STANDARD               = 0,
+    CUDNN_RNN_ALGO_PERSIST_STATIC         = 1,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC        = 2,
+    CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
+    CUDNN_RNN_ALGO_COUNT                  = 4,
+} cudnnRNNAlgo_t;
+
+typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
+
+/* TODO: remove */
+typedef struct cudnnAlgorithmUnionStruct {
+    union Algorithm {
+        cudnnConvolutionFwdAlgo_t convFwdAlgo;
+        cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo;
+        cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo;
+        cudnnRNNAlgo_t RNNAlgo;
+        cudnnCTCLossAlgo_t CTCLossAlgo;
+    } algo;
+} cudnnAlgorithm_t;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t algoDesc,
+                             cudnnStatus_t status,
+                             float time,
+                             size_t memory);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t *algoDesc,
+                             cudnnStatus_t *status,
+                             float *time,
+                             size_t *memory);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSaveAlgorithm(cudnnHandle_t handle,
+                   cudnnAlgorithmDescriptor_t algoDesc,
+                   void *algoSpace,
+                   size_t algoSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRestoreAlgorithm(cudnnHandle_t handle,
+                      void *algoSpace,
+                      size_t algoSpaceSizeInBytes,
+                      cudnnAlgorithmDescriptor_t algoDesc);
+
+typedef enum {
+    CUDNN_SEV_FATAL   = 0,
+    CUDNN_SEV_ERROR   = 1,
+    CUDNN_SEV_WARNING = 2,
+    CUDNN_SEV_INFO    = 3,
+} cudnnSeverity_t;
+
+/* Message masks to be used with cudnnSetCallback() */
+#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
+#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
+#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
+
+/* struct containing useful informaiton for each API call */
+typedef struct cudnnDebugStruct {
+    unsigned cudnn_version;
+    cudnnStatus_t cudnnStatus;
+    unsigned time_sec;      /* epoch time in seconds */
+    unsigned time_usec;     /* microseconds part of epoch time */
+    unsigned time_delta;    /* time since start in seconds */
+    cudnnHandle_t handle;   /* cudnn handle */
+    cudaStream_t stream;    /* cuda stream ID */
+    unsigned long long pid; /* process ID */
+    unsigned long long tid; /* thread ID */
+    int cudaDeviceId;       /* CUDA device ID */
+    int reserved[15];       /* reserved for future use */
+} cudnnDebug_t;
+
+typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnOpsInferVersionCheck(void);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_OPS_INFER_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h
new file mode 100644
index 0000000000000000000000000000000000000000..88335dfb027c5193c7cfc888ed84dc352dc5de13
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h
@@ -0,0 +1,1183 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_ops_infer : cuDNN's basic definitions and inference operations.
+ */
+
+#if !defined(CUDNN_OPS_INFER_H_)
+#define CUDNN_OPS_INFER_H_
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_OPS_INFER_MAJOR 8
+#define CUDNN_OPS_INFER_MINOR 7
+#define CUDNN_OPS_INFER_PATCH 0
+
+#if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \
+    (CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN OPS INFER!!!
+#endif
+
+#ifndef CUDNNWINAPI
+#ifdef _WIN32
+#define CUDNNWINAPI __stdcall
+#else
+#define CUDNNWINAPI
+#endif
+#endif
+
+/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
+#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
+/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
+#define CUDNN_DEPRECATED __attribute__((deprecated))
+#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
+/* Microsoft Visual C++ */
+#define CUDNN_DEPRECATED __declspec(deprecated)
+#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
+/* C++14 compilers */
+#define CUDNN_DEPRECATED [[deprecated]]
+#else
+/* No support for the deprecated attribute */
+#define CUDNN_DEPRECATED
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct cudnnContext;
+typedef struct cudnnContext *cudnnHandle_t;
+
+size_t CUDNNWINAPI
+cudnnGetVersion(void);
+
+size_t CUDNNWINAPI
+cudnnGetMaxDeviceVersion(void);
+
+/* Returns CUDA Runtime version statically linked against cudnn */
+size_t CUDNNWINAPI
+cudnnGetCudartVersion(void);
+
+/*
+ * CUDNN return codes
+ */
+typedef enum {
+    CUDNN_STATUS_SUCCESS                      = 0,
+    CUDNN_STATUS_NOT_INITIALIZED              = 1,
+    CUDNN_STATUS_ALLOC_FAILED                 = 2,
+    CUDNN_STATUS_BAD_PARAM                    = 3,
+    CUDNN_STATUS_INTERNAL_ERROR               = 4,
+    CUDNN_STATUS_INVALID_VALUE                = 5,
+    CUDNN_STATUS_ARCH_MISMATCH                = 6,
+    CUDNN_STATUS_MAPPING_ERROR                = 7,
+    CUDNN_STATUS_EXECUTION_FAILED             = 8,
+    CUDNN_STATUS_NOT_SUPPORTED                = 9,
+    CUDNN_STATUS_LICENSE_ERROR                = 10,
+    CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
+    CUDNN_STATUS_RUNTIME_IN_PROGRESS          = 12,
+    CUDNN_STATUS_RUNTIME_FP_OVERFLOW          = 13,
+    CUDNN_STATUS_VERSION_MISMATCH             = 14,
+} cudnnStatus_t;
+
+/* human-readable error messages */
+const char *CUDNNWINAPI
+cudnnGetErrorString(cudnnStatus_t status);
+
+/* Forward definition in this version only */
+typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t;
+
+typedef enum {
+    CUDNN_ERRQUERY_RAWCODE     = 0,
+    CUDNN_ERRQUERY_NONBLOCKING = 1,
+    CUDNN_ERRQUERY_BLOCKING    = 2,
+} cudnnErrQueryMode_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
+
+#ifndef __LIBRARY_TYPES_H__
+
+typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;
+
+#endif
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetProperty(libraryPropertyType type, int *value);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreate(cudnnHandle_t *handle);
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroy(cudnnHandle_t handle);
+cudnnStatus_t CUDNNWINAPI
+cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
+cudnnStatus_t CUDNNWINAPI
+cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
+
+/* Data structures to represent Image/Filter and the Neural Network Layer */
+typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
+typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t;
+typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t;
+typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
+typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t;
+typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
+typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t;
+typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t;
+typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
+typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t;
+/*
+ * CUDNN data type
+ */
+typedef enum {
+    CUDNN_DATA_FLOAT              = 0,
+    CUDNN_DATA_DOUBLE             = 1,
+    CUDNN_DATA_HALF               = 2,
+    CUDNN_DATA_INT8               = 3,
+    CUDNN_DATA_INT32              = 4,
+    CUDNN_DATA_INT8x4             = 5,
+    CUDNN_DATA_UINT8              = 6,
+    CUDNN_DATA_UINT8x4            = 7,
+    CUDNN_DATA_INT8x32            = 8,
+    CUDNN_DATA_BFLOAT16           = 9,
+    CUDNN_DATA_INT64              = 10,
+    CUDNN_DATA_BOOLEAN            = 11,
+    CUDNN_DATA_FP8_E4M3           = 12,
+    CUDNN_DATA_FP8_E5M2           = 13,
+    CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
+} cudnnDataType_t;
+
+/*
+ * CUDNN math type
+ */
+typedef enum {
+    CUDNN_DEFAULT_MATH                    = 0,
+    CUDNN_TENSOR_OP_MATH                  = 1,
+    CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
+    CUDNN_FMA_MATH                        = 3,
+} cudnnMathType_t;
+
+/*
+ * CUDNN propagate Nan
+ */
+typedef enum {
+    CUDNN_NOT_PROPAGATE_NAN = 0,
+    CUDNN_PROPAGATE_NAN     = 1,
+} cudnnNanPropagation_t;
+
+/*
+ * CUDNN Determinism
+ */
+typedef enum {
+    CUDNN_NON_DETERMINISTIC = 0,
+    CUDNN_DETERMINISTIC     = 1,
+} cudnnDeterminism_t;
+
+/* Maximum supported number of tensor dimensions */
+#define CUDNN_DIM_MAX 8
+
+/* Create an instance of a generic Tensor descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
+
+typedef enum {
+    CUDNN_TENSOR_NCHW        = 0, /* row major (wStride = 1, hStride = w) */
+    CUDNN_TENSOR_NHWC        = 1, /* feature maps interleaved ( cStride = 1 )*/
+    CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
+} cudnnTensorFormat_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnTensorFormat_t format,
+                           cudnnDataType_t dataType, /* image data type */
+                           int n,                    /* number of inputs (batch size) */
+                           int c,                    /* number of input feature maps */
+                           int h,                    /* height of input section */
+                           int w);                   /* width of input section */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnDataType_t dataType, /* image data type */
+                             int n,                    /* number of inputs (batch size) */
+                             int c,                    /* number of input feature maps */
+                             int h,                    /* height of input section */
+                             int w,                    /* width of input section */
+                             int nStride,
+                             int cStride,
+                             int hStride,
+                             int wStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           int *n,                    /* number of inputs (batch size) */
+                           int *c,                    /* number of input feature maps  */
+                           int *h,                    /* height of input section */
+                           int *w,                    /* width of input section */
+                           int *nStride,
+                           int *cStride,
+                           int *hStride,
+                           int *wStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t dataType,
+                           int nbDims,
+                           const int dimA[],
+                           const int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnTensorFormat_t format,
+                             cudnnDataType_t dataType,
+                             int nbDims,
+                             const int dimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType,
+                           int *nbDims,
+                           int dimA[],
+                           int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
+
+/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
+
+   1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
+   input_stride :  c x h x h_stride
+   feature_stride : h x h_stride
+   h_stride  :  >= w  ( h_stride = w if no padding)
+   w_stride  : 1
+
+
+   2)Example of all images in row major with features maps interleaved
+   input_stride :  c x h x h_stride
+   feature_stride : 1
+   h_stride  :  w x c
+   w_stride  : c
+
+   3)Example of all images in column major order one batch of features after the other (with optional padding on column)
+   input_stride :  c x w x w_stride
+   feature_stride : w x w_stride
+   h_stride  :  1
+   w_stride  :  >= h
+
+*/
+
+/* Destroy an instance of Tensor4d descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
+
+/* Fold/unfold transforms */
+typedef enum {
+    CUDNN_TRANSFORM_FOLD   = 0U,
+    CUDNN_TRANSFORM_UNFOLD = 1U,
+} cudnnFoldingDirection_t;
+
+/** Create a destination descriptor for cudnnTransformTensor */
+cudnnStatus_t CUDNNWINAPI
+cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
+                       const cudnnTensorDescriptor_t srcDesc,
+                       cudnnTensorDescriptor_t destDesc,
+                       size_t *destSizeInBytes);
+
+/** Create an empty tensor transform descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
+
+/** Initialize a previously created tensor transform descriptor. */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
+                                  const uint32_t nbDims,
+                                  const cudnnTensorFormat_t destFormat,
+                                  const int32_t padBeforeA[],
+                                  const int32_t padAfterA[],
+                                  const uint32_t foldA[],
+                                  const cudnnFoldingDirection_t direction);
+
+/**
+ * Retrieves the values stored in a previously initialized tensor transform
+ * descriptor.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
+                                  uint32_t nbDimsRequested,
+                                  cudnnTensorFormat_t *destFormat,
+                                  int32_t padBeforeA[],
+                                  int32_t padAfterA[],
+                                  uint32_t foldA[],
+                                  cudnnFoldingDirection_t *direction);
+
+/**
+ * Destroys a previously created tensor transform descriptor.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
+
+/* Tensor layout conversion helper (y = alpha * x + beta * y) */
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensor(cudnnHandle_t handle,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t yDesc,
+                     void *y);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensorEx(cudnnHandle_t handle,
+                       const cudnnTensorTransformDescriptor_t transDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t srcDesc,
+                       const void *srcData,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t destDesc,
+                       void *destData);
+
+/* Tensor Bias addition : C = alpha * A + beta * C  */
+cudnnStatus_t CUDNNWINAPI
+cudnnAddTensor(cudnnHandle_t handle,
+               const void *alpha,
+               const cudnnTensorDescriptor_t aDesc,
+               const void *A,
+               const void *beta,
+               const cudnnTensorDescriptor_t cDesc,
+               void *C);
+
+/*
+ * CUDNN OpTensor op type
+ */
+typedef enum {
+    CUDNN_OP_TENSOR_ADD  = 0,
+    CUDNN_OP_TENSOR_MUL  = 1,
+    CUDNN_OP_TENSOR_MIN  = 2,
+    CUDNN_OP_TENSOR_MAX  = 3,
+    CUDNN_OP_TENSOR_SQRT = 4,
+    CUDNN_OP_TENSOR_NOT  = 5,
+} cudnnOpTensorOp_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t opTensorOp,
+                           cudnnDataType_t opTensorCompType,
+                           cudnnNanPropagation_t opTensorNanOpt);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t *opTensorOp,
+                           cudnnDataType_t *opTensorCompType,
+                           cudnnNanPropagation_t *opTensorNanOpt);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
+
+/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
+/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
+cudnnStatus_t CUDNNWINAPI
+cudnnOpTensor(cudnnHandle_t handle,
+              const cudnnOpTensorDescriptor_t opTensorDesc,
+              const void *alpha1,
+              const cudnnTensorDescriptor_t aDesc,
+              const void *A,
+              const void *alpha2,
+              const cudnnTensorDescriptor_t bDesc,
+              const void *B,
+              const void *beta,
+              const cudnnTensorDescriptor_t cDesc,
+              void *C);
+
+/*
+ * CUDNN ReduceTensor op type
+ */
+typedef enum {
+    CUDNN_REDUCE_TENSOR_ADD          = 0,
+    CUDNN_REDUCE_TENSOR_MUL          = 1,
+    CUDNN_REDUCE_TENSOR_MIN          = 2,
+    CUDNN_REDUCE_TENSOR_MAX          = 3,
+    CUDNN_REDUCE_TENSOR_AMAX         = 4,
+    CUDNN_REDUCE_TENSOR_AVG          = 5,
+    CUDNN_REDUCE_TENSOR_NORM1        = 6,
+    CUDNN_REDUCE_TENSOR_NORM2        = 7,
+    CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
+} cudnnReduceTensorOp_t;
+
+/*
+ * CUDNN ReduceTensor indices type
+ */
+typedef enum {
+    CUDNN_REDUCE_TENSOR_NO_INDICES        = 0,
+    CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
+} cudnnReduceTensorIndices_t;
+
+/*
+ * CUDNN tensor indices type size (all unsigned)
+ * Currently not supported, default is 32 bit unsigned.
+ */
+typedef enum {
+    CUDNN_32BIT_INDICES = 0,
+    CUDNN_64BIT_INDICES = 1,
+    CUDNN_16BIT_INDICES = 2,
+    CUDNN_8BIT_INDICES  = 3,
+} cudnnIndicesType_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t reduceTensorOp,
+                               cudnnDataType_t reduceTensorCompType,
+                               cudnnNanPropagation_t reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t reduceTensorIndices,
+                               cudnnIndicesType_t reduceTensorIndicesType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t *reduceTensorOp,
+                               cudnnDataType_t *reduceTensorCompType,
+                               cudnnNanPropagation_t *reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t *reduceTensorIndices,
+                               cudnnIndicesType_t *reduceTensorIndicesType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
+
+/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
+ * output tensors */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionIndicesSize(cudnnHandle_t handle,
+                             const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                             const cudnnTensorDescriptor_t aDesc,
+                             const cudnnTensorDescriptor_t cDesc,
+                             size_t *sizeInBytes);
+
+/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
+ * tensors */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
+                               const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               const cudnnTensorDescriptor_t aDesc,
+                               const cudnnTensorDescriptor_t cDesc,
+                               size_t *sizeInBytes);
+
+/* Tensor operation : C = reduce op( alpha * A ) + beta * C */
+/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
+/* The indices space is ignored for reduce ops other than min or max. */
+cudnnStatus_t CUDNNWINAPI
+cudnnReduceTensor(cudnnHandle_t handle,
+                  const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                  void *indices,
+                  size_t indicesSizeInBytes,
+                  void *workspace,
+                  size_t workspaceSizeInBytes,
+                  const void *alpha,
+                  const cudnnTensorDescriptor_t aDesc,
+                  const void *A,
+                  const void *beta,
+                  const cudnnTensorDescriptor_t cDesc,
+                  void *C);
+
+/* Set all values of a tensor to a given value : y[i] = value[0] */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
+
+/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
+cudnnStatus_t CUDNNWINAPI
+cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
+
+/* Create an instance of FilterStruct */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int k,  /* number of output feature maps */
+                           int c,  /* number of input feature maps */
+                           int h,  /* height of each input filter */
+                           int w); /* width of  each input filter */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *k,  /* number of output feature maps */
+                           int *c,  /* number of input feature maps */
+                           int *h,  /* height of each input filter */
+                           int *w); /* width of  each input filter */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int nbDims,
+                           const int filterDimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *nbDims,
+                           int filterDimA[]);
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformFilter(cudnnHandle_t handle,
+                     const cudnnTensorTransformDescriptor_t transDesc,
+                     const void *alpha,
+                     const cudnnFilterDescriptor_t srcDesc,
+                     const void *srcData,
+                     const void *beta,
+                     const cudnnFilterDescriptor_t destDesc,
+                     void *destData);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
+
+/*
+ *  softmax algorithm
+ */
+typedef enum {
+    CUDNN_SOFTMAX_FAST     = 0, /* straightforward implementation */
+    CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
+    CUDNN_SOFTMAX_LOG      = 2
+} cudnnSoftmaxAlgorithm_t;
+
+typedef enum {
+    CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
+    CUDNN_SOFTMAX_MODE_CHANNEL  = 1  /* compute the softmax over all C for each H, W, N */
+} cudnnSoftmaxMode_t;
+
+/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform forward softmax */
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxForward(cudnnHandle_t handle,
+                    cudnnSoftmaxAlgorithm_t algo,
+                    cudnnSoftmaxMode_t mode,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y);
+
+/*
+ *  pooling mode
+ */
+typedef enum {
+    CUDNN_POOLING_MAX                           = 0,
+    CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
+    CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
+    CUDNN_POOLING_MAX_DETERMINISTIC             = 3
+} cudnnPoolingMode_t;
+
+/* Create an instance of pooling descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t mode,
+                            cudnnNanPropagation_t maxpoolingNanOpt,
+                            int windowHeight,
+                            int windowWidth,
+                            int verticalPadding,
+                            int horizontalPadding,
+                            int verticalStride,
+                            int horizontalStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *windowHeight,
+                            int *windowWidth,
+                            int *verticalPadding,
+                            int *horizontalPadding,
+                            int *verticalStride,
+                            int *horizontalStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            const cudnnPoolingMode_t mode,
+                            const cudnnNanPropagation_t maxpoolingNanOpt,
+                            int nbDims,
+                            const int windowDimA[],
+                            const int paddingA[],
+                            const int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            int nbDimsRequested,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *nbDims,
+                            int windowDimA[],
+                            int paddingA[],
+                            int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int nbDims,
+                                  int outputTensorDimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int *n,
+                                  int *c,
+                                  int *h,
+                                  int *w);
+
+/* Destroy an instance of pooling descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
+
+/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform forward pooling */
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingForward(cudnnHandle_t handle,
+                    const cudnnPoolingDescriptor_t poolingDesc,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y);
+
+/*
+ * activation mode
+ */
+typedef enum {
+    CUDNN_ACTIVATION_SIGMOID      = 0,
+    CUDNN_ACTIVATION_RELU         = 1,
+    CUDNN_ACTIVATION_TANH         = 2,
+    CUDNN_ACTIVATION_CLIPPED_RELU = 3,
+    CUDNN_ACTIVATION_ELU          = 4,
+    CUDNN_ACTIVATION_IDENTITY     = 5,
+    CUDNN_ACTIVATION_SWISH        = 6
+} cudnnActivationMode_t;
+
+/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t mode,
+                             cudnnNanPropagation_t reluNanOpt,
+                             double coef); /* ceiling for clipped RELU, alpha for ELU */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t *mode,
+                             cudnnNanPropagation_t *reluNanOpt,
+                             double *coef); /* ceiling for clipped RELU, alpha for ELU */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
+
+/* Function to perform forward activation  */
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationForward(cudnnHandle_t handle,
+                       cudnnActivationDescriptor_t activationDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t xDesc,
+                       const void *x,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t yDesc,
+                       void *y);
+
+/*
+ * Create an instance of LRN (Local Response Normalization) descriptor
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
+
+#define CUDNN_LRN_MIN_N 1       /* minimum allowed lrnN */
+#define CUDNN_LRN_MAX_N 16      /* maximum allowed lrnN */
+#define CUDNN_LRN_MIN_K 1e-5    /* minimum allowed lrnK */
+#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
+
+/* LRN layer mode */
+typedef enum {
+    CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
+} cudnnLRNMode_t;
+
+/*
+ * Uses a window [center-lookBehind, center+lookAhead], where
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
+ * Values of double parameters cast to tensor data type.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
+/*
+ * Retrieve the settings currently stored in an LRN layer descriptor
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
+
+/* Destroy an instance of LRN descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
+
+/* LRN functions: output = alpha * normalize(x) + beta * old_y */
+
+/* LRN cross-channel forward computation. Double parameters cast to tensor data type */
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelForward(cudnnHandle_t handle,
+                            cudnnLRNDescriptor_t normDesc,
+                            cudnnLRNMode_t lrnMode,
+                            const void *alpha,
+                            const cudnnTensorDescriptor_t xDesc,
+                            const void *x,
+                            const void *beta,
+                            const cudnnTensorDescriptor_t yDesc,
+                            void *y);
+
+typedef enum {
+    CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
+} cudnnDivNormMode_t;
+
+/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
+                                  cudnnLRNDescriptor_t normDesc,
+                                  cudnnDivNormMode_t mode,
+                                  const void *alpha,
+                                  const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
+                                  const void *x,
+                                  const void *means, /* if NULL, means are assumed to be zero */
+                                  void *temp,
+                                  void *temp2,
+                                  const void *beta,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *y);
+
+typedef enum {
+    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
+    CUDNN_BATCHNORM_PER_ACTIVATION = 0,
+
+    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
+    CUDNN_BATCHNORM_SPATIAL = 1,
+
+    /*
+     * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
+     * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
+     */
+    CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
+} cudnnBatchNormMode_t;
+
+#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
+
+/*
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
+                              const cudnnTensorDescriptor_t xDesc,
+                              cudnnBatchNormMode_t mode);
+
+typedef enum {
+    CUDNN_BATCHNORM_OPS_BN                = 0, /* do batch normalization only */
+    CUDNN_BATCHNORM_OPS_BN_ACTIVATION     = 1, /* do batchNorm, then activation */
+    CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
+} cudnnBatchNormOps_t;
+
+/*
+ * Performs Batch Normalization during Inference:
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
+ * above for notes on function arguments.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
+                                        cudnnBatchNormMode_t mode,
+                                        const void *alpha, /* alpha[0] = result blend factor */
+                                        const void *beta,  /* beta[0] = dest layer blend factor */
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const void *x, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        void *y, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                        const void *bnScale,
+                                        const void *bnBias,
+                                        const void *estimatedMean,
+                                        const void *estimatedVariance,
+                                        double epsilon);
+
+typedef enum {
+    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
+    CUDNN_NORM_PER_ACTIVATION = 0,
+
+    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
+    CUDNN_NORM_PER_CHANNEL = 1,
+} cudnnNormMode_t;
+
+typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t;
+
+/*
+ * Derives a tensor descriptor from layer data descriptor for Normalization
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
+                                cudnnTensorDescriptor_t derivedNormMeanVarDesc,
+                                const cudnnTensorDescriptor_t xDesc,
+                                cudnnNormMode_t mode,
+                                int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+typedef enum {
+    CUDNN_NORM_OPS_NORM                = 0, /* do normalization only */
+    CUDNN_NORM_OPS_NORM_ACTIVATION     = 1, /* do Norm, then activation */
+    CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
+} cudnnNormOps_t;
+
+/*
+ * Performs Normalization during Inference:
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
+ * above for notes on function arguments.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationForwardInference(cudnnHandle_t handle,
+                                   cudnnNormMode_t mode,
+                                   cudnnNormOps_t normOps,
+                                   cudnnNormAlgo_t algo,
+                                   const void *alpha, /* alpha[0] = result blend factor */
+                                   const void *beta,  /* beta[0] = dest layer blend factor */
+                                   const cudnnTensorDescriptor_t xDesc,
+                                   const void *x, /* NxCxHxW */
+                                   const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                   const void *normScale,
+                                   const void *normBias,
+                                   const cudnnTensorDescriptor_t normMeanVarDesc,
+                                   const void *estimatedMean,
+                                   const void *estimatedVariance,
+                                   const cudnnTensorDescriptor_t zDesc,
+                                   const void *z,
+                                   cudnnActivationDescriptor_t activationDesc,
+                                   const cudnnTensorDescriptor_t yDesc,
+                                   void *y, /* NxCxHxW */
+                                   double epsilon,
+                                   int groupCnt); /* Place hold for future work*/
+
+/* APIs for spatial transformer network*/
+typedef enum {
+    CUDNN_SAMPLER_BILINEAR = 0,
+} cudnnSamplerType_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
+                                       cudnnSamplerType_t samplerType,
+                                       cudnnDataType_t dataType,
+                                       const int nbDims,
+                                       const int dimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
+                                   const cudnnSpatialTransformerDescriptor_t stDesc,
+                                   const void *theta,
+                                   void *grid);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
+                             cudnnSpatialTransformerDescriptor_t stDesc,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *grid,
+                             const void *beta,
+                             cudnnTensorDescriptor_t yDesc,
+                             void *y);
+
+typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
+
+/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
+
+/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float dropout,
+                          void *states,
+                          size_t stateSizeInBytes,
+                          unsigned long long seed);
+
+/* Restores the dropout descriptor to a previously saved-off state */
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                              cudnnHandle_t handle,
+                              float dropout,
+                              void *states,
+                              size_t stateSizeInBytes,
+                              unsigned long long seed);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float *dropout,
+                          void **states,
+                          unsigned long long *seed);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutForward(cudnnHandle_t handle,
+                    const cudnnDropoutDescriptor_t dropoutDesc,
+                    const cudnnTensorDescriptor_t xdesc,
+                    const void *x,
+                    const cudnnTensorDescriptor_t ydesc,
+                    void *y,
+                    void *reserveSpace,
+                    size_t reserveSpaceSizeInBytes);
+
+/* TODO: remove */
+
+typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t;
+typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t;
+
+/* TODO: move these enums out to the appropriate submodule */
+typedef enum {
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM         = 0,
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
+    CUDNN_CONVOLUTION_FWD_ALGO_GEMM                  = 2,
+    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT                = 3,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT                   = 4,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING            = 5,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD              = 6,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED     = 7,
+    CUDNN_CONVOLUTION_FWD_ALGO_COUNT                 = 8
+} cudnnConvolutionFwdAlgo_t;
+
+typedef enum {
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0                 = 0, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 = 1,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT               = 2,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3                 = 3, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD          = 4, /* not implemented */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING        = 6,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT             = 7
+} cudnnConvolutionBwdFilterAlgo_t;
+
+typedef enum {
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0                 = 0, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1                 = 1,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT               = 2,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING        = 3,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD          = 4,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT             = 6
+} cudnnConvolutionBwdDataAlgo_t;
+
+typedef enum {
+    CUDNN_RNN_ALGO_STANDARD               = 0,
+    CUDNN_RNN_ALGO_PERSIST_STATIC         = 1,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC        = 2,
+    CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
+    CUDNN_RNN_ALGO_COUNT                  = 4,
+} cudnnRNNAlgo_t;
+
+typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
+
+/* TODO: remove */
+typedef struct cudnnAlgorithmUnionStruct {
+    union Algorithm {
+        cudnnConvolutionFwdAlgo_t convFwdAlgo;
+        cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo;
+        cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo;
+        cudnnRNNAlgo_t RNNAlgo;
+        cudnnCTCLossAlgo_t CTCLossAlgo;
+    } algo;
+} cudnnAlgorithm_t;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t algoDesc,
+                             cudnnStatus_t status,
+                             float time,
+                             size_t memory);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t *algoDesc,
+                             cudnnStatus_t *status,
+                             float *time,
+                             size_t *memory);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSaveAlgorithm(cudnnHandle_t handle,
+                   cudnnAlgorithmDescriptor_t algoDesc,
+                   void *algoSpace,
+                   size_t algoSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRestoreAlgorithm(cudnnHandle_t handle,
+                      void *algoSpace,
+                      size_t algoSpaceSizeInBytes,
+                      cudnnAlgorithmDescriptor_t algoDesc);
+
+typedef enum {
+    CUDNN_SEV_FATAL   = 0,
+    CUDNN_SEV_ERROR   = 1,
+    CUDNN_SEV_WARNING = 2,
+    CUDNN_SEV_INFO    = 3,
+} cudnnSeverity_t;
+
+/* Message masks to be used with cudnnSetCallback() */
+#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
+#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
+#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
+
+/* struct containing useful informaiton for each API call */
+typedef struct cudnnDebugStruct {
+    unsigned cudnn_version;
+    cudnnStatus_t cudnnStatus;
+    unsigned time_sec;      /* epoch time in seconds */
+    unsigned time_usec;     /* microseconds part of epoch time */
+    unsigned time_delta;    /* time since start in seconds */
+    cudnnHandle_t handle;   /* cudnn handle */
+    cudaStream_t stream;    /* cuda stream ID */
+    unsigned long long pid; /* process ID */
+    unsigned long long tid; /* thread ID */
+    int cudaDeviceId;       /* CUDA device ID */
+    int reserved[15];       /* reserved for future use */
+} cudnnDebug_t;
+
+typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnOpsInferVersionCheck(void);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_OPS_INFER_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h
new file mode 100644
index 0000000000000000000000000000000000000000..b16897b7626ebc9d22fd8031932800eb023e65df
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h
@@ -0,0 +1,501 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_ops_train : cuDNN's basic training operations and algorithms.
+ */
+
+#if !defined(CUDNN_OPS_TRAIN_H_)
+#define CUDNN_OPS_TRAIN_H_
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops_infer.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_OPS_TRAIN_MAJOR 8
+#define CUDNN_OPS_TRAIN_MINOR 7
+#define CUDNN_OPS_TRAIN_PATCH 0
+
+#if (CUDNN_OPS_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_TRAIN_MINOR != CUDNN_MINOR) || \
+    (CUDNN_OPS_TRAIN_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN OPS TRAIN!!!
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Function to perform backward softmax */
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxBackward(cudnnHandle_t handle,
+                     cudnnSoftmaxAlgorithm_t algo,
+                     cudnnSoftmaxMode_t mode,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx);
+
+/* Function to perform backward pooling */
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingBackward(cudnnHandle_t handle,
+                     const cudnnPoolingDescriptor_t poolingDesc,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx);
+
+/* Function to perform backward activation  */
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationBackward(cudnnHandle_t handle,
+                        cudnnActivationDescriptor_t activationDesc,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t yDesc,
+                        const void *y,
+                        const cudnnTensorDescriptor_t dyDesc,
+                        const void *dy,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t dxDesc,
+                        void *dx);
+
+/* LRN cross-channel backward computation. Double parameters cast to tensor data type */
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
+                             cudnnLRNDescriptor_t normDesc,
+                             cudnnLRNMode_t lrnMode,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t yDesc,
+                             const void *y,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
+                                   cudnnLRNDescriptor_t normDesc,
+                                   cudnnDivNormMode_t mode,
+                                   const void *alpha,
+                                   const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
+                                   const void *x,
+                                   const void *means, /* if NULL, means are assumed to be zero */
+                                   const void *dy,
+                                   void *temp,
+                                   void *temp2,
+                                   const void *beta,
+                                   const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
+                                   void *dx,                                   /* output x differential */
+                                   void *dMeans); /* output means differential, can be NULL */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
+                                                         cudnnBatchNormMode_t mode,
+                                                         cudnnBatchNormOps_t bnOps,
+                                                         const cudnnTensorDescriptor_t xDesc,
+                                                         const cudnnTensorDescriptor_t zDesc,
+                                                         const cudnnTensorDescriptor_t yDesc,
+                                                         const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                                         const cudnnActivationDescriptor_t activationDesc,
+                                                         size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
+                                                  cudnnBatchNormMode_t mode,
+                                                  cudnnBatchNormOps_t bnOps,
+                                                  const cudnnTensorDescriptor_t xDesc,
+                                                  const cudnnTensorDescriptor_t yDesc,
+                                                  const cudnnTensorDescriptor_t dyDesc,
+                                                  const cudnnTensorDescriptor_t dzDesc,
+                                                  const cudnnTensorDescriptor_t dxDesc,
+                                                  const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                                  const cudnnActivationDescriptor_t activationDesc,
+                                                  size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
+                                                     cudnnBatchNormMode_t mode,
+                                                     cudnnBatchNormOps_t bnOps,
+                                                     const cudnnActivationDescriptor_t activationDesc,
+                                                     const cudnnTensorDescriptor_t xDesc,
+                                                     size_t *sizeInBytes);
+
+/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTraining(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *x, /* NxCxHxW */
+    const cudnnTensorDescriptor_t yDesc,
+    void *y, /* NxCxHxW */
+
+    /* Shared desc for the next 6 tensors in the argument list.
+       Data type to be set as follows:
+       type = (typeOf(x) == double) ? double : float
+       Dimensions for this descriptor depend on normalization mode
+       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+        (normalization is performed across NxHxW)
+       - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
+        (normalization is performed across N) */
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+
+    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
+    const void *bnScale,
+    const void *bnBias,
+
+    /* MUST use factor=1 in the very first call of a complete training cycle.
+       Use a factor=1/(1+n) at N-th call to the function to get
+       Cumulative Moving Average (CMA) behavior
+       CMA[n] = (x[1]+...+x[n])/n
+       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+    double exponentialAverageFactor,
+
+    /* Used in Training phase only.
+       runningMean = newMean*factor + runningMean*(1-factor) */
+    void *resultRunningMean,
+    /* Output in training mode, input in inference. Is the moving average
+       of  variance[x] (factor is applied in the same way as for runningMean) */
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance);
+
+/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTrainingEx(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+    cudnnBatchNormOps_t bnOps,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *xData,
+    const cudnnTensorDescriptor_t zDesc,
+    const void *zData,
+    const cudnnTensorDescriptor_t yDesc,
+    void *yData,
+
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+    const void *bnScale,
+    const void *bnBias,
+
+    double exponentialAverageFactor,
+    void *resultRunningMean,
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance,
+
+    cudnnActivationDescriptor_t activationDesc,
+    void *workspace,
+    size_t workSpaceSizeInBytes,
+    void *reserveSpace,
+    size_t reserveSpaceSizeInBytes);
+
+/* Performs backward pass of Batch Normalization layer. Returns x gradient,
+* bnScale gradient and bnBias gradient */
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackward(cudnnHandle_t handle,
+                                cudnnBatchNormMode_t mode,
+                                const void *alphaDataDiff,
+                                const void *betaDataDiff,
+                                const void *alphaParamDiff,
+                                const void *betaParamDiff,
+                                const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
+                                const void *x,
+                                const cudnnTensorDescriptor_t dyDesc,
+                                const void *dy,
+                                const cudnnTensorDescriptor_t dxDesc,
+                                void *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                const void *bnScale, /* bnBias doesn't affect backpropagation */
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void *dBnScaleResult,
+                                void *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void *savedMean,
+                                const void *savedInvVariance);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
+                                  cudnnBatchNormMode_t mode,
+                                  cudnnBatchNormOps_t bnOps,
+
+                                  const void *alphaDataDiff,
+                                  const void *betaDataDiff,
+                                  const void *alphaParamDiff,
+                                  const void *betaParamDiff,
+                                  const cudnnTensorDescriptor_t xDesc,
+                                  const void *xData,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  const void *yData,
+                                  const cudnnTensorDescriptor_t dyDesc,
+                                  const void *dyData,
+                                  const cudnnTensorDescriptor_t dzDesc,
+                                  void *dzData,
+                                  const cudnnTensorDescriptor_t dxDesc,
+                                  void *dxData,
+
+                                  /* Shared tensor desc for the 4 tensors below */
+                                  const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                  const void *bnScaleData,
+                                  const void *bnBiasData, /* needed if there is activation */
+                                  void *dBnScaleData,
+                                  void *dBnBiasData,
+                                  double epsilon, /* Same epsilon as forward pass */
+
+                                  /* Optionally cached intermediate results from
+                                     forward pass */
+                                  const void *savedMean,
+                                  const void *savedInvVariance,
+                                  cudnnActivationDescriptor_t activationDesc,
+                                  void *workSpace,
+                                  size_t workSpaceSizeInBytes,
+                                  void *reserveSpace,
+                                  size_t reserveSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
+                                                  cudnnNormMode_t mode,
+                                                  cudnnNormOps_t normOps,
+                                                  cudnnNormAlgo_t algo,
+                                                  const cudnnTensorDescriptor_t xDesc,
+                                                  const cudnnTensorDescriptor_t zDesc,
+                                                  const cudnnTensorDescriptor_t yDesc,
+                                                  const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                                  const cudnnActivationDescriptor_t activationDesc,
+                                                  const cudnnTensorDescriptor_t normMeanVarDesc,
+                                                  size_t *sizeInBytes,
+                                                  int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
+                                           cudnnNormMode_t mode,
+                                           cudnnNormOps_t normOps,
+                                           cudnnNormAlgo_t algo,
+                                           const cudnnTensorDescriptor_t xDesc,
+                                           const cudnnTensorDescriptor_t yDesc,
+                                           const cudnnTensorDescriptor_t dyDesc,
+                                           const cudnnTensorDescriptor_t dzDesc,
+                                           const cudnnTensorDescriptor_t dxDesc,
+                                           const cudnnTensorDescriptor_t dNormScaleBiasDesc,
+                                           const cudnnActivationDescriptor_t activationDesc,
+                                           const cudnnTensorDescriptor_t normMeanVarDesc,
+                                           size_t *sizeInBytes,
+                                           int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
+                                              cudnnNormMode_t mode,
+                                              cudnnNormOps_t normOps,
+                                              cudnnNormAlgo_t algo,
+                                              const cudnnActivationDescriptor_t activationDesc,
+                                              const cudnnTensorDescriptor_t xDesc,
+                                              size_t *sizeInBytes,
+                                              int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
+cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationForwardTraining(cudnnHandle_t handle,
+                                  cudnnNormMode_t mode,
+                                  cudnnNormOps_t normOps,
+                                  cudnnNormAlgo_t algo,
+                                  const void *alpha, /* alpha[0] = result blend factor */
+                                  const void *beta,  /* beta[0] = dest layer blend factor */
+                                  const cudnnTensorDescriptor_t xDesc,
+                                  const void *xData,
+                                  const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                  const void *normScale,
+                                  const void *normBias,
+                                  double exponentialAverageFactor,
+                                  const cudnnTensorDescriptor_t normMeanVarDesc,
+                                  void *resultRunningMean,
+                                  void *resultRunningVariance,
+                                  /* Has to be >= 0. Should be the same in forward and backward functions. */
+                                  double epsilon,
+                                  /* Optionally save intermediate results from the forward pass here
+                                     - can be reused to speed up backward pass. NULL if unused */
+                                  void *resultSaveMean,
+                                  void *resultSaveInvVariance,
+                                  cudnnActivationDescriptor_t activationDesc,
+                                  const cudnnTensorDescriptor_t zDesc,
+                                  const void *zData,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *yData,
+                                  void *workspace,
+                                  size_t workSpaceSizeInBytes,
+                                  void *reserveSpace,
+                                  size_t reserveSpaceSizeInBytes,
+                                  int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationBackward(cudnnHandle_t handle,
+                           cudnnNormMode_t mode,
+                           cudnnNormOps_t normOps,
+                           cudnnNormAlgo_t algo,
+                           const void *alphaDataDiff,
+                           const void *betaDataDiff,
+                           const void *alphaParamDiff,
+                           const void *betaParamDiff,
+                           const cudnnTensorDescriptor_t xDesc,
+                           const void *xData,
+                           const cudnnTensorDescriptor_t yDesc,
+                           const void *yData,
+                           const cudnnTensorDescriptor_t dyDesc,
+                           const void *dyData,
+                           const cudnnTensorDescriptor_t dzDesc,
+                           void *dzData,
+                           const cudnnTensorDescriptor_t dxDesc,
+                           void *dxData,
+                           /* Shared tensor desc for the 4 tensors below */
+                           const cudnnTensorDescriptor_t dNormScaleBiasDesc,
+                           const void *normScaleData,
+                           const void *normBiasData, /* needed if there is activation */
+                           void *dNormScaleData,
+                           void *dNormBiasData,
+                           double epsilon, /* Same epsilon as forward pass */
+                           const cudnnTensorDescriptor_t normMeanVarDesc,
+                           /* Optionally cached intermediate results from
+                              forward pass */
+                           const void *savedMean,
+                           const void *savedInvVariance,
+                           cudnnActivationDescriptor_t activationDesc,
+                           void *workSpace,
+                           size_t workSpaceSizeInBytes,
+                           void *reserveSpace,
+                           size_t reserveSpaceSizeInBytes,
+                           int groupCnt); /* Place hold for future work, should be set to 1 now*/
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
+                                    const cudnnSpatialTransformerDescriptor_t stDesc,
+                                    const void *dgrid,
+                                    void *dtheta);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
+                              cudnnSpatialTransformerDescriptor_t stDesc,
+                              const void *alpha,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const void *x,
+                              const void *beta,
+                              const cudnnTensorDescriptor_t dxDesc,
+                              void *dx,
+                              const void *alphaDgrid,
+                              const cudnnTensorDescriptor_t dyDesc,
+                              const void *dy,
+                              const void *grid,
+                              const void *betaDgrid,
+                              void *dgrid);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutBackward(cudnnHandle_t handle,
+                     const cudnnDropoutDescriptor_t dropoutDesc,
+                     const cudnnTensorDescriptor_t dydesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dxdesc,
+                     void *dx,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes);
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnOpsTrainVersionCheck(void);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_OPS_TRAIN_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ab66649fe4f20f3a1507d880cfd815a311d70d8
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d11c6a2579a7dba4e61dda86be5a2541d7d21b7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h
@@ -0,0 +1,322 @@
+ /* Copyright 2005-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cufft.h
+* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
+*/
+
+#ifndef _CUFFT_H_
+#define _CUFFT_H_
+
+
+#include "cuComplex.h"
+#include "driver_types.h"
+#include "library_types.h"
+
+#ifndef CUFFTAPI
+#ifdef _WIN32
+#define CUFFTAPI __stdcall
+#elif __GNUC__ >= 4
+#define CUFFTAPI __attribute__ ((visibility ("default")))
+#else
+#define CUFFTAPI
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CUFFT_VER_MAJOR 10
+#define CUFFT_VER_MINOR 9
+#define CUFFT_VER_PATCH 0
+#define CUFFT_VER_BUILD 58
+
+// cuFFT library version
+//
+// CUFFT_VERSION / 1000 - major version
+// CUFFT_VERSION / 100 % 100 - minor version
+// CUFFT_VERSION % 100 - patch level
+#define CUFFT_VERSION 10900
+
+// CUFFT API function return values
+typedef enum cufftResult_t {
+  CUFFT_SUCCESS        = 0x0,
+  CUFFT_INVALID_PLAN   = 0x1,
+  CUFFT_ALLOC_FAILED   = 0x2,
+  CUFFT_INVALID_TYPE   = 0x3,
+  CUFFT_INVALID_VALUE  = 0x4,
+  CUFFT_INTERNAL_ERROR = 0x5,
+  CUFFT_EXEC_FAILED    = 0x6,
+  CUFFT_SETUP_FAILED   = 0x7,
+  CUFFT_INVALID_SIZE   = 0x8,
+  CUFFT_UNALIGNED_DATA = 0x9,
+  CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
+  CUFFT_INVALID_DEVICE = 0xB,
+  CUFFT_PARSE_ERROR = 0xC,
+  CUFFT_NO_WORKSPACE = 0xD,
+  CUFFT_NOT_IMPLEMENTED = 0xE,
+  CUFFT_LICENSE_ERROR = 0x0F,
+  CUFFT_NOT_SUPPORTED = 0x10
+
+} cufftResult;
+
+#define MAX_CUFFT_ERROR 0x11
+
+
+// CUFFT defines and supports the following data types
+
+
+// cufftReal is a single-precision, floating-point real data type.
+// cufftDoubleReal is a double-precision, real data type.
+typedef float cufftReal;
+typedef double cufftDoubleReal;
+
+// cufftComplex is a single-precision, floating-point complex data type that
+// consists of interleaved real and imaginary components.
+// cufftDoubleComplex is the double-precision equivalent.
+typedef cuComplex cufftComplex;
+typedef cuDoubleComplex cufftDoubleComplex;
+
+// CUFFT transform directions
+#define CUFFT_FORWARD -1 // Forward FFT
+#define CUFFT_INVERSE  1 // Inverse FFT
+
+// CUFFT supports the following transform types
+typedef enum cufftType_t {
+  CUFFT_R2C = 0x2a,     // Real to Complex (interleaved)
+  CUFFT_C2R = 0x2c,     // Complex (interleaved) to Real
+  CUFFT_C2C = 0x29,     // Complex to Complex, interleaved
+  CUFFT_D2Z = 0x6a,     // Double to Double-Complex
+  CUFFT_Z2D = 0x6c,     // Double-Complex to Double
+  CUFFT_Z2Z = 0x69      // Double-Complex to Double-Complex
+} cufftType;
+
+// CUFFT supports the following data layouts
+typedef enum cufftCompatibility_t {
+    CUFFT_COMPATIBILITY_FFTW_PADDING    = 0x01    // The default value
+} cufftCompatibility;
+
+#define CUFFT_COMPATIBILITY_DEFAULT   CUFFT_COMPATIBILITY_FFTW_PADDING
+
+//
+// structure definition used by the shim between old and new APIs
+//
+#define MAX_SHIM_RANK 3
+
+// cufftHandle is a handle type used to store and access CUFFT plans.
+typedef int cufftHandle;
+
+
+cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
+                                 int nx,
+                                 cufftType type,
+                                 int batch);
+
+cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
+                                 int nx, int ny,
+                                 cufftType type);
+
+cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
+                                 int nx, int ny, int nz,
+                                 cufftType type);
+
+cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
+                                   int rank,
+                                   int *n,
+                                   int *inembed, int istride, int idist,
+                                   int *onembed, int ostride, int odist,
+                                   cufftType type,
+                                   int batch);
+
+cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan,
+                                     int nx,
+                                     cufftType type,
+                                     int batch,
+                                     size_t *workSize);
+
+cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
+                                     int nx, int ny,
+                                     cufftType type,
+                                     size_t *workSize);
+
+cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
+                                     int nx, int ny, int nz,
+                                     cufftType type,
+                                     size_t *workSize);
+
+cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
+                                       int rank,
+                                       int *n,
+                                       int *inembed, int istride, int idist,
+                                       int *onembed, int ostride, int odist,
+                                       cufftType type,
+                                       int batch,
+                                       size_t *workSize);
+
+cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
+                                         int rank,
+                                         long long int *n,
+                                         long long int *inembed,
+                                         long long int istride,
+                                         long long int idist,
+                                         long long int *onembed,
+                                         long long int ostride, long long int odist,
+                                         cufftType type,
+                                         long long int batch,
+                                         size_t * workSize);
+
+cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
+                                        int rank,
+                                        long long int *n,
+                                        long long int *inembed,
+                                        long long int istride, long long int idist,
+                                        long long int *onembed,
+                                        long long int ostride, long long int odist,
+                                        cufftType type,
+                                        long long int batch,
+                                        size_t *workSize);
+
+
+
+
+cufftResult CUFFTAPI cufftEstimate1d(int nx,
+                                     cufftType type,
+                                     int batch,
+                                     size_t *workSize);
+
+cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
+                                     cufftType type,
+                                     size_t *workSize);
+
+cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
+                                     cufftType type,
+                                     size_t *workSize);
+
+cufftResult CUFFTAPI cufftEstimateMany(int rank,
+                                       int *n,
+                                       int *inembed, int istride, int idist,
+                                       int *onembed, int ostride, int odist,
+                                       cufftType type,
+                                       int batch,
+                                       size_t *workSize);
+
+cufftResult CUFFTAPI cufftCreate(cufftHandle * handle);
+
+cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
+                                    int nx,
+                                    cufftType type,
+                                    int batch,
+                                    size_t *workSize );
+
+cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
+                                    int nx, int ny,
+                                    cufftType type,
+                                    size_t *workSize);
+
+cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
+                                    int nx, int ny, int nz,
+                                    cufftType type,
+                                    size_t *workSize);
+
+cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
+                                      int rank, int *n,
+                                      int *inembed, int istride, int idist,
+                                      int *onembed, int ostride, int odist,
+                                      cufftType type, int batch, size_t *workArea);
+
+cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
+
+cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
+
+cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
+
+cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
+                                  cufftComplex *idata,
+                                  cufftComplex *odata,
+                                  int direction);
+
+cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
+                                  cufftReal *idata,
+                                  cufftComplex *odata);
+
+cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
+                                  cufftComplex *idata,
+                                  cufftReal *odata);
+
+cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
+                                  cufftDoubleComplex *idata,
+                                  cufftDoubleComplex *odata,
+                                  int direction);
+
+cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
+                                  cufftDoubleReal *idata,
+                                  cufftDoubleComplex *odata);
+
+cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
+                                  cufftDoubleComplex *idata,
+                                  cufftDoubleReal *odata);
+
+
+// utility functions
+cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
+                                    cudaStream_t stream);
+
+cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
+
+cufftResult CUFFTAPI cufftGetVersion(int *version);
+
+cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
+                                      int *value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CUFFT_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ec44e1cd050b033dfa40719340dfe3c80b3e679
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6eef99ad56755b346739013ba70ac95be304afcb
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e194487a0e2ec02abcb1dd8634c42141a148d84
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h
@@ -0,0 +1,87 @@
+ /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#if !defined(CURANDDISCRETE_H_)
+#define CURANDDISCRETE_H_
+
+struct curandDistributionShift_st {
+    curandDistribution_t probability;
+    curandDistribution_t host_probability;
+    unsigned int shift;
+    unsigned int length;
+    unsigned int host_gen;
+};
+
+struct curandHistogramM2_st {
+    curandHistogramM2V_t V; 
+    curandHistogramM2V_t host_V; 
+    curandHistogramM2K_t K; 
+    curandHistogramM2K_t host_K; 
+    unsigned int host_gen;
+};
+
+
+struct curandDistributionM2Shift_st {
+    curandHistogramM2_t histogram;
+    curandHistogramM2_t host_histogram;
+    unsigned int shift;
+    unsigned int length;
+    unsigned int host_gen;
+};
+
+struct curandDiscreteDistribution_st {
+    curandDiscreteDistribution_t self_host_ptr;
+    curandDistributionM2Shift_t M2;
+    curandDistributionM2Shift_t host_M2;
+    double stddev;
+    double mean;
+    curandMethod_t method;
+    unsigned int host_gen; 
+};
+
+#endif // !defined(CURANDDISCRETE_H_)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac827749840488f66d71f492c14dbec80b57a3d1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h
@@ -0,0 +1,253 @@
+
+ /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+
+#if !defined(CURAND_DISCRETE_H_)
+#define CURAND_DISCRETE_H_
+
+/**
+ * \defgroup DEVICE Device API
+ *
+ * @{
+ */
+
+#ifndef __CUDACC_RTC__
+#include <math.h>
+#endif // __CUDACC_RTC__
+
+#include "curand_mrg32k3a.h"
+#include "curand_mtgp32_kernel.h"
+#include "curand_philox4x32_x.h"
+
+
+template <typename T>
+QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
+    if (discrete_distribution->method == CURAND_M2){
+        return _curand_M2_double(x, discrete_distribution->M2);
+    }
+    return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5);
+}
+
+
+template <typename STATE>
+QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
+    if (discrete_distribution->method == CURAND_M2){
+        return curand_M2_double(state, discrete_distribution->M2);
+    }
+    return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest
+}
+
+template <typename STATE>
+QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
+    if (discrete_distribution->method == CURAND_M2){
+        return curand_M2_double4(state, discrete_distribution->M2);
+    }
+    double4 _res;
+    uint4 result;
+    _res = curand_normal4_double(state);
+    result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest
+    result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest
+    result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + discrete_distribution->mean + 0.5); //Round to nearest
+    result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest
+    return result;
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a XORWOW generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
+ *
+ * Return four single discrete distributed unsigned ints derived from a
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete4(state, discrete_distribution);
+}
+/*
+ * \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
+ *
+ * Re turn a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a MTGP32 generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a Sobol32 generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a Sobol64 generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+/*
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
+ *
+ * Return a single discrete distributed unsigned int derived from a
+ * distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * \param state - Pointer to state to update
+ * \param discrete_distribution - ancillary structure for discrete distribution
+ *
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
+ */
+QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
+{
+    return curand__discrete(state, discrete_distribution);
+}
+
+#endif // !defined(CURAND_DISCRETE_H_)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab8a5a5b78629ba86dbb996a7561b415e90dfb53
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CURAND_MTGP32_H
+#define CURAND_MTGP32_H
+/*
+ * @file curand_mtgp32.h
+ *
+ * @brief Mersenne Twister for Graphic Processors (mtgp32), which
+ * generates 32-bit unsigned integers and single precision floating
+ * point numbers based on IEEE 754 format.
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (Hiroshima University)
+ *
+ */
+/*
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.  All rights reserved.
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and University of Tokyo.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials provided
+ *       with the distribution.
+ *     * Neither the name of the Hiroshima University nor the names of
+ *       its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written
+ *       permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#define MTGPDC_MEXP 11213
+#define MTGPDC_N 351
+#define MTGPDC_FLOOR_2P 256
+#define MTGPDC_CEIL_2P 512
+#define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
+#define MTGP32_STATE_SIZE 1024
+#define MTGP32_STATE_MASK 1023
+#define CURAND_NUM_MTGP32_PARAMS 200
+#define MEXP 11213
+#define THREAD_NUM MTGPDC_FLOOR_2P
+#define LARGE_SIZE (THREAD_NUM * 3)
+#define TBL_SIZE 16
+
+/**
+ * \addtogroup DEVICE Device API
+ *
+ * @{
+ */
+
+/*
+ * \struct MTGP32_PARAMS_FAST_T
+ * MTGP32 parameters.
+ * Some element is redundant to keep structure simple.
+ *
+ * \b pos is a pick up position which is selected to have good
+ * performance on graphic processors.  3 < \b pos < Q, where Q is a
+ * maximum number such that the size of status array - Q is a power of
+ * 2.  For example, when \b mexp is 44497, size of 32-bit status array
+ * is 696, and Q is 184, then \b pos is between 4 and 183. This means
+ * 512 parallel calculations is allowed when \b mexp is 44497.
+ *
+ * \b poly_sha1 is SHA1 digest of the characteristic polynomial of
+ * state transition function. SHA1 is calculated based on printing
+ * form of the polynomial. This is important when we use parameters
+ * generated by the dynamic creator which
+ *
+ * \b mask This is a mask to make the dimension of state space have
+ * just Mersenne Prime. This is redundant.
+ */
+
+struct mtgp32_params_fast;
+
+struct mtgp32_params_fast {
+    int mexp;			/*< Mersenne exponent. This is redundant. */
+    int pos;			/*< pick up position. */
+    int sh1;			/*< shift value 1. 0 < sh1 < 32. */
+    int sh2;			/*< shift value 2. 0 < sh2 < 32. */
+    unsigned int tbl[16];		/*< a small matrix. */
+    unsigned int tmp_tbl[16];	/*< a small matrix for tempering. */
+    unsigned int flt_tmp_tbl[16];	/*< a small matrix for tempering and
+                 converting to float. */
+    unsigned int mask;		/*< This is a mask for state space */
+    unsigned char poly_sha1[21]; /*< SHA1 digest */
+};
+
+/** \cond UNHIDE_TYPEDEFS */
+typedef struct mtgp32_params_fast mtgp32_params_fast_t;
+/** \endcond */
+
+/*
+ * Generator Parameters.
+ */
+struct mtgp32_kernel_params;
+struct mtgp32_kernel_params {
+    unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS];
+    unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
+    unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
+    unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
+    unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS];
+    unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS];
+    unsigned int mask[1];
+};
+
+/** \cond UNHIDE_TYPEDEFS */
+typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
+/** \endcond */
+
+
+
+/*
+ * kernel I/O
+ * This structure must be initialized before first use.
+ */
+
+/* MTGP (Mersenne Twister) RNG */
+/* This generator uses the Mersenne Twister algorithm of
+ * http://arxiv.org/abs/1005.4973v2
+ * Has period 2^11213.
+*/
+
+/**
+ * CURAND MTGP32 state 
+ */
+struct curandStateMtgp32;
+
+struct curandStateMtgp32 {
+    unsigned int s[MTGP32_STATE_SIZE];
+    int offset;
+    int pIdx;
+    mtgp32_kernel_params_t * k;
+};
+
+/*
+ * CURAND MTGP32 state 
+ */
+/** \cond UNHIDE_TYPEDEFS */
+typedef struct curandStateMtgp32 curandStateMtgp32_t;
+/** \endcond */
+
+/** @} */
+
+#endif
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe8ad5fe1fd3078a5f7a4bc5bcf872c68792d07c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * curand_mtgp32_kernel.h
+ *
+ *
+ * MTGP32-11213
+ *
+ * Mersenne Twister RNG for the GPU
+ *
+ * The period of generated integers is 2<sup>11213</sup>-1.
+ *
+ * This code generates 32-bit unsigned integers, and
+ * single precision floating point numbers uniformly distributed
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
+ */
+
+/*
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.  All rights reserved.
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and University of Tokyo.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials provided
+ *       with the distribution.
+ *     * Neither the name of the Hiroshima University nor the names of
+ *       its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#if !defined CURAND_MTGP32_KERNEL_H
+#define CURAND_MTGP32_KERNEL_H
+
+#if !defined(QUALIFIERS)
+#define QUALIFIERS static __forceinline__ __device__
+#endif
+
+#ifndef __CUDACC_RTC__
+#include <cuda.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <string.h>
+#endif // ifndef __CUDACC_RTC__
+#include "curand.h"
+#include "curand_mtgp32.h"
+
+/**
+ * \addtogroup DEVICE Device API
+ *
+ * @{
+ */
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host compatibility call
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+
+/*
+ * The function of the recursion formula calculation.
+ *
+ * @param[in] X1 the farthest part of state array.
+ * @param[in] X2 the second farthest part of state array.
+ * @param[in] Y a part of state array.
+ * @param[in] bid block id.
+ * @return output
+ */
+QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
+    unsigned int X = (X1 & k->mask[0]) ^ X2;
+    unsigned int MAT;
+
+    X ^= X << k->sh1_tbl[bid];
+    Y = X ^ (Y >> k->sh2_tbl[bid]);
+    MAT = k->param_tbl[bid][Y & 0x0f];
+    return Y ^ MAT;
+}
+
+/*
+ * The tempering function.
+ *
+ * @param[in] V the output value should be tempered.
+ * @param[in] T the tempering helper value.
+ * @param[in] bid block id.
+ * @return the tempered value.
+ */
+QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
+    unsigned int MAT;
+
+    T ^= T >> 16;
+    T ^= T >> 8;
+    MAT = k->temper_tbl[bid][T & 0x0f];
+    return V ^ MAT;
+}
+
+/*
+ * The tempering and converting function.
+ * By using the preset table, converting to IEEE format
+ * and tempering are done simultaneously.
+ *
+ * @param[in] V the output value should be tempered.
+ * @param[in] T the tempering helper value.
+ * @param[in] bid block id.
+ * @return the tempered and converted value.
+ */
+QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
+    unsigned int MAT;
+    unsigned int r;
+
+    T ^= T >> 16;
+    T ^= T >> 8;
+    MAT = k->single_temper_tbl[bid][T & 0x0f];
+    r = (V >> 9) ^ MAT;
+    return r;
+}
+
+/**
+ * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
+ *
+ * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
+ * increment position of generator by the number of threads in the block.
+ * Note the number of threads in the block can not exceed 256.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
+ */
+QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
+{
+    unsigned int t;
+    unsigned int d;
+    int pos = state->k->pos_tbl[state->pIdx];
+    unsigned int r;
+    unsigned int o;
+
+    d = blockDim.z * blockDim.y * blockDim.x;
+    //assert( d <= 256 );
+    t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
+    r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
+             state->pIdx);
+
+    state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
+    o = temper(state->k, r,
+           state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
+           state->pIdx);
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    if (t == 0)
+    {
+        state->offset = (state->offset + d) & MTGP32_STATE_MASK;
+    }
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    return o;
+
+}
+/**
+ * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
+ *
+ * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
+ * increment position of generator by \p n positions, which must be the total number of positions
+ * upddated in the state by the thread block, for this invocation.
+ *
+ * Note :
+ * Thread indices must range from 0...\ n - 1.
+ * The number of positions updated may not exceed 256.
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
+ *
+ * \param state - Pointer to state to update
+ * \param index - Index (0..255) of the position within the state to draw from and update
+ * \param n - The total number of postions in this state that are being updated by this invocation
+ *
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
+ */
+QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
+{
+    unsigned int t;
+    int pos = state->k->pos_tbl[state->pIdx];
+    unsigned int r;
+    unsigned int o;
+
+    t = index;
+    r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
+             state->pIdx);
+
+    state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
+    o = temper(state->k, r,
+           state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
+           state->pIdx);
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    if (index == 0)
+    {
+        state->offset = (state->offset + n) & MTGP32_STATE_MASK;
+    }
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    return o;
+}
+/**
+ * \brief Return a uniformly distributed float from a mtgp32 generator.
+ *
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
+ * from the mtgp32 generator in \p state, increment position of generator.
+ * Output range excludes \p 0.0f but includes \p 1.0f.  Denormalized floating
+ * point outputs are never returned.
+ *
+ * Note: This alternate derivation of a uniform float is provided for completeness
+ * with the original source
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
+ */
+QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
+{
+    unsigned int t;
+    unsigned int d;
+    int pos = state->k->pos_tbl[state->pIdx];
+    unsigned int r;
+    unsigned int o_u;
+    float o_f;
+
+
+    t = blockDim.z * blockDim.y;
+    d = t * blockDim.x;
+    //assert( d <= 256 );
+    t += threadIdx.x;
+    r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
+             state->pIdx);
+
+    state->s[t] = r;
+    o_u = temper_single(state->k, r,
+                        state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
+                        state->pIdx);
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    if (threadIdx.x == 0)
+    {
+        state->offset = (state->offset + d) & MTGP32_STATE_MASK;
+    }
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    memcpy(&o_f, &o_u, sizeof(o_u));
+    return o_f;
+}
+
+/**
+ * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
+ *
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
+ * from position \p index of the mtgp32 generator in \p state, and
+ * increment position of generator by \p n positions, which must be the total number of positions
+ * upddated in the state by the thread block, for this invocation.
+ * Output range excludes \p 0.0f but includes \p 1.0f.  Denormalized floating
+ * point outputs are never returned.
+ *
+ * Note 1:
+ * Thread indices must range from 0...\p n - 1.
+ * The number of positions updated may not exceed 256.
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
+ *
+ * Note 2: This alternate derivation of a uniform float is provided for completeness
+ * with the original source
+ *
+ * \param state - Pointer to state to update
+ * \param index - Index (0..255) of the position within the state to draw from and update
+ * \param n - The total number of postions in this state that are being updated by this invocation
+ *
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
+ */
+QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
+{
+    unsigned int t;
+    int pos = state->k->pos_tbl[state->pIdx];
+    unsigned int r;
+    unsigned int o_u;
+    float o_f;
+
+    t = index;
+    r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
+             state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
+             state->pIdx);
+
+    state->s[t] = r;
+    o_u = temper_single(state->k, r,
+                        state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
+                        state->pIdx);
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    if (threadIdx.x == 0)
+    {
+        state->offset = (state->offset + n) & MTGP32_STATE_MASK;
+    }
+#if __CUDA_ARCH__ != 0
+    __syncthreads();
+#endif
+    memcpy(&o_f, &o_u, sizeof(o_u));
+    return o_f;
+}
+
+/** @} */
+
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h
new file mode 100644
index 0000000000000000000000000000000000000000..193cfc5acb0abc446e287097c048bea02675103a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h
@@ -0,0 +1,837 @@
+
+ /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+
+#if !defined(CURAND_NORMAL_H_)
+#define CURAND_NORMAL_H_
+
+/**
+ * \defgroup DEVICE Device API
+ *
+ * @{
+ */
+
+#ifndef __CUDACC_RTC__
+#include <math.h>
+#endif // __CUDACC_RTC__
+
+#include "curand_mrg32k3a.h"
+#include "curand_mtgp32_kernel.h"
+#include "curand_philox4x32_x.h"
+#include "curand_normal_static.h"
+
+QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
+{
+    float2 result;
+    float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2);
+    float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2);
+#if __CUDA_ARCH__ > 0
+    float s = sqrtf(-2.0f * logf(u));
+    __sincosf(v, &result.x, &result.y);
+#else
+    float s = sqrtf(-2.0f * logf(u));
+    result.x = sinf(v);
+    result.y = cosf(v);
+#endif
+    result.x *= s;
+    result.y *= s;
+    return result;
+}
+
+QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
+{
+    float x, y;
+    x = curand_uniform(state);
+    y = curand_uniform(state) * CURAND_2PI;
+    float2 result;
+#if __CUDA_ARCH__ > 0
+    float s = sqrtf(-2.0f * logf(x));
+    __sincosf(y, &result.x, &result.y);
+#else
+    float s = sqrtf(-2.0f * logf(x));
+    result.x = sinf(y);
+    result.y = cosf(y);
+#endif
+    result.x *= s;
+    result.y *= s;
+    return result;
+}
+
+QUALIFIERS double2
+_curand_box_muller_double(unsigned int x0, unsigned int x1,
+                          unsigned int y0, unsigned int y1)
+{
+    double2 result;
+    unsigned long long zx = (unsigned long long)x0 ^
+        ((unsigned long long)x1 << (53 - 32));
+    double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
+    unsigned long long zy = (unsigned long long)y0 ^
+        ((unsigned long long)y1 << (53 - 32));
+    double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) + CURAND_2POW53_INV_DOUBLE;
+    double s = sqrt(-2.0 * log(u));
+
+#if __CUDA_ARCH__ > 0
+    sincospi(v, &result.x, &result.y);
+#else
+    result.x = sin(v*CURAND_PI_DOUBLE);
+    result.y = cos(v*CURAND_PI_DOUBLE);
+#endif
+    result.x *= s;
+    result.y *= s;
+
+    return result;
+}
+
+QUALIFIERS double2
+curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
+{
+    double x, y;
+    double2 result;
+    x = curand_uniform_double(state);
+    y = curand_uniform_double(state) * 2.0;
+
+    double s = sqrt(-2.0 * log(x));
+#if __CUDA_ARCH__ > 0
+    sincospi(y, &result.x, &result.y);
+#else
+    result.x = sin(y*CURAND_PI_DOUBLE);
+    result.y = cos(y*CURAND_PI_DOUBLE);
+#endif
+    result.x *= s;
+    result.y *= s;
+    return result;
+}
+
+template <typename R>
+QUALIFIERS float2 curand_box_muller(R *state)
+{
+    float2 result;
+    unsigned int x = curand(state);
+    unsigned int y = curand(state);
+    result = _curand_box_muller(x, y);
+    return result;
+}
+
+template <typename R>
+QUALIFIERS float4 curand_box_muller4(R *state)
+{
+    float4 result;
+    float2 _result;
+    uint4 x = curand4(state);
+    //unsigned int y = curand(state);
+    _result = _curand_box_muller(x.x, x.y);
+    result.x = _result.x;
+    result.y = _result.y;
+    _result = _curand_box_muller(x.z, x.w);
+    result.z = _result.x;
+    result.w = _result.y;
+    return result;
+}
+
+template <typename R>
+QUALIFIERS double2 curand_box_muller_double(R *state)
+{
+    double2 result;
+    unsigned int x0 = curand(state);
+    unsigned int x1 = curand(state);
+    unsigned int y0 = curand(state);
+    unsigned int y1 = curand(state);
+    result = _curand_box_muller_double(x0, x1, y0, y1);
+    return result;
+}
+
+template <typename R>
+QUALIFIERS double2 curand_box_muller2_double(R *state)
+{
+    double2 result;
+    uint4 _x;
+    _x = curand4(state);
+    result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
+    return result;
+}
+
+
+template <typename R>
+QUALIFIERS double4 curand_box_muller4_double(R *state)
+{
+    double4 result;
+    double2 _res1;
+    double2 _res2;
+    uint4 _x;
+    uint4 _y;
+    _x = curand4(state);
+    _y = curand4(state);
+    _res1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
+    _res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
+    result.x = _res1.x;
+    result.y = _res1.y;
+    result.z = _res2.x;
+    result.w = _res2.y;
+    return result;
+}
+
+//QUALIFIERS float _curand_normal_icdf(unsigned int x)
+//{
+//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+//    float s = CURAND_SQRT2;
+//    // Mirror to avoid loss of precision
+//    if(x > 0x80000000UL) {
+//        x = 0xffffffffUL - x;
+//        s = -s;
+//    }
+//    float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
+//    // p is in (0, 0.5], 2p is in (0, 1]
+//    return s * erfcinvf(2.0f * p);
+//#else
+//    x++;    //suppress warnings
+//    return 0.0f;
+//#endif
+//}
+//
+//QUALIFIERS float _curand_normal_icdf(unsigned long long x)
+//{
+//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+//    unsigned int t = (unsigned int)(x >> 32);
+//    float s = CURAND_SQRT2;
+//    // Mirror to avoid loss of precision
+//    if(t > 0x80000000UL) {
+//        t = 0xffffffffUL - t;
+//        s = -s;
+//    }
+//    float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
+//    // p is in (0, 0.5], 2p is in (0, 1]
+//    return s * erfcinvf(2.0f * p);
+//#else
+//    x++;
+//    return 0.0f;
+//#endif
+//}
+//
+//QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
+//{
+//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+//    double s = CURAND_SQRT2_DOUBLE;
+//    // Mirror to avoid loss of precision
+//    if(x > 0x80000000UL) {
+//        x = 0xffffffffUL - x;
+//        s = -s;
+//    }
+//    double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
+//    // p is in (0, 0.5], 2p is in (0, 1]
+//    return s * erfcinv(2.0 * p);
+//#else
+//    x++;
+//    return 0.0;
+//#endif
+//}
+//
+//QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
+//{
+//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+//    double s = CURAND_SQRT2_DOUBLE;
+//    x >>= 11;
+//    // Mirror to avoid loss of precision
+//    if(x > 0x10000000000000UL) {
+//        x = 0x1fffffffffffffUL - x;
+//        s = -s;
+//    }
+//    double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
+//    // p is in (0, 0.5], 2p is in (0, 1]
+//    return s * erfcinv(2.0 * p);
+//#else
+//    x++;
+//    return 0.0;
+//#endif
+//}
+//
+
+/**
+ * \brief Return a normally distributed float from an XORWOW generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, then returns them one at a time.
+ * See ::curand_normal2() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
+{
+    if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
+        unsigned int x, y;
+        x = curand(state);
+        y = curand(state);
+        float2 v = _curand_box_muller(x, y);
+        state->boxmuller_extra = v.y;
+        state->boxmuller_flag = EXTRA_FLAG_NORMAL;
+        return v.x;
+    }
+    state->boxmuller_flag = 0;
+    return state->boxmuller_extra;
+}
+
+/**
+ * \brief Return a normally distributed float from an Philox4_32_10 generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, then returns them one at a time.
+ * See ::curand_normal2() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+
+QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
+{
+    if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
+        unsigned int x, y;
+        x = curand(state);
+        y = curand(state);
+        float2 v = _curand_box_muller(x, y);
+        state->boxmuller_extra = v.y;
+        state->boxmuller_flag = EXTRA_FLAG_NORMAL;
+        return v.x;
+    }
+    state->boxmuller_flag = 0;
+    return state->boxmuller_extra;
+}
+
+
+
+/**
+ * \brief Return a normally distributed float from an MRG32k3a generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, then returns them one at a time.
+ * See ::curand_normal2() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
+{
+    if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
+        float2 v = curand_box_muller_mrg(state);
+        state->boxmuller_extra = v.y;
+        state->boxmuller_flag = EXTRA_FLAG_NORMAL;
+        return v.x;
+    }
+    state->boxmuller_flag = 0;
+    return state->boxmuller_extra;
+}
+
+/**
+ * \brief Return two normally distributed floats from an XORWOW generator.
+ *
+ * Return two normally distributed floats with mean \p 0.0f and
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
+ * increment position of generator by two.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float2 where each element is from a
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
+{
+    return curand_box_muller(state);
+}
+/**
+ * \brief Return two normally distributed floats from an Philox4_32_10 generator.
+ *
+ * Return two normally distributed floats with mean \p 0.0f and
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
+ * increment position of generator by two.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float2 where each element is from a
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
+{
+    return curand_box_muller(state);
+}
+
+/**
+ * \brief Return four normally distributed floats from an Philox4_32_10 generator.
+ *
+ * Return four normally distributed floats with mean \p 0.0f and
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
+ * increment position of generator by four.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float2 where each element is from a
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
+{
+    return curand_box_muller4(state);
+}
+
+
+
+/**
+ * \brief Return two normally distributed floats from an MRG32k3a generator.
+ *
+ * Return two normally distributed floats with mean \p 0.0f and
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
+ * increment position of generator by two.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float2 where each element is from a
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
+{
+    return curand_box_muller_mrg(state);
+}
+
+/**
+ * \brief Return a normally distributed float from a MTGP32 generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the MTGP32 generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
+{
+    return _curand_normal_icdf(curand(state));
+}
+/**
+ * \brief Return a normally distributed float from a Sobol32 generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateSobol32_t *state)
+{
+    return _curand_normal_icdf(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed float from a scrambled Sobol32 generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
+{
+    return _curand_normal_icdf(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed float from a Sobol64 generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateSobol64_t *state)
+{
+    return _curand_normal_icdf(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed float from a scrambled Sobol64 generator.
+ *
+ * Return a single normally distributed float with mean \p 0.0f and
+ * standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
+ */
+QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
+{
+    return _curand_normal_icdf(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed double from an XORWOW generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, then returns them one at a time.
+ * See ::curand_normal2_double() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
+{
+    if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
+        unsigned int x0, x1, y0, y1;
+        x0 = curand(state);
+        x1 = curand(state);
+        y0 = curand(state);
+        y1 = curand(state);
+        double2 v = _curand_box_muller_double(x0, x1, y0, y1);
+        state->boxmuller_extra_double = v.y;
+        state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
+        return v.x;
+    }
+    state->boxmuller_flag_double = 0;
+    return state->boxmuller_extra_double;
+}
+
+/**
+ * \brief Return a normally distributed double from an Philox4_32_10 generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, then returns them one at a time.
+ * See ::curand_normal2_double() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+
+QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
+{
+    if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
+        uint4 _x;
+        _x = curand4(state);
+        double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
+        state->boxmuller_extra_double = v.y;
+        state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
+        return v.x;
+    }
+    state->boxmuller_flag_double = 0;
+    return state->boxmuller_extra_double;
+}
+
+
+/**
+ * \brief Return a normally distributed double from an MRG32k3a generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, then returns them one at a time.
+ * See ::curand_normal2_double() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
+{
+    if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
+        double2 v = curand_box_muller_mrg_double(state);
+        state->boxmuller_extra_double = v.y;
+        state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
+        return v.x;
+    }
+    state->boxmuller_flag_double = 0;
+    return state->boxmuller_extra_double;
+}
+
+/**
+ * \brief Return two normally distributed doubles from an XORWOW generator.
+ *
+ * Return two normally distributed doubles with mean \p 0.0 and
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
+ * increment position of generator by 2.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double2 where each element is from a
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
+{
+    return curand_box_muller_double(state);
+}
+
+/**
+ * \brief Return two normally distributed doubles from an Philox4_32_10 generator.
+ *
+ * Return two normally distributed doubles with mean \p 0.0 and
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
+ * increment position of generator by 2.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double2 where each element is from a
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
+{
+    uint4 _x;
+    double2 result;
+
+    _x = curand4(state);
+    double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
+    result.x = v1.x;
+    result.y = v1.y;
+
+    return result;
+}
+
+ // not a part of API
+QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
+{
+    uint4 _x;
+    uint4 _y;
+    double4 result;
+
+    _x = curand4(state);
+    _y = curand4(state);
+    double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
+    double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
+    result.x = v1.x;
+    result.y = v1.y;
+    result.z = v2.x;
+    result.w = v2.y;
+
+    return result;
+}
+
+
+/**
+ * \brief Return two normally distributed doubles from an MRG32k3a generator.
+ *
+ * Return two normally distributed doubles with mean \p 0.0 and
+ * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double2 where each element is from a
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
+{
+    return curand_box_muller_mrg_double(state);
+}
+
+/**
+ * \brief Return a normally distributed double from an MTGP32 generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the MTGP32 generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
+{
+    return _curand_normal_icdf_double(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed double from an Sobol32 generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
+{
+    return _curand_normal_icdf_double(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed double from a scrambled Sobol32 generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
+{
+    return _curand_normal_icdf_double(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed double from a Sobol64 generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
+{
+    return _curand_normal_icdf_double(curand(state));
+}
+
+/**
+ * \brief Return a normally distributed double from a scrambled Sobol64 generator.
+ *
+ * Return a single normally distributed double with mean \p 0.0 and
+ * standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results.
+ *
+ * \param state - Pointer to state to update
+ *
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
+ */
+QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
+{
+    return _curand_normal_icdf_double(curand(state));
+}
+#endif // !defined(CURAND_NORMAL_H_)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c892505e17734560543bbfbd47aa535fade55899
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8376b63625d3d9b2f14e371a45bdb6fe8e2b84a
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdc6bd4c72d74ef0f1fe5c2c8b4c49b196927fa7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h
@@ -0,0 +1,469 @@
+/* This file was procedurally generated!  Do not modify this file by hand.  */
+
+/*
+* Copyright 2009-2016  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+
+#ifndef NVTX_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+/* ---- Include required platform headers ---- */
+
+#if defined(_WIN32) 
+
+#include <Windows.h>
+
+#else
+#include <unistd.h>
+
+#if defined(__ANDROID__)
+#include <android/api-level.h> 
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <sched.h>
+#endif
+
+#include <limits.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <string.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#endif
+
+/* ---- Define macros used in this file ---- */
+
+#define NVTX_INIT_STATE_FRESH 0
+#define NVTX_INIT_STATE_STARTED 1
+#define NVTX_INIT_STATE_COMPLETE 2
+
+#ifdef NVTX_DEBUG_PRINT
+#ifdef __ANDROID__
+#include <android/log.h>
+#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
+#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
+#else
+#include <stdio.h>
+#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
+#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
+#endif
+#else /* !defined(NVTX_DEBUG_PRINT) */
+#define NVTX_ERR(...)
+#define NVTX_INFO(...)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+
+/* ---- Forward declare all functions referenced in globals ---- */
+
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
+    NvtxCallbackModule module,
+    NvtxFunctionTable* out_table,
+    unsigned int* out_size);
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
+    uint32_t version);
+NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
+    uint32_t exportTableId);
+
+#include "nvtxInitDecls.h"
+
+/* ---- Define all globals ---- */
+
+typedef struct nvtxGlobals_t
+{
+    volatile unsigned int initState;
+    NvtxExportTableCallbacks etblCallbacks;
+    NvtxExportTableVersionInfo etblVersionInfo;
+
+    /* Implementation function pointers */
+    nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
+    nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
+    nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
+    nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
+    nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
+    nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
+    nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
+    nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
+    nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
+    nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
+    nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
+    nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
+    nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
+    nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
+    nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
+
+    nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
+    nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
+    nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
+    nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
+    nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
+    nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
+    nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
+    nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
+
+    nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
+    nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
+    nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
+    nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
+    nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
+    nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
+    nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
+    nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
+    nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
+    nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
+    nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
+    nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
+    nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
+    nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
+
+    nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
+    nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
+    nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
+    nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
+    nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
+    nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
+
+    nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
+    nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
+    nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
+    nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
+    nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
+    nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
+    nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
+    nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
+    nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
+    nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
+    nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
+    nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
+    nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
+    nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
+    nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
+
+    nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
+    nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
+    nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
+    nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
+    nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
+    nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
+
+    /* Tables of function pointers -- Extra null added to the end to ensure
+    *  a crash instead of silent corruption if a tool reads off the end. */
+    NvtxFunctionPointer* functionTable_CORE  [NVTX_CBID_CORE_SIZE   + 1];
+    NvtxFunctionPointer* functionTable_CUDA  [NVTX_CBID_CUDA_SIZE   + 1];
+    NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
+    NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
+    NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE  + 1];
+    NvtxFunctionPointer* functionTable_SYNC  [NVTX_CBID_SYNC_SIZE   + 1];
+} nvtxGlobals_t;
+
+NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
+{
+    NVTX_INIT_STATE_FRESH,
+
+    {
+        sizeof(NvtxExportTableCallbacks),
+        NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
+    },
+    {
+        sizeof(NvtxExportTableVersionInfo),
+        NVTX_VERSION,
+        0,
+        NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
+    },
+
+    /* Implementation function pointers */
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
+    NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
+
+    /* Tables of function pointers */
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
+        0
+    },
+    {
+        0,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
+        (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
+        0
+    }
+};
+
+/* ---- Define static inline implementations of core API functions ---- */
+
+#include "nvtxImplCore.h"
+
+/* ---- Define implementations of export table functions ---- */
+
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
+    NvtxCallbackModule module,
+    NvtxFunctionTable* out_table,
+    unsigned int* out_size)
+{
+    unsigned int bytes = 0;
+    NvtxFunctionTable table = (NvtxFunctionTable)0;
+
+    switch (module)
+    {
+    case NVTX_CB_MODULE_CORE:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
+        break;
+    case NVTX_CB_MODULE_CUDA:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
+        break;
+    case NVTX_CB_MODULE_OPENCL:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
+        break;
+    case NVTX_CB_MODULE_CUDART:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
+        break;
+    case NVTX_CB_MODULE_CORE2:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
+        break;
+    case NVTX_CB_MODULE_SYNC:
+        table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
+        bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
+        break;
+    default: return 0;
+    }
+
+    if (out_size)
+        *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
+
+    if (out_table)
+        *out_table = table;
+
+    return 1;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
+{
+    switch (exportTableId)
+    {
+    case NVTX_ETID_CALLBACKS:       return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
+    case NVTX_ETID_VERSIONINFO:     return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
+    default:                        return 0;
+    }
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
+{
+    /* Reserved for custom implementations to resolve problems with tools */
+    (void)version;
+}
+
+/* ---- Define implementations of init versions of all API functions ---- */
+
+#include "nvtxInitDefs.h"
+
+/* ---- Define implementations of initialization functions ---- */
+
+#include "nvtxInit.h"
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ce3282d395d3c5d71d173eae68b324d59e90d2f
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1
new file mode 100644
index 0000000000000000000000000000000000000000..4338353fb6b12c216bd3c4f3919230931aab4aeb
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cufft_cu11-10.9.0.58.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cufft_cu11-10.9.0.58.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cufft_cu11-10.9.0.58.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD
new file mode 100644
index 0000000000000000000000000000000000000000..680500f28b6b1cb9f9bf1b1c350711563f34e50f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD
@@ -0,0 +1,22 @@
+nvidia/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cusolver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/cusolver/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cusolver/include/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cusolver/include/cusolverDn.h,sha256=8KUcqUxWPr8jpz3ZVpTB6I3IXMme1ok7E7vi9XXKRzk,147406
+nvidia/cusolver/include/cusolverMg.h,sha256=N8989nnS2BleeMyuftbQgBDJ4sMAkLPSnmy_S_7fxng,11549
+nvidia/cusolver/include/cusolverRf.h,sha256=7BZfWeuMJ8w1Pz4iZeGmwvDZbDNNq0ivG5MHtiATtls,14292
+nvidia/cusolver/include/cusolverSp.h,sha256=8fev0XawDBd0xrOxUlQ3WhclKlUuVAT64zKxwnP8iT0,32561
+nvidia/cusolver/include/cusolverSp_LOWLEVEL_PREVIEW.h,sha256=rTuS0rxwGV3bAz50ua59WVPQ9SvlijORj732oPejoCk,37495
+nvidia/cusolver/include/cusolver_common.h,sha256=oTUxSnGYIrH8ESAV-s0BgSSMyq5u9hsPtFcCbODMRM4,8825
+nvidia/cusolver/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nvidia/cusolver/lib/__pycache__/__init__.cpython-311.pyc,,
+nvidia/cusolver/lib/libcusolver.so.11,sha256=EugAsYUnRSYliOioPoeYlG3NtytSNUvnLJTjkkwNYIg,302736312
+nvidia/cusolver/lib/libcusolverMg.so.11,sha256=XQ8dThn3AEDlKEn-5-Q_ncv3R47DlpQpReMymoa4dXU,184721832
+nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+nvidia_cusolver_cu11-11.4.1.48.dist-info/License.txt,sha256=rW9YU_ugyg0VnQ9Y1JrkmDDC-Mk_epJki5zpCttMbM0,59262
+nvidia_cusolver_cu11-11.4.1.48.dist-info/METADATA,sha256=9ErGf3O-O5QWTmfl8S-pAEt1JQvSEDHuBS8Aq81nfs4,1552
+nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD,,
+nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL,sha256=-kQi_VMfvRQozZJT7HUPMfY-5vLo0LVTmAylNJ3Ft98,106
+nvidia_cusolver_cu11-11.4.1.48.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL
new file mode 100644
index 0000000000000000000000000000000000000000..06e355fe0e3ed7077903f119ae6928a17da8eb6f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.37.1)
+Root-Is-Purelib: true
+Tag: py3-none-manylinux1_x86_64
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging-24.2.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging-24.2.dist-info/METADATA
new file mode 100644
index 0000000000000000000000000000000000000000..1479c8694bfbd583a896dbe9bd33cdb6d7e7371e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging-24.2.dist-info/METADATA
@@ -0,0 +1,102 @@
+Metadata-Version: 2.3
+Name: packaging
+Version: 24.2
+Summary: Core utilities for Python packages
+Author-email: Donald Stufft <donald@stufft.io>
+Requires-Python: >=3.8
+Description-Content-Type: text/x-rst
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Typing :: Typed
+Project-URL: Documentation, https://packaging.pypa.io/
+Project-URL: Source, https://github.com/pypa/packaging
+
+packaging
+=========
+
+.. start-intro
+
+Reusable core utilities for various Python Packaging
+`interoperability specifications <https://packaging.python.org/specifications/>`_.
+
+This library provides utilities that implement the interoperability
+specifications which have clearly one correct behaviour (eg: :pep:`440`)
+or benefit greatly from having a single shared implementation (eg: :pep:`425`).
+
+.. end-intro
+
+The ``packaging`` project includes the following: version handling, specifiers,
+markers, requirements, tags, utilities.
+
+Documentation
+-------------
+
+The `documentation`_ provides information and the API for the following:
+
+- Version Handling
+- Specifiers
+- Markers
+- Requirements
+- Tags
+- Utilities
+
+Installation
+------------
+
+Use ``pip`` to install these utilities::
+
+    pip install packaging
+
+The ``packaging`` library uses calendar-based versioning (``YY.N``).
+
+Discussion
+----------
+
+If you run into bugs, you can file them in our `issue tracker`_.
+
+You can also join ``#pypa`` on Freenode to ask questions or get involved.
+
+
+.. _`documentation`: https://packaging.pypa.io/
+.. _`issue tracker`: https://github.com/pypa/packaging/issues
+
+
+Code of Conduct
+---------------
+
+Everyone interacting in the packaging project's codebases, issue trackers, chat
+rooms, and mailing lists is expected to follow the `PSF Code of Conduct`_.
+
+.. _PSF Code of Conduct: https://github.com/pypa/.github/blob/main/CODE_OF_CONDUCT.md
+
+Contributing
+------------
+
+The ``CONTRIBUTING.rst`` file outlines how to contribute to this project as
+well as how to report a potential security issue. The documentation for this
+project also covers information about `project development`_ and `security`_.
+
+.. _`project development`: https://packaging.pypa.io/en/latest/development/
+.. _`security`: https://packaging.pypa.io/en/latest/security/
+
+Project History
+---------------
+
+Please review the ``CHANGELOG.rst`` file or the `Changelog documentation`_ for
+recent changes and project history.
+
+.. _`Changelog documentation`: https://packaging.pypa.io/en/latest/changelog/
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Tools.cmake b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Tools.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..bed5e08039abfffb59f1861573a6b835fc4d5a99
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Tools.cmake
@@ -0,0 +1,239 @@
+# tools/pybind11Tools.cmake -- Build system for the pybind11 modules
+#
+# Copyright (c) 2020 Wenzel Jakob <wenzel.jakob@epfl.ch>
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+
+# include_guard(global) (pre-CMake 3.10)
+if(TARGET pybind11::python_headers)
+  return()
+endif()
+
+# Built-in in CMake 3.5+
+include(CMakeParseArguments)
+
+if(pybind11_FIND_QUIETLY)
+  set(_pybind11_quiet QUIET)
+else()
+  set(_pybind11_quiet "")
+endif()
+
+# If this is the first run, PYTHON_VERSION can stand in for PYBIND11_PYTHON_VERSION
+if(NOT DEFINED PYBIND11_PYTHON_VERSION AND DEFINED PYTHON_VERSION)
+  message(WARNING "Set PYBIND11_PYTHON_VERSION to search for a specific version, not "
+                  "PYTHON_VERSION (which is an output). Assuming that is what you "
+                  "meant to do and continuing anyway.")
+  set(PYBIND11_PYTHON_VERSION
+      "${PYTHON_VERSION}"
+      CACHE STRING "Python version to use for compiling modules")
+  unset(PYTHON_VERSION)
+  unset(PYTHON_VERSION CACHE)
+elseif(DEFINED PYBIND11_PYTHON_VERSION)
+  # If this is set as a normal variable, promote it
+  set(PYBIND11_PYTHON_VERSION
+      "${PYBIND11_PYTHON_VERSION}"
+      CACHE STRING "Python version to use for compiling modules")
+else()
+  # Make an empty cache variable.
+  set(PYBIND11_PYTHON_VERSION
+      ""
+      CACHE STRING "Python version to use for compiling modules")
+endif()
+
+# A user can set versions manually too
+set(Python_ADDITIONAL_VERSIONS
+    "3.12;3.11;3.10;3.9;3.8;3.7"
+    CACHE INTERNAL "")
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED ${_pybind11_quiet})
+list(REMOVE_AT CMAKE_MODULE_PATH -1)
+
+# Makes a normal variable a cached variable
+macro(_PYBIND11_PROMOTE_TO_CACHE NAME)
+  set(_tmp_ptc "${${NAME}}")
+  # CMake 3.21 complains if a cached variable is shadowed by a normal one
+  unset(${NAME})
+  set(${NAME}
+      "${_tmp_ptc}"
+      CACHE INTERNAL "")
+endmacro()
+
+# Cache variables so pybind11_add_module can be used in parent projects
+_pybind11_promote_to_cache(PYTHON_INCLUDE_DIRS)
+_pybind11_promote_to_cache(PYTHON_LIBRARIES)
+_pybind11_promote_to_cache(PYTHON_MODULE_PREFIX)
+_pybind11_promote_to_cache(PYTHON_MODULE_EXTENSION)
+_pybind11_promote_to_cache(PYTHON_MODULE_DEBUG_POSTFIX)
+_pybind11_promote_to_cache(PYTHON_VERSION_MAJOR)
+_pybind11_promote_to_cache(PYTHON_VERSION_MINOR)
+_pybind11_promote_to_cache(PYTHON_VERSION)
+_pybind11_promote_to_cache(PYTHON_IS_DEBUG)
+
+if(PYBIND11_MASTER_PROJECT)
+  if(PYTHON_MODULE_EXTENSION MATCHES "pypy")
+    if(NOT DEFINED PYPY_VERSION)
+      execute_process(
+        COMMAND ${PYTHON_EXECUTABLE} -c
+                [=[import sys; sys.stdout.write(".".join(map(str, sys.pypy_version_info[:3])))]=]
+        OUTPUT_VARIABLE pypy_version)
+      set(PYPY_VERSION
+          ${pypy_version}
+          CACHE INTERNAL "")
+    endif()
+    message(STATUS "PYPY ${PYPY_VERSION} (Py ${PYTHON_VERSION})")
+  else()
+    message(STATUS "PYTHON ${PYTHON_VERSION}")
+  endif()
+endif()
+
+# Only add Python for build - must be added during the import for config since
+# it has to be re-discovered.
+#
+# This needs to be an target to it is included after the local pybind11
+# directory, just in case there are multiple versions of pybind11, we want the
+# one we expect.
+add_library(pybind11::python_headers INTERFACE IMPORTED)
+set_property(TARGET pybind11::python_headers PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                                      "$<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>")
+set_property(
+  TARGET pybind11::pybind11
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python_headers)
+
+set(pybind11_INCLUDE_DIRS
+    "${pybind11_INCLUDE_DIR}" "${PYTHON_INCLUDE_DIRS}"
+    CACHE INTERNAL "Directories where pybind11 and possibly Python headers are located")
+
+# Python debug libraries expose slightly different objects before 3.8
+# https://docs.python.org/3.6/c-api/intro.html#debugging-builds
+# https://stackoverflow.com/questions/39161202/how-to-work-around-missing-pymodule-create2-in-amd64-win-python35-d-lib
+if(PYTHON_IS_DEBUG)
+  set_property(
+    TARGET pybind11::pybind11
+    APPEND
+    PROPERTY INTERFACE_COMPILE_DEFINITIONS Py_DEBUG)
+endif()
+
+# The <3.11 code here does not support release/debug builds at the same time, like on vcpkg
+if(CMAKE_VERSION VERSION_LESS 3.11)
+  set_property(
+    TARGET pybind11::module
+    APPEND
+    PROPERTY
+      INTERFACE_LINK_LIBRARIES
+      pybind11::python_link_helper
+      "$<$<OR:$<PLATFORM_ID:Windows>,$<PLATFORM_ID:Cygwin>>:$<BUILD_INTERFACE:${PYTHON_LIBRARIES}>>"
+  )
+
+  set_property(
+    TARGET pybind11::embed
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11 $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
+else()
+  # The IMPORTED INTERFACE library here is to ensure that "debug" and "release" get processed outside
+  # of a generator expression - https://gitlab.kitware.com/cmake/cmake/-/issues/18424, as they are
+  # target_link_library keywords rather than real libraries.
+  add_library(pybind11::_ClassicPythonLibraries IMPORTED INTERFACE)
+  target_link_libraries(pybind11::_ClassicPythonLibraries INTERFACE ${PYTHON_LIBRARIES})
+  target_link_libraries(
+    pybind11::module
+    INTERFACE
+      pybind11::python_link_helper
+      "$<$<OR:$<PLATFORM_ID:Windows>,$<PLATFORM_ID:Cygwin>>:pybind11::_ClassicPythonLibraries>")
+
+  target_link_libraries(pybind11::embed INTERFACE pybind11::pybind11
+                                                  pybind11::_ClassicPythonLibraries)
+endif()
+
+function(pybind11_extension name)
+  # The prefix and extension are provided by FindPythonLibsNew.cmake
+  set_target_properties(
+    ${name}
+    PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
+               DEBUG_POSTFIX "${PYTHON_MODULE_DEBUG_POSTFIX}"
+               SUFFIX "${PYTHON_MODULE_EXTENSION}")
+endfunction()
+
+# Build a Python extension module:
+# pybind11_add_module(<name> [MODULE | SHARED] [EXCLUDE_FROM_ALL]
+#                     [NO_EXTRAS] [THIN_LTO] [OPT_SIZE] source1 [source2 ...])
+#
+function(pybind11_add_module target_name)
+  set(options "MODULE;SHARED;EXCLUDE_FROM_ALL;NO_EXTRAS;SYSTEM;THIN_LTO;OPT_SIZE")
+  cmake_parse_arguments(ARG "${options}" "" "" ${ARGN})
+
+  if(ARG_MODULE AND ARG_SHARED)
+    message(FATAL_ERROR "Can't be both MODULE and SHARED")
+  elseif(ARG_SHARED)
+    set(lib_type SHARED)
+  else()
+    set(lib_type MODULE)
+  endif()
+
+  if(ARG_EXCLUDE_FROM_ALL)
+    set(exclude_from_all EXCLUDE_FROM_ALL)
+  else()
+    set(exclude_from_all "")
+  endif()
+
+  add_library(${target_name} ${lib_type} ${exclude_from_all} ${ARG_UNPARSED_ARGUMENTS})
+
+  target_link_libraries(${target_name} PRIVATE pybind11::module)
+
+  if(ARG_SYSTEM)
+    message(
+      STATUS
+        "Warning: this does not have an effect - use NO_SYSTEM_FROM_IMPORTED if using imported targets"
+    )
+  endif()
+
+  pybind11_extension(${target_name})
+
+  # -fvisibility=hidden is required to allow multiple modules compiled against
+  # different pybind versions to work properly, and for some features (e.g.
+  # py::module_local).  We force it on everything inside the `pybind11`
+  # namespace; also turning it on for a pybind module compilation here avoids
+  # potential warnings or issues from having mixed hidden/non-hidden types.
+  if(NOT DEFINED CMAKE_CXX_VISIBILITY_PRESET)
+    set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden")
+  endif()
+
+  if(NOT DEFINED CMAKE_CUDA_VISIBILITY_PRESET)
+    set_target_properties(${target_name} PROPERTIES CUDA_VISIBILITY_PRESET "hidden")
+  endif()
+
+  if(ARG_NO_EXTRAS)
+    return()
+  endif()
+
+  if(NOT DEFINED CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    if(ARG_THIN_LTO)
+      target_link_libraries(${target_name} PRIVATE pybind11::thin_lto)
+    else()
+      target_link_libraries(${target_name} PRIVATE pybind11::lto)
+    endif()
+  endif()
+
+  if(DEFINED CMAKE_BUILD_TYPE) # see https://github.com/pybind/pybind11/issues/4454
+    # Use case-insensitive comparison to match the result of $<CONFIG:cfgs>
+    string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
+    if(NOT MSVC AND NOT "${uppercase_CMAKE_BUILD_TYPE}" MATCHES DEBUG|RELWITHDEBINFO)
+      pybind11_strip(${target_name})
+    endif()
+  endif()
+
+  if(MSVC)
+    target_link_libraries(${target_name} PRIVATE pybind11::windows_extras)
+  endif()
+
+  if(ARG_OPT_SIZE)
+    target_link_libraries(${target_name} PRIVATE pybind11::opt_size)
+  endif()
+endfunction()
+
+# Provide general way to call common Python commands in "common" file.
+set(_Python
+    PYTHON
+    CACHE INTERNAL "" FORCE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport3.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94217e9fd621ac98ff5825293af6fbdc8b176b9b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport3.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyximport.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyximport.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae4a76d2a9d1d7bd57a6871b07669fc75fe089f0
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyximport.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/pyximport.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/pyximport.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d575815afb9dc59834f058b1861dcfa38834ada
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/pyximport.py
@@ -0,0 +1,11 @@
+from __future__ import absolute_import
+import sys
+
+if sys.version_info < (3, 5):
+    # _pyximport3 module requires at least Python 3.5
+    from pyximport._pyximport2 import install, uninstall, show_docs
+else:
+    from pyximport._pyximport3 import install, uninstall, show_docs
+
+if __name__ == '__main__':
+    show_docs()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d381712b4a356e4ad2c066c3b87abead312f4b81
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__init__.py
@@ -0,0 +1,2038 @@
+
+r"""
+The torch package contains data structures for multi-dimensional
+tensors and defines mathematical operations over these tensors.
+Additionally, it provides many utilities for efficient serialization of
+Tensors and arbitrary types, and other useful utilities.
+
+It has a CUDA counterpart, that enables you to run your tensor computations
+on an NVIDIA GPU with compute capability >= 3.0.
+"""
+
+import math
+import os
+import sys
+import platform
+import textwrap
+import ctypes
+import inspect
+import threading
+
+# multipy/deploy is setting this import before importing torch, this is the most
+# reliable way we have to detect if we're running within deploy.
+# https://github.com/pytorch/multipy/blob/d60f34ad38c371e441fe7ffdb77a3c3dda5a5d19/multipy/runtime/interpreter/interpreter_impl.cpp#L134-L137
+def _running_with_deploy():
+    return sys.modules.get("torch._meta_registrations", None) is object
+
+from ._utils import _import_dotted_name, classproperty
+from ._utils import _functionalize_sync as _sync
+from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \
+    USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS
+
+# TODO(torch_deploy) figure out how to freeze version.py in fbcode build
+if _running_with_deploy():
+    __version__ = "torch-deploy-1.8"
+else:
+    from .torch_version import __version__ as __version__
+
+from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, TYPE_CHECKING, Union, List
+import builtins
+
+__all__ = [
+    'typename', 'is_tensor', 'is_storage',
+    'set_default_tensor_type', 'set_default_device', 'get_default_device',
+    'set_rng_state', 'get_rng_state', 'manual_seed', 'initial_seed', 'seed',
+    'save', 'load', 'set_printoptions', 'chunk', 'split', 'stack', 'matmul',
+    'no_grad', 'enable_grad', 'rand', 'randn', 'inference_mode',
+    'DoubleStorage', 'FloatStorage', 'LongStorage', 'IntStorage',
+    'ShortStorage', 'CharStorage', 'ByteStorage', 'BoolStorage',
+    'TypedStorage', 'UntypedStorage',
+    'DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor',
+    'ShortTensor', 'CharTensor', 'ByteTensor', 'BoolTensor', 'Tensor',
+    'lobpcg', 'use_deterministic_algorithms',
+    'are_deterministic_algorithms_enabled',
+    'is_deterministic_algorithms_warn_only_enabled',
+    'set_deterministic_debug_mode', 'get_deterministic_debug_mode',
+    'set_float32_matmul_precision', 'get_float32_matmul_precision',
+    'set_warn_always', 'is_warn_always_enabled', 'SymInt', 'SymFloat',
+    'SymBool', 'sym_not', 'unravel_index',
+    'sym_int', 'sym_float', 'sym_max', 'sym_min', 'sym_ite', 'compile', 'vmap',
+    'export', 'autocast', 'cond', 'GradScaler',
+]
+
+################################################################################
+# Load the extension module
+################################################################################
+
+if sys.platform == 'win32':
+    pfiles_path = os.getenv('ProgramFiles', 'C:\\Program Files')
+    py_dll_path = os.path.join(sys.exec_prefix, 'Library', 'bin')
+    th_dll_path = os.path.join(os.path.dirname(__file__), 'lib')
+
+    # When users create a virtualenv that inherits the base environment,
+    # we will need to add the corresponding library directory into
+    # DLL search directories. Otherwise, it will rely on `PATH` which
+    # is dependent on user settings.
+    if sys.exec_prefix != sys.base_exec_prefix:
+        base_py_dll_path = os.path.join(sys.base_exec_prefix, 'Library', 'bin')
+    else:
+        base_py_dll_path = ''
+
+    dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path]))
+
+    if all(not os.path.exists(os.path.join(p, 'nvToolsExt64_1.dll')) for p in dll_paths):
+        nvtoolsext_dll_path = os.path.join(
+            os.getenv('NVTOOLSEXT_PATH', os.path.join(pfiles_path, 'NVIDIA Corporation', 'NvToolsExt')), 'bin', 'x64')
+    else:
+        nvtoolsext_dll_path = ''
+
+    from .version import cuda as cuda_version
+    import glob
+    if cuda_version and all(not glob.glob(os.path.join(p, 'cudart64*.dll')) for p in dll_paths):
+        cuda_version_1 = cuda_version.replace('.', '_')
+        cuda_path_var = 'CUDA_PATH_V' + cuda_version_1
+        default_path = os.path.join(pfiles_path, 'NVIDIA GPU Computing Toolkit', 'CUDA', 'v' + cuda_version)
+        cuda_path = os.path.join(os.getenv(cuda_path_var, default_path), 'bin')
+    else:
+        cuda_path = ''
+
+    dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path]))
+
+    kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
+    with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
+    prev_error_mode = kernel32.SetErrorMode(0x0001)
+
+    kernel32.LoadLibraryW.restype = ctypes.c_void_p
+    if with_load_library_flags:
+        kernel32.LoadLibraryExW.restype = ctypes.c_void_p
+
+    for dll_path in dll_paths:
+        os.add_dll_directory(dll_path)
+
+    try:
+        ctypes.CDLL('vcruntime140.dll')
+        ctypes.CDLL('msvcp140.dll')
+        ctypes.CDLL('vcruntime140_1.dll')
+    except OSError:
+        print('''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
+                 It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe''')
+
+    dlls = glob.glob(os.path.join(th_dll_path, '*.dll'))
+    path_patched = False
+    for dll in dlls:
+        is_loaded = False
+        if with_load_library_flags:
+            res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
+            last_error = ctypes.get_last_error()
+            if res is None and last_error != 126:
+                err = ctypes.WinError(last_error)
+                err.strerror += f' Error loading "{dll}" or one of its dependencies.'
+                raise err
+            elif res is not None:
+                is_loaded = True
+        if not is_loaded:
+            if not path_patched:
+                os.environ['PATH'] = ';'.join(dll_paths + [os.environ['PATH']])
+                path_patched = True
+            res = kernel32.LoadLibraryW(dll)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += f' Error loading "{dll}" or one of its dependencies.'
+                raise err
+
+    kernel32.SetErrorMode(prev_error_mode)
+
+
+def _preload_cuda_deps(lib_folder, lib_name):
+    """Preloads cuda deps if they could not be found otherwise."""
+    # Should only be called on Linux if default path resolution have failed
+    assert platform.system() == 'Linux', 'Should only be called on Linux'
+    import glob
+    lib_path = None
+    for path in sys.path:
+        nvidia_path = os.path.join(path, 'nvidia')
+        if not os.path.exists(nvidia_path):
+            continue
+        candidate_lib_paths = glob.glob(os.path.join(nvidia_path, lib_folder, 'lib', lib_name))
+        if candidate_lib_paths and not lib_path:
+            lib_path = candidate_lib_paths[0]
+        if lib_path:
+            break
+    if not lib_path:
+        raise ValueError(f"{lib_name} not found in the system path {sys.path}")
+    ctypes.CDLL(lib_path)
+
+
+# See Note [Global dependencies]
+def _load_global_deps() -> None:
+    if _running_with_deploy() or platform.system() == 'Windows':
+        return
+
+    lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so')
+    here = os.path.abspath(__file__)
+    lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name)
+
+    try:
+        ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
+    except OSError as err:
+        # Can only happen for wheel with cuda libs as PYPI deps
+        # As PyTorch is not purelib, but nvidia-*-cu12 is
+        cuda_libs: Dict[str, str] = {
+            'cublas': 'libcublas.so.*[0-9]',
+            'cudnn': 'libcudnn.so.*[0-9]',
+            'cuda_nvrtc': 'libnvrtc.so.*[0-9]',
+            'cuda_runtime': 'libcudart.so.*[0-9]',
+            'cuda_cupti': 'libcupti.so.*[0-9]',
+            'cufft': 'libcufft.so.*[0-9]',
+            'curand': 'libcurand.so.*[0-9]',
+            'cusolver': 'libcusolver.so.*[0-9]',
+            'cusparse': 'libcusparse.so.*[0-9]',
+            'nccl': 'libnccl.so.*[0-9]',
+            'nvtx': 'libnvToolsExt.so.*[0-9]',
+        }
+        is_cuda_lib_err = [lib for lib in cuda_libs.values() if lib.split('.')[0] in err.args[0]]
+        if not is_cuda_lib_err:
+            raise err
+        for lib_folder, lib_name in cuda_libs.items():
+            _preload_cuda_deps(lib_folder, lib_name)
+        ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
+
+
+if (USE_RTLD_GLOBAL_WITH_LIBTORCH or os.getenv('TORCH_USE_RTLD_GLOBAL')) and \
+        (_running_with_deploy() or platform.system() != 'Windows'):
+    # Do it the hard way.  You might want to load libtorch with RTLD_GLOBAL in a
+    # few circumstances:
+    #
+    #   1. You're in a build environment (e.g., fbcode) where
+    #      libtorch_global_deps is not available, but you still need
+    #      to get mkl to link in with RTLD_GLOBAL or it will just
+    #      not work.
+    #
+    #   2. You're trying to run PyTorch under UBSAN and you need
+    #      to ensure that only one copy of libtorch is loaded, so
+    #      vptr checks work properly
+    #
+    # If you're using this setting, you must verify that all the libraries
+    # you load consistently use the same libstdc++, or you may have
+    # mysterious segfaults.
+    #
+    old_flags = sys.getdlopenflags()
+    sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_LAZY)
+    from torch._C import *  # noqa: F403
+    sys.setdlopenflags(old_flags)
+    del old_flags
+
+else:
+    # Easy way.  You want this most of the time, because it will prevent
+    # C++ symbols from libtorch clobbering C++ symbols from other
+    # libraries, leading to mysterious segfaults.
+    #
+    # If building in an environment where libtorch_global_deps isn't available
+    # like parts of fbsource, but where RTLD_GLOBAL causes segfaults, you will
+    # want USE_RTLD_GLOBAL_WITH_LIBTORCH = False and USE_GLOBAL_DEPS = False
+    #
+    # See Note [Global dependencies]
+    if USE_GLOBAL_DEPS:
+        _load_global_deps()
+    from torch._C import *  # noqa: F403
+
+# Appease the type checker; ordinarily this binding is inserted by the
+# torch._C module initialization code in C
+if TYPE_CHECKING:
+    from . import _C as _C
+
+class SymInt:
+    """
+    Like an int (including magic methods), but redirects all operations on the
+    wrapped node. This is used in particular to symbolically record operations
+    in the symbolic shape workflow.
+    """
+
+    def __init__(self, node):
+        # This field MUST be named node; C++ binding code assumes that this
+        # class has a field named node that stores SymNode
+        self.node = node
+
+    def __bool__(self):
+        return builtins.bool(self != 0)
+
+    def __int__(self):
+        return self.node.int_()
+
+    def __index__(self):
+        return self.node.int_()
+
+    # Magic methods installed by torch.fx.experimental.sym_node
+
+    def __eq__(self, other: object) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __lt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __gt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __le__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __ge__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __add__(self, other) -> "SymInt":
+        raise AssertionError("type stub not overridden")
+
+    def __mul__(self, other) -> "SymInt":
+        raise AssertionError("type stub not overridden")
+
+    def __sym_max__(self, other):
+        raise AssertionError("type stub not overridden")
+
+    def __sym_min__(self, other):
+        raise AssertionError("type stub not overridden")
+
+    def __sym_float__(self):
+        raise AssertionError("type stub not overridden")
+
+    def __neg__(self):
+        raise AssertionError("type stub not overridden")
+
+    def __repr__(self):
+        return str(self.node)
+
+    def __hash__(self) -> builtins.int:
+        if self.node.is_nested_int():
+            return hash(self.node.nested_int())
+        else:
+            # We could support constant SymInts as well, but not doing it for now
+            raise TypeError("unhashable type: non-nested SymInt")
+
+class SymFloat:
+    """
+    Like an float (including magic methods), but redirects all operations on the
+    wrapped node. This is used in particular to symbolically record operations
+    in the symbolic shape workflow.
+    """
+
+    def __init__(self, node):
+        # This field MUST be named node; C++ binding code assumes that this
+        # class has a field named node that stores SymNode
+        self.node = node
+
+    def __bool__(self):
+        return self.node.bool_()
+
+    # Magic methods installed by torch.fx.experimental.sym_node
+
+    def __eq__(self, other: object) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __lt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __gt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __le__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __ge__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __sym_max__(self, other):
+        raise AssertionError("type stub not overridden")
+
+    def __sym_min__(self, other):
+        raise AssertionError("type stub not overridden")
+
+    def __sym_int__(self):
+        raise AssertionError("type stub not overridden")
+
+    def is_integer(self):
+        """Return True if the float is an integer."""
+        raise AssertionError("type stub not overridden")
+
+    def __repr__(self):
+        return self.node.str()
+
+class SymBool:
+    """
+    Like an bool (including magic methods), but redirects all operations on the
+    wrapped node. This is used in particular to symbolically record operations
+    in the symbolic shape workflow.
+
+    Unlike regular bools, regular boolean operators will force extra guards instead
+    of symbolically evaluate.  Use the bitwise operators instead to handle this.
+    """
+
+    def __init__(self, node):
+        # This field MUST be named node; C++ binding code assumes that this
+        # class has a field named node that stores SymNode
+        self.node = node
+
+    def __bool__(self):
+        return self.node.bool_()
+
+    def __int__(self):
+        return builtins.int(self.node.bool_())
+
+    # Magic methods installed by torch.fx.experimental.sym_node
+    def __and__(self, other) -> "SymBool":
+        raise AssertionError("type stub not overridden")
+
+    def __or__(self, other) -> "SymBool":
+        raise AssertionError("type stub not overridden")
+
+    # We very carefully define __sym_not__, and not a number of other
+    # plausible alternatives:
+    #
+    #   - We do not override __not__ because this is not a real magic
+    #     method; you cannot override the meaning of the not builtin in
+    #     Python.  We use the name 'sym_not' to clarify that in user code you
+    #     cannot use the builtin not or operator.not_ or operator.__not__ and
+    #     hit this magic method; you must use our custom sym_not operator.
+    #
+    #   - We do not override the __invert__ method because SymBool is
+    #     meant to be usable in situations where bool is expected.  However,
+    #     bitwise negation ~a does the wrong thing with booleans (because
+    #     bool is a subclass of int, so ~1 = -2 which is not falseish.)
+    #     This would be a giant footgun, so we get around it by defining
+    #     our own operator.  Note that bitwise and/or do the right thing,
+    #     so we reuse the conventional operators there for readability.
+    #
+    def __sym_not__(self) -> "SymBool":
+        raise AssertionError("type stub not overridden")
+
+    def __sym_ite__(self, then_val, else_val):
+        raise AssertionError("type stub not overridden")
+
+    def __eq__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __repr__(self):
+        return str(self.node)
+
+    def __hash__(self):
+        if self.node.is_constant():
+            return hash(self.node.bool_())
+        else:
+            raise TypeError("unhashable type: SymBool")
+
+def sym_not(a):
+    r""" SymInt-aware utility for logical negation.
+
+    Args:
+        a (SymBool or bool): Object to negate
+    """
+    import sympy
+    from .overrides import has_torch_function_unary, handle_torch_function
+
+    if has_torch_function_unary(a):
+        return handle_torch_function(sym_not, (a,), a)
+    if hasattr(a, '__sym_not__'):
+        return a.__sym_not__()
+    if isinstance(a, sympy.Basic):
+        return ~a  # type: ignore[operator]
+    return not a
+
+def sym_float(a):
+    r""" SymInt-aware utility for float casting.
+
+    Args:
+        a (SymInt, SymFloat, or object): Object to cast
+    """
+    from .overrides import has_torch_function_unary, handle_torch_function
+
+    if has_torch_function_unary(a):
+        return handle_torch_function(sym_float, (a,), a)
+    if isinstance(a, SymFloat):
+        return a
+    elif hasattr(a, '__sym_float__'):
+        return a.__sym_float__()
+    return py_float(a)  # type: ignore[operator]
+
+
+def sym_int(a):
+    r""" SymInt-aware utility for int casting.
+
+    Args:
+        a (SymInt, SymFloat, or object): Object to cast
+    """
+    from .overrides import has_torch_function_unary, handle_torch_function
+
+    if has_torch_function_unary(a):
+        return handle_torch_function(sym_int, (a,), a)
+    if isinstance(a, SymInt):
+        return a
+    elif isinstance(a, SymFloat):
+        return math.floor(a) if a >= 0 else math.ceil(a)  # type: ignore[arg-type, call-overload]
+    return py_int(a)  # type: ignore[operator]
+
+def sym_max(a, b):
+    """ SymInt-aware utility for max()."""
+    from .overrides import has_torch_function, handle_torch_function
+
+    if has_torch_function((a, b)):
+        return handle_torch_function(sym_max, (a, b), a, b)
+    if isinstance(a, (SymInt, SymFloat)):
+        return a.__sym_max__(b)
+    elif isinstance(b, (SymInt, SymFloat)):
+        # NB: If you actually care about preserving output type exactly
+        # if you do something like max(0, 0.0), it is NOT sound to treat
+        # min/max as commutative
+        return b.__sym_max__(a)
+    return builtins.max(a, b)  # type: ignore[operator]
+
+def sym_min(a, b):
+    """ SymInt-aware utility for max()."""
+    from .overrides import has_torch_function, handle_torch_function
+
+    if has_torch_function((a, b)):
+        return handle_torch_function(sym_min, (a, b), a, b)
+    if isinstance(a, (SymInt, SymFloat)):
+        return a.__sym_min__(b)
+    elif isinstance(b, (SymInt, SymFloat)):
+        return b.__sym_min__(a)
+    return builtins.min(a, b)  # type: ignore[operator]
+
+# Drop in replacement for math.sqrt, math.sin, math.cos etc
+current_module = sys.modules[__name__]
+
+def _get_sym_math_fn(name):
+    def fn(a):
+        from .overrides import has_torch_function_unary, handle_torch_function
+
+        if has_torch_function_unary(a):
+            return handle_torch_function(fn, (a,), a)
+        if hasattr(a, f"__sym_{name}__"):
+            return getattr(a, f"__sym_{name}__")()
+        return getattr(math, name)(a)
+
+    return fn
+
+for name in ("sqrt", "cos", "cosh", "sin", "sinh", "tan", "tanh", "asin", "acos", "atan"):
+    sym_name = f"_sym_{name}"
+    fn = _get_sym_math_fn(name)
+    fn.__qualname__ = fn.__name__ = sym_name
+    setattr(current_module, sym_name, fn)
+
+# Adding temporary shortcut
+sym_sqrt = current_module._sym_sqrt
+__all__.append("sym_sqrt")
+
+del fn, name, sym_name, current_module  # type: ignore[possibly-undefined]
+
+
+def sym_ite(b, t, f):
+    from .overrides import has_torch_function, handle_torch_function
+
+    if has_torch_function((b, t, f)):
+        return handle_torch_function(sym_ite, (b, t, f), b, t, f)
+    assert isinstance(b, (SymBool, builtins.bool)) and type(t) == type(f)
+    if isinstance(b, SymBool):
+        return b.__sym_ite__(t, f)
+    return t if b else f
+
+# Check to see if we can load C extensions, and if not provide some guidance
+# on what the problem might be.
+try:
+    # _initExtension is chosen (arbitrarily) as a sentinel.
+    from torch._C import _initExtension
+except ImportError:
+    import torch._C as _C_for_compiled_check
+
+    # The __file__ check only works for Python 3.7 and above.
+    if _C_for_compiled_check.__file__ is None:
+        raise ImportError(textwrap.dedent('''
+            Failed to load PyTorch C extensions:
+                It appears that PyTorch has loaded the `torch/_C` folder
+                of the PyTorch repository rather than the C extensions which
+                are expected in the `torch._C` namespace. This can occur when
+                using the `install` workflow. e.g.
+                    $ python setup.py install && python -c "import torch"
+
+                This error can generally be solved using the `develop` workflow
+                    $ python setup.py develop && python -c "import torch"  # This should succeed
+                or by running Python from a different directory.
+            ''').strip()) from None
+    raise  # If __file__ is not None the cause is unknown, so just re-raise.
+
+for name in dir(_C):
+    if name[0] != '_' and not name.endswith('Base'):
+        __all__.append(name)
+        obj = getattr(_C, name)
+        if (isinstance(obj, Callable) or inspect.isclass(obj)):  # type: ignore[arg-type]
+            if (obj.__module__ != 'torch'):
+                # TODO: fix their module from C++ side
+                if name not in ['DisableTorchFunctionSubclass', 'DisableTorchFunction', 'Generator']:
+                    obj.__module__ = 'torch'
+    elif name == 'TensorBase':
+        # issue 109438 / pr 109940. Prevent TensorBase from being copied into torch.
+        delattr(sys.modules[__name__], name)
+
+if not TYPE_CHECKING:
+    # issue 38137 and python issue 43367. Submodules of a C extension are
+    # non-standard, and attributes of those submodules cannot be pickled since
+    # pickle expect to be able to import them as "from _C.sub import attr"
+    # which fails with "_C is not a package
+    for attr in dir(_C):
+        candidate = getattr(_C, attr)
+        if type(candidate) is type(_C):
+            # submodule
+            if f'torch._C.{attr}' not in sys.modules:
+                sys.modules[f'torch._C.{attr}'] = candidate
+
+
+################################################################################
+# Define basic utilities
+################################################################################
+
+
+def typename(o):
+    if isinstance(o, torch.Tensor):
+        return o.type()
+
+    module = ''
+    class_name = ''
+    if hasattr(o, '__module__') and o.__module__ != 'builtins' \
+            and o.__module__ != '__builtin__' and o.__module__ is not None:
+        module = o.__module__ + '.'
+
+    if hasattr(o, '__qualname__'):
+        class_name = o.__qualname__
+    elif hasattr(o, '__name__'):
+        class_name = o.__name__
+    else:
+        class_name = o.__class__.__name__
+
+    return module + class_name
+
+
+def is_tensor(obj):
+    r"""Returns True if `obj` is a PyTorch tensor.
+
+    Note that this function is simply doing ``isinstance(obj, Tensor)``.
+    Using that ``isinstance`` check is better for typechecking with mypy,
+    and more explicit - so it's recommended to use that instead of
+    ``is_tensor``.
+
+    Args:
+        obj (Object): Object to test
+    Example::
+
+        >>> x = torch.tensor([1, 2, 3])
+        >>> torch.is_tensor(x)
+        True
+
+    """
+    return isinstance(obj, torch.Tensor)
+
+
+def is_storage(obj):
+    r"""Returns True if `obj` is a PyTorch storage object.
+
+    Args:
+        obj (Object): Object to test
+    """
+    return type(obj) in _storage_classes
+
+
+_GLOBAL_DEVICE_CONTEXT = threading.local()
+
+
+def get_default_device() -> "torch.device":
+    r"""Gets the default ``torch.Tensor`` to be allocated on ``device``"""
+    global _GLOBAL_DEVICE_CONTEXT
+    if hasattr(_GLOBAL_DEVICE_CONTEXT, "device_context"):
+        device = _GLOBAL_DEVICE_CONTEXT.device_context.device
+        if device.index is not None:
+            return device
+        else:
+            # TODO: Call like get_device_index() method corresponding to
+            # each device type
+            return torch.tensor([]).device
+    else:
+        return torch.device("cpu")
+
+
+def set_default_device(device):
+    """Sets the default ``torch.Tensor`` to be allocated on ``device``.  This
+    does not affect factory function calls which are called with an explicit
+    ``device`` argument.  Factory calls will be performed as if they
+    were passed ``device`` as an argument.
+
+    To only temporarily change the default device instead of setting it
+    globally, use ``with torch.device(device):`` instead.
+
+    The default device is initially ``cpu``.  If you set the default tensor
+    device to another device (e.g., ``cuda``) without a device index, tensors
+    will be allocated on whatever the current device for the device type,
+    even after :func:`torch.cuda.set_device` is called.
+
+    .. warning::
+
+        This function imposes a slight performance cost on every Python
+        call to the torch API (not just factory functions).  If this
+        is causing problems for you, please comment on
+        https://github.com/pytorch/pytorch/issues/92701
+
+    .. note::
+
+        This doesn't affect functions that create tensors that share the same memory as the input, like:
+        :func:`torch.from_numpy` and :func:`torch.frombuffer`
+
+    Args:
+        device (device or string): the device to set as default
+
+    Example::
+
+        >>> # xdoctest: +SKIP("requires cuda, changes global state")
+        >>> torch.get_default_device()
+        device(type='cpu')
+        >>> torch.set_default_device('cuda')  # current device is 0
+        >>> torch.get_default_device()
+        device(type='cuda', index=0)
+        >>> torch.set_default_device('cuda')
+        >>> torch.cuda.set_device('cuda:1')  # current device is 1
+        >>> torch.get_default_device()
+        device(type='cuda', index=1)
+        >>> torch.set_default_device('cuda:1')
+        >>> torch.get_default_device()
+        device(type='cuda', index=1)
+
+    """
+    global _GLOBAL_DEVICE_CONTEXT
+    if hasattr(_GLOBAL_DEVICE_CONTEXT, "device_context"):
+        device_context = _GLOBAL_DEVICE_CONTEXT.device_context
+        if device_context is not None:
+            device_context.__exit__(None, None, None)
+
+    if device is None:
+        device_context = None
+    else:
+        from torch.utils._device import DeviceContext
+        device_context = DeviceContext(device)
+        device_context.__enter__()
+    _GLOBAL_DEVICE_CONTEXT.device_context = device_context
+
+
+def set_default_tensor_type(t):
+    r"""
+    .. warning::
+
+        This function is deprecated as of PyTorch 2.1, please use :func:`torch.set_default_dtype()` and
+        :func:`torch.set_default_device()` as alternatives.
+
+    Sets the default ``torch.Tensor`` type to floating point tensor type
+    ``t``. This type will also be used as default floating point type for
+    type inference in :func:`torch.tensor`.
+
+    The default floating point tensor type is initially ``torch.FloatTensor``.
+
+    Args:
+        t (type or string): the floating point tensor type or its name
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Other tests may have changed the default type. Can we reset it?")
+        >>> torch.tensor([1.2, 3]).dtype    # initial default for floating point is torch.float32
+        torch.float32
+        >>> torch.set_default_tensor_type(torch.DoubleTensor)
+        >>> torch.tensor([1.2, 3]).dtype    # a new floating point tensor
+        torch.float64
+
+    """
+    if isinstance(t, str):
+        t = _import_dotted_name(t)
+    _C._set_default_tensor_type(t)
+
+
+def set_default_dtype(d):
+    r"""
+
+    Sets the default floating point dtype to :attr:`d`. Supports torch.float32
+    and torch.float64 as inputs. Other dtypes may be accepted without complaint
+    but are not supported and are unlikely to work as expected.
+
+    When PyTorch is initialized its default floating point dtype is torch.float32,
+    and the intent of set_default_dtype(torch.float64) is to facilitate NumPy-like
+    type inference. The default floating point dtype is used to:
+
+    1. Implicitly determine the default complex dtype. When the default floating point
+       type is float32 the default complex dtype is complex64, and when the default
+       floating point type is float64 the default complex type is complex128.
+    2. Infer the dtype for tensors constructed using Python floats or complex Python
+       numbers. See examples below.
+    3. Determine the result of type promotion between bool and integer tensors and
+       Python floats and complex Python numbers.
+
+    Args:
+        d (:class:`torch.dtype`): the floating point dtype to make the default.
+                                  Either torch.float32 or torch.float64.
+
+    Example:
+        >>> # xdoctest: +SKIP("Other tests may have changed the default type. Can we reset it?")
+        >>> # initial default for floating point is torch.float32
+        >>> # Python floats are interpreted as float32
+        >>> torch.tensor([1.2, 3]).dtype
+        torch.float32
+        >>> # initial default for floating point is torch.complex64
+        >>> # Complex Python numbers are interpreted as complex64
+        >>> torch.tensor([1.2, 3j]).dtype
+        torch.complex64
+
+        >>> torch.set_default_dtype(torch.float64)
+
+        >>> # Python floats are now interpreted as float64
+        >>> torch.tensor([1.2, 3]).dtype    # a new floating point tensor
+        torch.float64
+        >>> # Complex Python numbers are now interpreted as complex128
+        >>> torch.tensor([1.2, 3j]).dtype   # a new complex tensor
+        torch.complex128
+
+    """
+    _C._set_default_dtype(d)
+
+def use_deterministic_algorithms(mode: builtins.bool, *, warn_only: builtins.bool = False) -> None:
+    r""" Sets whether PyTorch operations must use "deterministic"
+    algorithms. That is, algorithms which, given the same input, and when
+    run on the same software and hardware, always produce the same output.
+    When enabled, operations will use deterministic algorithms when available,
+    and if only nondeterministic algorithms are available they will throw a
+    :class:`RuntimeError` when called.
+
+    .. note:: This setting alone is not always enough to make an application
+        reproducible. Refer to :ref:`reproducibility` for more information.
+
+    .. note:: :func:`torch.set_deterministic_debug_mode` offers an alternative
+        interface for this feature.
+
+    The following normally-nondeterministic operations will act
+    deterministically when ``mode=True``:
+
+        * :class:`torch.nn.Conv1d` when called on CUDA tensor
+        * :class:`torch.nn.Conv2d` when called on CUDA tensor
+        * :class:`torch.nn.Conv3d` when called on CUDA tensor
+        * :class:`torch.nn.ConvTranspose1d` when called on CUDA tensor
+        * :class:`torch.nn.ConvTranspose2d` when called on CUDA tensor
+        * :class:`torch.nn.ConvTranspose3d` when called on CUDA tensor
+        * :class:`torch.nn.ReplicationPad2d` when attempting to differentiate a CUDA tensor
+        * :func:`torch.bmm` when called on sparse-dense CUDA tensors
+        * :func:`torch.Tensor.__getitem__` when attempting to differentiate a CPU tensor
+          and the index is a list of tensors
+        * :func:`torch.Tensor.index_put` with ``accumulate=False``
+        * :func:`torch.Tensor.index_put` with ``accumulate=True`` when called on a CPU
+          tensor
+        * :func:`torch.Tensor.put_` with ``accumulate=True`` when called on a CPU
+          tensor
+        * :func:`torch.Tensor.scatter_add_` when called on a CUDA tensor
+        * :func:`torch.gather` when called on a CUDA tensor that requires grad
+        * :func:`torch.index_add` when called on CUDA tensor
+        * :func:`torch.index_select` when attempting to differentiate a CUDA tensor
+        * :func:`torch.repeat_interleave` when attempting to differentiate a CUDA tensor
+        * :func:`torch.Tensor.index_copy` when called on a CPU or CUDA tensor
+        * :func:`torch.Tensor.scatter` when `src` type is Tensor and called on CUDA tensor
+        * :func:`torch.Tensor.scatter_reduce` when ``reduce='sum'`` or ``reduce='mean'`` and called on CUDA tensor
+
+    The following normally-nondeterministic operations will throw a
+    :class:`RuntimeError` when ``mode=True``:
+
+        * :class:`torch.nn.AvgPool3d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.AdaptiveAvgPool2d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.AdaptiveAvgPool3d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.MaxPool3d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.AdaptiveMaxPool2d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.FractionalMaxPool2d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.FractionalMaxPool3d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.MaxUnpool1d`
+        * :class:`torch.nn.MaxUnpool2d`
+        * :class:`torch.nn.MaxUnpool3d`
+        * :func:`torch.nn.functional.interpolate` when attempting to differentiate a CUDA tensor
+          and one of the following modes is used:
+
+          - ``linear``
+          - ``bilinear``
+          - ``bicubic``
+          - ``trilinear``
+
+        * :class:`torch.nn.ReflectionPad1d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.ReflectionPad2d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.ReflectionPad3d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.ReplicationPad1d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.ReplicationPad3d` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.NLLLoss` when called on a CUDA tensor
+        * :class:`torch.nn.CTCLoss` when attempting to differentiate a CUDA tensor
+        * :class:`torch.nn.EmbeddingBag` when attempting to differentiate a CUDA tensor when
+          ``mode='max'``
+        * :func:`torch.Tensor.put_` when ``accumulate=False``
+        * :func:`torch.Tensor.put_` when ``accumulate=True`` and called on a CUDA tensor
+        * :func:`torch.histc` when called on a CUDA tensor
+        * :func:`torch.bincount` when called on a CUDA tensor and ``weights``
+          tensor is given
+        * :func:`torch.kthvalue` with called on a CUDA tensor
+        * :func:`torch.median` with indices output when called on a CUDA tensor
+        * :func:`torch.nn.functional.grid_sample` when attempting to differentiate a CUDA tensor
+        * :func:`torch.cumsum` when called on a CUDA tensor when dtype is floating point or complex
+        * :func:`torch.Tensor.scatter_reduce` when ``reduce='prod'`` and called on CUDA tensor
+        * :func:`torch.Tensor.resize_` when called with a quantized tensor
+
+    In addition, several operations fill uninitialized memory when this setting
+    is turned on and when
+    :attr:`torch.utils.deterministic.fill_uninitialized_memory` is turned on.
+    See the documentation for that attribute for more information.
+
+    A handful of CUDA operations are nondeterministic if the CUDA version is
+    10.2 or greater, unless the environment variable ``CUBLAS_WORKSPACE_CONFIG=:4096:8``
+    or ``CUBLAS_WORKSPACE_CONFIG=:16:8`` is set. See the CUDA documentation for more
+    details: `<https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility>`_
+    If one of these environment variable configurations is not set, a :class:`RuntimeError`
+    will be raised from these operations when called with CUDA tensors:
+
+        * :func:`torch.mm`
+        * :func:`torch.mv`
+        * :func:`torch.bmm`
+
+    Note that deterministic operations tend to have worse performance than
+    nondeterministic operations.
+
+    .. note::
+
+        This flag does not detect or prevent nondeterministic behavior caused
+        by calling an inplace operation on a tensor with an internal memory
+        overlap or by giving such a tensor as the :attr:`out` argument for an
+        operation. In these cases, multiple writes of different data may target
+        a single memory location, and the order of writes is not guaranteed.
+
+    Args:
+        mode (:class:`bool`): If True, makes potentially nondeterministic
+            operations switch to a deterministic algorithm or throw a runtime
+            error. If False, allows nondeterministic operations.
+
+    Keyword args:
+        warn_only (:class:`bool`, optional): If True, operations that do not
+            have a deterministic implementation will throw a warning instead of
+            an error. Default: ``False``
+
+    Example::
+
+        >>> # xdoctest: +SKIP
+        >>> torch.use_deterministic_algorithms(True)
+
+        # Forward mode nondeterministic error
+        >>> torch.randn(10, device='cuda').kthvalue(1)
+        ...
+        RuntimeError: kthvalue CUDA does not have a deterministic implementation...
+
+        # Backward mode nondeterministic error
+        >>> torch.nn.AvgPool3d(1)(torch.randn(3, 4, 5, 6, requires_grad=True).cuda()).sum().backward()
+        ...
+        RuntimeError: avg_pool3d_backward_cuda does not have a deterministic implementation...
+    """
+    _C._set_deterministic_algorithms(mode, warn_only=warn_only)
+
+def are_deterministic_algorithms_enabled() -> builtins.bool:
+    r"""Returns True if the global deterministic flag is turned on. Refer to
+    :func:`torch.use_deterministic_algorithms` documentation for more details.
+    """
+    return _C._get_deterministic_algorithms()
+
+def is_deterministic_algorithms_warn_only_enabled() -> builtins.bool:
+    r"""Returns True if the global deterministic flag is set to warn only.
+    Refer to :func:`torch.use_deterministic_algorithms` documentation for more
+    details.
+    """
+    return _C._get_deterministic_algorithms_warn_only()
+
+def set_deterministic_debug_mode(debug_mode: Union[builtins.int, str]) -> None:
+    r"""Sets the debug mode for deterministic operations.
+
+    .. note:: This is an alternative interface for
+        :func:`torch.use_deterministic_algorithms`. Refer to that function's
+        documentation for details about affected operations.
+
+    Args:
+        debug_mode(str or int): If "default" or 0, don't error or warn on
+            nondeterministic operations. If "warn" or 1, warn on
+            nondeterministic operations. If "error" or 2, error on
+            nondeterministic operations.
+    """
+
+    # NOTE: builtins.int is used here because int in this scope resolves
+    # to torch.int
+    if not isinstance(debug_mode, (builtins.int, str)):
+        raise TypeError(f'debug_mode must be str or int, but got {type(debug_mode)}')
+
+    if isinstance(debug_mode, str):
+        if debug_mode == 'default':
+            debug_mode = 0
+        elif debug_mode == 'warn':
+            debug_mode = 1
+        elif debug_mode == 'error':
+            debug_mode = 2
+        else:
+            raise RuntimeError(
+                'invalid value of debug_mode, expected one of `default`, '
+                f'`warn`, `error`, but got {debug_mode}')
+
+    if debug_mode == 0:
+        _C._set_deterministic_algorithms(False)
+    elif debug_mode == 1:
+        _C._set_deterministic_algorithms(True, warn_only=True)
+    elif debug_mode == 2:
+        _C._set_deterministic_algorithms(True)
+    else:
+        raise RuntimeError(
+            'invalid value of debug_mode, expected 0, 1, or 2, '
+            f'but got {debug_mode}')
+
+def get_deterministic_debug_mode() -> builtins.int:
+    r"""Returns the current value of the debug mode for deterministic
+    operations. Refer to :func:`torch.set_deterministic_debug_mode`
+    documentation for more details.
+    """
+
+    if _C._get_deterministic_algorithms():
+        if _C._get_deterministic_algorithms_warn_only():
+            return 1
+        else:
+            return 2
+    else:
+        return 0
+
+def get_float32_matmul_precision() -> builtins.str:
+    r"""Returns the current value of float32 matrix multiplication precision. Refer to
+    :func:`torch.set_float32_matmul_precision` documentation for more details.
+    """
+    return _C._get_float32_matmul_precision()
+
+def set_float32_matmul_precision(precision: str) -> None:
+    r"""Sets the internal precision of float32 matrix multiplications.
+
+    Running float32 matrix multiplications in lower precision may significantly increase
+    performance, and in some programs the loss of precision has a negligible impact.
+
+    Supports three settings:
+
+        * "highest", float32 matrix multiplications use the float32 datatype (24 mantissa
+          bits with 23 bits explicitly stored) for internal computations.
+        * "high", float32 matrix multiplications either use the TensorFloat32 datatype (10
+          mantissa bits explicitly stored) or treat each float32 number as the sum of two bfloat16 numbers
+          (approximately 16 mantissa bits with 14 bits explicitly stored), if the appropriate fast matrix multiplication
+          algorithms are available.  Otherwise float32 matrix multiplications are computed
+          as if the precision is "highest".  See below for more information on the bfloat16
+          approach.
+        * "medium", float32 matrix multiplications use the bfloat16 datatype (8 mantissa
+          bits with 7 bits explicitly stored) for internal computations, if a fast matrix multiplication algorithm
+          using that datatype internally is available. Otherwise float32
+          matrix multiplications are computed as if the precision is "high".
+
+    When using "high" precision, float32 multiplications may use a bfloat16-based algorithm
+    that is more complicated than simply truncating to some smaller number mantissa bits
+    (e.g. 10 for TensorFloat32, 7 for bfloat16 explicitly stored).  Refer to [Henry2019]_ for a complete
+    description of this algorithm.  To briefly explain here, the first step is to realize
+    that we can perfectly encode a single float32 number as the sum of three bfloat16
+    numbers (because float32 has 23 mantissa bits while bfloat16 has 7 explicitly stored, and both have the
+    same number of exponent bits).  This means that the product of two float32 numbers can
+    be exactly given by the sum of nine products of bfloat16 numbers.  We can then trade
+    accuracy for speed by dropping some of these products.  The "high" precision algorithm
+    specifically keeps only the three most significant products, which conveniently excludes
+    all of the products involving the last 8 mantissa bits of either input.  This means that
+    we can represent our inputs as the sum of two bfloat16 numbers rather than three.
+    Because bfloat16 fused-multiply-add (FMA) instructions are typically >10x faster than
+    float32 ones, it's faster to do three multiplications and 2 additions with bfloat16
+    precision than it is to do a single multiplication with float32 precision.
+
+    .. [Henry2019] http://arxiv.org/abs/1904.06376
+
+    .. note::
+
+        This does not change the output dtype of float32 matrix multiplications,
+        it controls how the internal computation of the matrix multiplication is performed.
+
+    .. note::
+
+        This does not change the precision of convolution operations. Other flags,
+        like `torch.backends.cudnn.allow_tf32`, may control the precision of convolution
+        operations.
+
+    .. note::
+
+        This flag currently only affects one native device type: CUDA.
+        If "high" or "medium" are set then the TensorFloat32 datatype will be used
+        when computing float32 matrix multiplications, equivalent to setting
+        `torch.backends.cuda.matmul.allow_tf32 = True`. When "highest" (the default)
+        is set then the float32 datatype is used for internal computations, equivalent
+        to setting `torch.backends.cuda.matmul.allow_tf32 = False`.
+
+    Args:
+        precision(str): can be set to "highest" (default), "high", or "medium" (see above).
+
+    """
+    _C._set_float32_matmul_precision(precision)
+
+def set_warn_always(b: builtins.bool) -> None:
+    r"""When this flag is False (default) then some PyTorch warnings may only
+    appear once per process. This helps avoid excessive warning information.
+    Setting it to True causes these warnings to always appear, which may be
+    helpful when debugging.
+
+    Args:
+        b (:class:`bool`): If True, force warnings to always be emitted
+                           If False, set to the default behaviour
+    """
+    _C._set_warnAlways(b)
+
+def is_warn_always_enabled() -> builtins.bool:
+    r"""Returns True if the global warn_always flag is turned on. Refer to
+    :func:`torch.set_warn_always` documentation for more details.
+    """
+    return _C._get_warnAlways()
+
+################################################################################
+# Define error checking functions
+################################################################################
+
+# These error checking functions must be kept consistent with their C++
+# equivalents. Their C++ equivalents are mentioned where applicable.
+
+def _check_with(error_type, cond: Union[builtins.bool, SymBool], message: Callable[[], str]):  # noqa: F811
+    if not isinstance(cond, (builtins.bool, torch.SymBool)):
+        raise TypeError(f'cond must be a bool, but got {type(cond)}')
+
+    from torch.fx.experimental.symbolic_shapes import expect_true
+    if expect_true(cond):
+        return
+
+    # error_type must be a subclass of Exception and not subclass of Warning
+    assert issubclass(error_type, Exception) and not issubclass(error_type, Warning)
+
+    if message is None:
+        message_evaluated = (
+            'Expected cond to be True, but got False. (Could this error '
+            'message be improved? If so, please report an enhancement request '
+            'to PyTorch.)')
+
+    else:
+        if not callable(message):
+            raise TypeError('message must be a callable')
+
+        message_evaluated = str(message())
+
+    raise error_type(message_evaluated)
+
+def _check(cond, message=None):  # noqa: F811
+    r"""Throws error containing an optional message if the specified condition
+    is False.
+
+    Error type: ``RuntimeError``
+
+    C++ equivalent: ``TORCH_CHECK``
+
+    Args:
+        cond (:class:`bool`): If False, throw error
+
+        message (Callable, optional): Callable that returns either a string or
+            an object that has a ``__str__()`` method to be used as the error
+            message. Default: ``None``
+    """
+    _check_with(RuntimeError, cond, message)
+
+def _check_is_size(i, message=None):
+    """Checks that a given integer is a valid size (i.e., is non-negative).
+    You should use this over _check(i >= 0) because we can use the semantic
+    information (that i is a size) to make some further inferences in case
+    i is an unbacked SymInt.
+
+    NB: Do NOT use this in contexts where a -1 size would be valid (indicating
+    to infer the size from context, or if you should wrap-around or truncate).
+    Only use this if the only valid value is an honest to goodness size.
+    """
+    # This is responsible for the expect_true
+    _check(i >= 0, message)
+    from torch.fx.experimental.symbolic_shapes import _advise_is_size
+    _advise_is_size(i)
+
+def _check_index(cond, message=None):  # noqa: F811
+    r"""Throws error containing an optional message if the specified condition
+    is False.
+
+    Error type: ``IndexError``
+
+    C++ equivalent: ``TORCH_CHECK_INDEX``
+
+    Args:
+        cond (:class:`bool`): If False, throw error
+
+        message (Callable, optional): Callable that returns either a string or
+            an object that has a ``__str__()`` method to be used as the error
+            message. Default: ``None``
+    """
+    _check_with(IndexError, cond, message)
+
+def _check_value(cond, message=None):  # noqa: F811
+    r"""Throws error containing an optional message if the specified condition
+    is False.
+
+    Error type: ``ValueError``
+
+    C++ equivalent: ``TORCH_CHECK_VALUE``
+
+    Args:
+        cond (:class:`bool`): If False, throw error
+
+        message (Callable, optional): Callable that returns either a string or
+            an object that has a ``__str__()`` method to be used as the error
+            message. Default: ``None``
+    """
+    _check_with(ValueError, cond, message)
+
+def _check_type(cond, message=None):  # noqa: F811
+    r"""Throws error containing an optional message if the specified condition
+    is False.
+
+    Error type: ``TypeError``
+
+    C++ equivalent: ``TORCH_CHECK_TYPE``
+
+    Args:
+        cond (:class:`bool`): If False, throw error
+
+        message (Callable, optional): Callable that returns either a string or
+            an object that has a ``__str__()`` method to be used as the error
+            message. Default: ``None``
+    """
+    _check_with(TypeError, cond, message)
+
+def _check_not_implemented(cond, message=None):  # noqa: F811
+    r"""Throws error containing an optional message if the specified condition
+    is False.
+
+    Error type: ``NotImplementedError``
+
+    C++ equivalent: ``TORCH_CHECK_NOT_IMPLEMENTED``
+
+    Args:
+        cond (:class:`bool`): If False, throw error
+
+        message (Callable, optional): Callable that returns either a string or
+            an object that has a ``__str__()`` method to be used as the error
+            message. Default: ``None``
+    """
+    _check_with(NotImplementedError, cond, message)
+
+def _check_tensor_all_with(error_type, cond, message=None):  # noqa: F811
+    if not torch.is_tensor(cond):
+        raise TypeError(f'cond must be a tensor, but got {type(cond)}')
+
+    if not cond.dtype == torch.bool:
+        raise TypeError(
+            f'cond tensor must have dtype torch.bool, but got {cond.dtype}')
+
+    _check_with(error_type, cond._is_all_true().item(), message)
+
+# C++ equivalent: `TORCH_CHECK_TENSOR_ALL`
+def _check_tensor_all(cond, message=None):  # noqa: F811
+    r"""Throws error containing an optional message if the specified condition
+    is False.
+
+    Error type: ``RuntimeError``
+
+    C++ equivalent: ``TORCH_CHECK_TENSOR_ALL``
+
+    Args:
+        cond (:class:`torch.Tensor`): Tensor of dtype ``torch.bool``. If any
+            element is ``False``, throw error
+
+        message (Callable, optional): Callable that returns either a string or
+            an object that has a ``__str__()`` method to be used as the error
+            message. Default: ``None``
+    """
+    _check_tensor_all_with(RuntimeError, cond, message)
+
+################################################################################
+# Define numeric constants
+################################################################################
+
+# For Python Array API (https://data-apis.org/array-api/latest/API_specification/constants.html) and
+# NumPy consistency (https://numpy.org/devdocs/reference/constants.html)
+from math import e , nan , inf , pi
+__all__.extend(['e', 'pi', 'nan', 'inf'])
+
+################################################################################
+# Define Storage and Tensor classes
+################################################################################
+
+from ._tensor import Tensor
+from .storage import _StorageBase, TypedStorage, _LegacyStorage, UntypedStorage, _warn_typed_storage_removal
+
+# NOTE: New <type>Storage classes should never be added. When adding a new
+# dtype, use torch.storage.TypedStorage directly.
+
+class ByteStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.uint8
+
+class DoubleStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.double
+
+class FloatStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.float
+
+class HalfStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.half
+
+class LongStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.long
+
+class IntStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.int
+
+class ShortStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.short
+
+class CharStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.int8
+
+class BoolStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.bool
+
+class BFloat16Storage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.bfloat16
+
+class ComplexDoubleStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.cdouble
+
+class ComplexFloatStorage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.cfloat
+
+class QUInt8Storage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.quint8
+
+class QInt8Storage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.qint8
+
+class QInt32Storage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.qint32
+
+class QUInt4x2Storage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.quint4x2
+
+class QUInt2x4Storage(_LegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal(stacklevel=3)
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.quint2x4
+
+_storage_classes = {
+    UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage,
+    ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage,
+    QUInt8Storage, QInt8Storage, QInt32Storage, BFloat16Storage,
+    ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage, QUInt2x4Storage,
+    TypedStorage
+}
+
+# The _tensor_classes set is initialized by the call to initialize_python_bindings.
+_tensor_classes: Set[Type] = set()
+
+# If you edit these imports, please update torch/__init__.py.in as well
+from .random import set_rng_state, get_rng_state, manual_seed, initial_seed, seed
+from .serialization import save, load
+from ._tensor_str import set_printoptions
+
+################################################################################
+# Initialize extension
+################################################################################
+
+def manager_path():
+    if _running_with_deploy() or platform.system() == 'Windows':
+        return b""
+    path = get_file_path('torch', 'bin', 'torch_shm_manager')
+    prepare_multiprocessing_environment(get_file_path('torch'))
+    if not os.path.exists(path):
+        raise RuntimeError("Unable to find torch_shm_manager at " + path)
+    return path.encode('utf-8')
+
+from torch.amp import autocast, GradScaler
+
+# Initializing the extension shadows the built-in python float / int classes;
+# store them for later use by SymInt / SymFloat.
+py_float = float
+py_int = int
+
+# Shared memory manager needs to know the exact location of manager executable
+_C._initExtension(manager_path())
+del manager_path
+
+# Appease the type checker: it can't deal with direct setting of globals().
+# Note that we will see "too many" functions when reexporting this way; there
+# is not a good way to fix this problem.  Perhaps, try to redesign VariableFunctions
+# so that this import is good enough
+if TYPE_CHECKING:
+    # Some type signatures pulled in from _VariableFunctions here clash with
+    # signatures already imported. For now these clashes are ignored; see
+    # PR #43339 for details.
+    from torch._C._VariableFunctions import *  # type: ignore[assignment, misc] # noqa: F403
+    # Fixup segment_reduce visibility
+    _segment_reduce = segment_reduce
+    del segment_reduce  # noqa: F821
+
+# Ops not to be exposed in `torch` namespace,
+# mostly helper ops.
+PRIVATE_OPS = (
+    'unique_dim',
+)
+
+for name in dir(_C._VariableFunctions):
+    if name.startswith('__') or name in PRIVATE_OPS:
+        continue
+    obj = getattr(_C._VariableFunctions, name)
+    obj.__module__ = 'torch'
+    # Hide some APIs that should not be public
+    if name == "segment_reduce":
+        # TODO: Once the undocumented FC window is passed, remove the line bellow
+        globals()[name] = obj
+        name = "_" + name
+    globals()[name] = obj
+    if not name.startswith("_"):
+        __all__.append(name)
+
+
+################################################################################
+# Add torch.dtype instances to the public API
+################################################################################
+
+import torch
+
+for attribute in dir(torch):
+    if isinstance(getattr(torch, attribute), torch.dtype):
+        __all__.append(attribute)
+
+################################################################################
+# Import TorchDynamo's lazy APIs to avoid circular dependenices
+################################################################################
+
+# needs to be before from .functional import * to avoid circular dependencies
+from ._compile import _disable_dynamo
+
+################################################################################
+# Import interface functions defined in Python
+################################################################################
+
+# needs to be after the above ATen bindings so we can overwrite from Python side
+from .functional import *  # noqa: F403
+
+
+################################################################################
+# Remove unnecessary members
+################################################################################
+
+del _StorageBase
+del _LegacyStorage
+
+################################################################################
+# Define _assert
+################################################################################
+
+# needs to be before the submodule imports to avoid circular dependencies
+def _assert(condition, message):
+    r"""A wrapper around Python's assert which is symbolically traceable.
+    """
+    from .overrides import has_torch_function, handle_torch_function
+
+    if type(condition) is not torch.Tensor and has_torch_function((condition,)):
+        return handle_torch_function(_assert, (condition,), condition, message)
+    assert condition, message
+
+################################################################################
+# Import most common subpackages
+################################################################################
+
+# Use the redundant form so that type checkers know that these are a part of
+# the public API. The "regular" import lines are there solely for the runtime
+# side effect of adding to the imported module's members for other users.
+from torch import cuda as cuda
+from torch import cpu as cpu
+from torch import mps as mps
+from torch import xpu as xpu
+from torch import autograd as autograd
+from torch.autograd import (
+    no_grad as no_grad,
+    enable_grad as enable_grad,
+    set_grad_enabled as set_grad_enabled,
+    inference_mode as inference_mode,
+)
+from torch import fft as fft
+from torch import futures as futures
+from torch import _awaits as _awaits
+from torch import nested as nested
+from torch import nn as nn
+from torch.signal import windows as windows
+from torch import optim as optim
+import torch.optim._multi_tensor
+from torch import multiprocessing as multiprocessing
+from torch import sparse as sparse
+from torch import special as special
+import torch.utils.backcompat
+from torch import jit as jit
+from torch import linalg as linalg
+from torch import hub as hub
+from torch import random as random
+from torch import distributions as distributions
+from torch import testing as testing
+from torch import backends as backends
+import torch.utils.data
+from torch import __config__ as __config__
+from torch import __future__ as __future__
+from torch import profiler as profiler
+
+# Quantized, sparse, AO, etc. should be last to get imported, as nothing
+# is expected to depend on them.
+from torch import ao as ao
+# nn.quant* depends on ao -- so should be after those.
+import torch.nn.quantizable
+import torch.nn.quantized
+import torch.nn.qat
+import torch.nn.intrinsic
+
+_C._init_names(list(torch._storage_classes))
+
+# attach docstrings to torch and tensor functions
+from . import _torch_docs, _tensor_docs, _storage_docs
+del _torch_docs, _tensor_docs, _storage_docs
+
+
+def compiled_with_cxx11_abi() -> builtins.bool:
+    r"""Returns whether PyTorch was built with _GLIBCXX_USE_CXX11_ABI=1"""
+    return _C._GLIBCXX_USE_CXX11_ABI
+
+
+# Import the ops "namespace"
+from torch._ops import ops
+from torch._classes import classes
+import torch._library
+
+# quantization depends on torch.fx
+# Import quantization
+from torch import quantization as quantization
+
+# Import the quasi random sampler
+from torch import quasirandom as quasirandom
+
+# If you are seeing this, it means that this call site was not checked if
+# the memory format could be preserved, and it was switched to old default
+# behaviour of contiguous
+legacy_contiguous_format = contiguous_format
+
+# Register fork handler to initialize OpenMP in child processes (see gh-28389)
+from torch.multiprocessing._atfork import register_after_fork
+register_after_fork(torch.get_num_threads)
+del register_after_fork
+
+# Import tools that require fully imported torch (for applying
+# torch.jit.script as a decorator, for instance):
+from ._lobpcg import lobpcg as lobpcg
+
+# These were previously defined in native_functions.yaml and appeared on the
+# `torch` namespace, but we moved them to c10 dispatch to facilitate custom
+# class usage. We add these lines here to preserve backward compatibility.
+quantized_lstm = torch.ops.aten.quantized_lstm
+quantized_gru = torch.ops.aten.quantized_gru
+
+from torch.utils.dlpack import from_dlpack, to_dlpack
+
+# Import experimental masked operations support. See
+# [RFC-0016](https://github.com/pytorch/rfcs/pull/27) for more
+# information.
+from . import masked
+
+# Import removed ops with error message about removal
+from ._linalg_utils import (  # type: ignore[misc]
+    matrix_rank,
+    eig,
+    solve,
+    lstsq,
+)
+from ._linalg_utils import _symeig as symeig  # type: ignore[misc]
+
+class _TorchCompileInductorWrapper:
+    compiler_name = "inductor"
+
+    def __init__(self, mode, options, dynamic):
+        self.config: Dict[str, Any] = dict()
+        self.dynamic = dynamic
+        self.apply_mode(mode)
+        self.apply_options(options)
+
+        if self.config.get("triton.cudagraphs", False):
+            os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
+            # FIXME: CUDA Graph does not work well with CUPTI teardown.
+            #   1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11)
+            #   2) crashes on 2nd non-lazy CUPTI re-init after teardown (CUDA 12)
+            # Workaround: turn off CUPTI teardown when using CUDA Graphs.
+            os.environ["TEARDOWN_CUPTI"] = "0"
+
+    def __eq__(self, other):
+        return (isinstance(other, _TorchCompileInductorWrapper) and
+                self.config == other.config and
+                self.dynamic == other.dynamic)
+
+    def apply_mode(self, mode: Optional[str]):
+        if mode is None or mode == "default":
+            pass
+        elif mode in ("reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"):
+            from torch._inductor import list_mode_options
+            self.apply_options(list_mode_options(mode, self.dynamic))
+        else:
+            raise RuntimeError(
+                f"Unrecognized mode={mode}, should be one of: default, reduce-overhead, max-autotune, max-autotune-no-cudagraphs"
+            )
+
+    def apply_options(self, options: Optional[Dict[str, Any]]):
+        if not options:
+            return
+
+        from torch._inductor import config
+        current_config: Dict[str, Any] = config.shallow_copy_dict()
+
+        for key, val in options.items():
+            attr_name = key.replace("-", "_")
+            if attr_name not in current_config:
+                raise RuntimeError(
+                    f"Unexpected optimization option {key}, known options are {list(current_config.keys())}"
+                )
+            if type(val) is not type(current_config[attr_name]):
+                val_type_str = type(val).__name__
+                expected_type_str = type(current_config[attr_name]).__name__
+                raise RuntimeError(
+                    f"Unexpected type of attr {key}, got {val_type_str} should be {expected_type_str}"
+                )
+            self.config[attr_name] = val
+
+    def __call__(self, model_, inputs_):
+        from torch._inductor.compile_fx import compile_fx
+
+        return compile_fx(model_, inputs_, config_patches=self.config)
+
+    def get_compiler_config(self):
+        from torch._inductor.compile_fx import get_patched_config_dict
+        return get_patched_config_dict(config_patches=self.config)
+
+    def reset(self):
+        from torch._inductor import config
+        if "triton.cudagraphs" in self.config or config.triton.cudagraphs:
+            if self.config.get("triton.cudagraphs", True):
+                from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+                reset_cudagraph_trees()
+
+class _TorchCompileWrapper:
+    def __init__(self, backend, mode, options, dynamic):
+        from torch._dynamo.backends.registry import lookup_backend
+
+        if isinstance(backend, str):
+            self.compiler_name = backend
+        elif hasattr(backend, "__name__"):
+            self.compiler_name = backend.__name__
+        else:
+            self.compiler_name = str(backend)
+        self.dynamic = dynamic
+        self.compiler_fn = lookup_backend(backend)
+        self.kwargs = {}
+        # only pass the args if they non-empty
+        if mode and mode != "default":
+            self.kwargs["mode"] = mode
+        if options:
+            self.kwargs["options"] = options
+
+    def __eq__(self, other):
+        return (isinstance(other, _TorchCompileWrapper) and
+                self.compiler_fn == other.compiler_fn and
+                self.kwargs == other.kwargs and
+                self.dynamic == other.dynamic)
+
+    def __call__(self, model_, inputs_):
+        return self.compiler_fn(model_, inputs_, **self.kwargs)
+
+    def reset(self):
+        if hasattr(self.compiler_fn, "reset"):
+            self.compiler_fn.reset()
+
+
+def compile(model: Optional[Callable] = None, *,
+            fullgraph: builtins.bool = False,
+            dynamic: Optional[builtins.bool] = None,
+            backend: Union[str, Callable] = "inductor",
+            mode: Union[str, None] = None,
+            options: Optional[Dict[str, Union[str, builtins.int, builtins.bool]]] = None,
+            disable: builtins.bool = False) -> Callable:
+    """
+    Optimizes given model/function using TorchDynamo and specified backend.
+
+    Concretely, for every frame executed within the compiled region, we will attempt
+    to compile it and cache the compiled result on the code object for future
+    use.  A single frame may be compiled multiple times if previous compiled
+    results are not applicable for subsequent calls (this is called a "guard
+    failure), you can use TORCH_LOGS=guards to debug these situations.
+    Multiple compiled results can be associated with a frame up to
+    ``torch._dynamo.config.cache_size_limit``, which defaults to 64; at which
+    point we will fall back to eager.  Note that compile caches are per
+    *code object*, not frame; if you dynamically create multiple copies of a
+    function, they will all share the same code cache.
+
+    Args:
+       model (Callable): Module/function to optimize
+       fullgraph (bool): If False (default), torch.compile attempts to discover compileable regions
+        in the function that it will optimize. If True, then we require that the entire function be
+        capturable into a single graph. If this is not possible (that is, if there are graph breaks),
+        then this will raise an error.
+       dynamic (bool or None): Use dynamic shape tracing.  When this is True, we will up-front attempt
+        to generate a kernel that is as dynamic as possible to avoid recompilations when
+        sizes change.  This may not always work as some operations/optimizations will
+        force specialization; use TORCH_LOGS=dynamic to debug overspecialization.
+        When this is False, we will NEVER generate dynamic kernels, we will always specialize.
+        By default (None), we automatically detect if dynamism has occurred and compile a more
+        dynamic kernel upon recompile.
+       backend (str or Callable): backend to be used
+
+        - "inductor" is the default backend, which is a good balance between performance and overhead
+
+        - Non experimental in-tree backends can be seen with `torch._dynamo.list_backends()`
+
+        - Experimental or debug in-tree backends can be seen with `torch._dynamo.list_backends(None)`
+
+        - To register an out-of-tree custom backend: https://pytorch.org/docs/main/compile/custom-backends.html
+       mode (str): Can be either "default", "reduce-overhead", "max-autotune" or "max-autotune-no-cudagraphs"
+
+        - "default" is the default mode, which is a good balance between performance and overhead
+
+        - "reduce-overhead" is a mode that reduces the overhead of python with CUDA graphs,
+          useful for small batches.  Reduction of overhead can come at the cost of more memory
+          usage, as we will cache the workspace memory required for the invocation so that we
+          do not have to reallocate it on subsequent runs.  Reduction of overhead is not guaranteed
+          to work; today, we only reduce overhead for CUDA only graphs which do not mutate inputs.
+          There are other circumstances where CUDA graphs are not applicable; use TORCH_LOG=perf_hints
+          to debug.
+
+        - "max-autotune" is a mode that leverages Triton based matrix multiplications and convolutions
+          It enables CUDA graphs by default.
+
+        - "max-autotune-no-cudagraphs" is a mode similar to "max-autotune" but without CUDA graphs
+
+        - To see the exact configs that each mode sets you can call `torch._inductor.list_mode_options()`
+
+       options (dict): A dictionary of options to pass to the backend. Some notable ones to try out are
+
+        - `epilogue_fusion` which fuses pointwise ops into templates. Requires `max_autotune` to also be set
+
+        - `max_autotune` which will profile to pick the best matmul configuration
+
+        - `fallback_random` which is useful when debugging accuracy issues
+
+        - `shape_padding` which pads matrix shapes to better align loads on GPUs especially for tensor cores
+
+        - `triton.cudagraphs` which will reduce the overhead of python with CUDA graphs
+
+        - `trace.enabled` which is the most useful debugging flag to turn on
+
+        - `trace.graph_diagram` which will show you a picture of your graph after fusion
+
+        - For inductor you can see the full list of configs that it supports by calling `torch._inductor.list_options()`
+       disable (bool): Turn torch.compile() into a no-op for testing
+
+    Example::
+
+        @torch.compile(options={"triton.cudagraphs": True}, fullgraph=True)
+        def foo(x):
+            return torch.sin(x) + torch.cos(x)
+
+    """
+    _C._log_api_usage_once("torch.compile")
+    # Temporary until we get proper support for python 3.12
+    if sys.version_info >= (3, 12):
+        raise RuntimeError("Dynamo is not supported on Python 3.12+")
+
+    # Decorator mode
+    if model is None:
+        def fn(model: Callable):
+            if model is None:
+                raise RuntimeError("Model can't be None")
+            return compile(model,
+                           fullgraph=fullgraph,
+                           dynamic=dynamic,
+                           backend=backend,
+                           mode=mode,
+                           options=options,
+                           disable=disable)
+        return fn
+
+    if mode is not None and options is not None:
+        raise RuntimeError("Either mode or options can be specified, but both can't be specified at the same time.")
+    if mode is None and options is None:
+        mode = "default"
+    if backend == "inductor":
+        backend = _TorchCompileInductorWrapper(mode, options, dynamic)
+    else:
+        backend = _TorchCompileWrapper(backend, mode, options, dynamic)
+
+    return torch._dynamo.optimize(backend=backend, nopython=fullgraph, dynamic=dynamic, disable=disable)(model)
+
+
+from torch import export as export
+
+from torch._higher_order_ops import cond
+
+def _register_device_module(device_type, module):
+    r"""Register an external runtime module of the specific :attr:`device_type`
+    supported by torch.
+
+    After the :attr:`module` is registered correctly, the user can refer
+    the external runtime module as part of torch with attribute torch.xxx.
+    """
+    # Make sure the device_type represent a supported device type for torch.
+    device_type = torch.device(device_type).type
+    m = sys.modules[__name__]
+    if hasattr(m, device_type):
+        raise RuntimeError(f"The runtime module of '{device_type}' has already "
+                           f"been registered with '{getattr(m, device_type)}'")
+    setattr(m, device_type, module)
+    torch_module_name = '.'.join([__name__, device_type])
+    sys.modules[torch_module_name] = module
+
+# expose return_types
+from . import return_types
+from . import library
+if not TYPE_CHECKING:
+    from . import _meta_registrations
+
+# Enable CUDA Sanitizer
+if 'TORCH_CUDA_SANITIZER' in os.environ:
+    import torch.cuda._sanitizer as csan
+
+    csan.enable_cuda_sanitizer()
+
+# Populate magic methods on SymInt and SymFloat
+import torch.fx.experimental.sym_node
+
+from torch import func as func
+from torch.func import vmap
+
+
+# The function _sparse_coo_tensor_unsafe is removed from PyTorch
+# Python API (v. 1.13), here we temporarily provide its replacement
+# with a deprecation warning.
+# TODO: remove the function for PyTorch v 1.15.
+def _sparse_coo_tensor_unsafe(*args, **kwargs):
+    import warnings
+    warnings.warn('torch._sparse_coo_tensor_unsafe is deprecated, '
+                  'use torch.sparse_coo_tensor(..., check_invariants=False) instead.')
+    kwargs['check_invariants'] = False
+    return torch.sparse_coo_tensor(*args, **kwargs)
+
+# Register MPS specific decomps
+torch.backends.mps._init()
+
+if not _running_with_deploy():
+    from torch import compiler as compiler
+
+    class _TritonLibrary:
+        lib = torch.library.Library("triton", "DEF")
+        ops_table: Dict[Tuple[str, str], Callable] = {}
+
+        @classmethod
+        def registerOp(cls, op_key, full_schema, op_impl, dispatch_key):
+            if (op_key, dispatch_key) not in cls.ops_table:
+                cls.lib.define(full_schema)
+                cls.lib.impl("triton::" + op_key, op_impl, dispatch_key)
+                cls.ops_table[(op_key, dispatch_key)] = op_impl
+
+            return cls.ops_table[(op_key, dispatch_key)]
+
+
+# Deprecated attributes
+_deprecated_attrs = {
+    "has_mps": torch.backends.mps.is_built,
+    "has_cuda": torch.backends.cuda.is_built,
+    "has_cudnn": torch.backends.cudnn.is_available,
+    "has_mkldnn": torch.backends.mkldnn.is_available,
+}
+
+if TYPE_CHECKING:
+    # Import the following modules during type checking to enable code intelligence features,
+    # such as auto-completion in tools like pylance, even when these modules are not explicitly
+    # imported in user code.
+    from torch import _dynamo as _dynamo
+    from torch import _inductor as _inductor
+    from torch import onnx as onnx
+
+else:
+    _lazy_modules = {
+        "_dynamo",
+        "_inductor",
+        "_export",
+        # ONNX must be imported after _dynamo, _ops, _subclasses, fx, func and jit
+        "onnx",
+    }
+
+    def __getattr__(name):
+        # Deprecated attrs
+        replacement = _deprecated_attrs.get(name)
+        if replacement is not None:
+            import warnings
+            warnings.warn(f"'{name}' is deprecated, please use '{replacement.__module__}.{replacement.__name__}()'", stacklevel=2)
+            return replacement()
+
+        # Lazy modules
+        if name in _lazy_modules:
+            import importlib
+            return importlib.import_module(f".{name}", __name__)
+
+        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
+def _constrain_as_value(symbol, min: Optional[builtins.int] = None, max: Optional[builtins.int] = None):
+    """
+    Add min/max constraint on the intermediate symbol at tracing time. If called in eager mode,
+    it will still check if the input value is within the specified range.
+    """
+    torch.sym_constrain_range(symbol, min=min, max=max)
+
+
+def _constrain_as_size(symbol, min: Optional[builtins.int] = None, max: Optional[builtins.int] = None):
+    """
+    This indicates that a given int is size-like, and can be used in any context where a size is expected.
+    You will typically use this when reading out integers from Tensors, e.g., max.item() or lengths.tolist()
+    which then need to be used as tensor constructors. Providing these assertions to PyTorch can help resolve
+      GuardOnDataDependentSymNode errors upon export, since we cannot guard on unbacked SymInts.
+
+    This function has unusual semantics which distinguish it from
+    constrain_as_value.  Specifically, in some circumstances in framework
+    code, we will treat this int as >= 2 (when we do a size-oblivious guard).
+    This makes it easier to This makes it easier to use the unbacked int in
+    size contexts, as we will often attempt to guard on a size being zero/one
+    (e.g., when computing the contiguity of a tensor, or testing if
+    broadcasting can occur), which will not work on unbacked SymInts.
+    However, if we conservatively assume that the size is not zero/one, we will
+    end up with a graph that will still work even if the size is zero/one.
+
+    For more details, see https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit
+    ```
+    """
+    torch.sym_constrain_range_for_size(symbol, min=min, max=max)
+
+
+from . import _logging
+_logging._init_logs()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe6f3a4603169f622e953a793f606f7fa74a9cbb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/__init__.py
@@ -0,0 +1,16 @@
+# torch.ao is a package with a lot of interdependencies.
+# We will use lazy import to avoid cyclic dependencies here.
+
+
+__all__ = [
+    "nn",
+    "ns",
+    "quantization",
+    "pruning",
+]
+
+def __getattr__(name):
+    if name in __all__:
+        import importlib
+        return importlib.import_module("." + name, __name__)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6da38a6c27328910a9300a47fdf70b59362390c
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f711ad5d4a946c86dd6b59045237695c9d2c456
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/conv.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b588d84a74e009c567d4a2d0ede6ebca1a3d11e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/conv.py
@@ -0,0 +1,270 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.utils import _single, _pair, _triple
+from torch.ao.nn.intrinsic import _FusedModule
+from typing import Tuple, TypeVar, Union
+from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t
+
+__all__ = [
+    "Conv1d",
+    "Conv2d",
+    "Conv3d"
+]
+
+MOD = TypeVar('MOD', bound=nn.modules.conv._ConvNd)
+
+class _ConvNd(nn.modules.conv._ConvNd):
+
+    _FLOAT_MODULE = MOD
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Tuple[int, ...],
+                 stride: Tuple[int, ...],
+                 padding: Tuple[int, ...],
+                 dilation: Tuple[int, ...],
+                 transposed: bool,
+                 output_padding: Tuple[int, ...],
+                 groups: int,
+                 bias: bool,
+                 padding_mode: str,
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        nn.modules.conv._ConvNd.__init__(self, in_channels, out_channels, kernel_size,
+                                         stride, padding, dilation, transposed,
+                                         output_padding, groups, bias, padding_mode, **factory_kwargs)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @staticmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module
+
+            Args:
+               `mod`: a float module, either produced by torch.ao.quantization utilities
+               or directly from user
+        """
+        assert type(mod) == cls._FLOAT_MODULE, (
+            "qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__  # type: ignore[attr-defined]
+        )
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert mod.qconfig, 'Input float module must have a valid qconfig'
+        if issubclass(type(mod), _FusedModule):
+            mod = mod[0]  # type: ignore[index]
+        qconfig = mod.qconfig
+        qat_conv = cls(mod.in_channels, mod.out_channels, mod.kernel_size,
+                       stride=mod.stride, padding=mod.padding, dilation=mod.dilation,
+                       groups=mod.groups, bias=mod.bias is not None,
+                       padding_mode=mod.padding_mode, qconfig=qconfig)
+        qat_conv.weight = mod.weight
+        qat_conv.bias = mod.bias
+        return qat_conv
+
+    def to_float(self):
+        """ This works for both single qat conv, and the qat conv - relu modules
+        to convert the qat module to a floating point module
+        """
+        cls = type(self)
+        conv = cls._FLOAT_CONV_MODULE(  # type: ignore[attr-defined, operator]
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,  # type: ignore[arg-type]
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.dilation,  # type: ignore[arg-type]
+            self.groups,
+            self.bias is not None,
+            self.padding_mode)
+        conv.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            conv.bias = torch.nn.Parameter(self.bias.detach())
+        # conv relu
+        if issubclass(cls, _FusedModule):
+            modules = [conv]
+            assert hasattr(cls, "_FLOAT_RELU_MODULE")
+            relu = cls._FLOAT_RELU_MODULE()  # type: ignore[attr-defined]
+            modules.append(relu)
+            fused = cls._FLOAT_MODULE(*modules)  # type: ignore[arg-type, attr-defined, operator]
+            fused.train(self.training)
+            return fused
+        else:
+            return conv
+
+class Conv1d(_ConvNd, nn.Conv1d):
+    r"""
+    A Conv1d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as :class:`~torch.nn.Conv1d`
+
+    Similar to :class:`~torch.nn.Conv2d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.Conv1d
+    _FLOAT_CONV_MODULE = nn.Conv1d
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: Union[str, _size_1_t] = 0,
+                 dilation: _size_1_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        kernel_size_ = _single(kernel_size)
+        stride_ = _single(stride)
+        padding_ = padding if isinstance(padding, str) else _single(padding)
+        dilation_ = _single(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,
+            output_padding=_single(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(cls, mod)
+
+class Conv2d(_ConvNd, nn.Conv2d):
+    r"""
+    A Conv2d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Conv2d`, please see
+    https://pytorch.org/docs/stable/nn.html?highlight=conv2d#torch.nn.Conv2d
+    for documentation.
+
+    Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.Conv2d
+    _FLOAT_CONV_MODULE = nn.Conv2d
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_2_t,
+                 stride: _size_2_t = 1,
+                 padding: Union[str, _size_2_t] = 0,
+                 dilation: _size_2_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        kernel_size_ = _pair(kernel_size)
+        stride_ = _pair(stride)
+        padding_ = padding if isinstance(padding, str) else _pair(padding)
+        dilation_ = _pair(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,
+            output_padding=_pair(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype)
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(cls, mod)
+
+class Conv3d(_ConvNd, nn.Conv3d):
+    r"""
+    A Conv3d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Conv3d`, please see
+    https://pytorch.org/docs/stable/nn.html?highlight=conv3d#torch.nn.Conv3d
+    for documentation.
+
+    Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.Conv3d
+    _FLOAT_CONV_MODULE = nn.Conv3d
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_3_t,
+                 stride: _size_3_t = 1,
+                 padding: Union[str, _size_3_t] = 0,
+                 dilation: _size_3_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        kernel_size_ = _triple(kernel_size)
+        stride_ = _triple(stride)
+        padding_ = padding if isinstance(padding, str) else _triple(padding)
+        dilation_ = _triple(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,
+            output_padding=_triple(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype)
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(cls, mod)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d43ed3f6c22dbf08c50d6cd30195e2f482172e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/linear.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.ao.nn.intrinsic import LinearReLU
+from torch.nn.utils.parametrize import (
+    is_parametrized,
+    type_before_parametrizations,
+    transfer_parametrizations_and_params,
+)
+
+__all__ = [
+    "Linear"
+]
+
+class Linear(nn.Linear):
+    r"""
+    A linear module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
+    for documentation.
+
+    Similar to `torch.nn.Linear`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.Linear
+
+    def __init__(self, in_features, out_features, bias=True,
+                 qconfig=None, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(in_features, out_features, bias, **factory_kwargs)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)
+
+    def forward(self, input):
+        return F.linear(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module or qparams_dict
+            Args: `mod` a float module, either produced by torch.ao.quantization utilities
+            or directly from user
+        """
+        assert type_before_parametrizations(mod) == cls._FLOAT_MODULE, (
+            " qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        if type_before_parametrizations(mod) == LinearReLU:
+            mod = mod[0]
+
+        qconfig = mod.qconfig
+        qat_linear = cls(mod.in_features, mod.out_features, bias=mod.bias is not None, qconfig=qconfig)
+
+        if is_parametrized(mod, "weight"):
+            transfer_parametrizations_and_params(mod, qat_linear, "weight")
+        else:
+            qat_linear.weight = mod.weight
+
+        if is_parametrized(mod, "bias"):
+            transfer_parametrizations_and_params(mod, qat_linear, "bias")
+        else:
+            qat_linear.bias = mod.bias
+
+        return qat_linear
+
+    def to_float(self):
+        linear = torch.nn.Linear(self.in_features, self.out_features, self.bias is not None)
+        linear.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            linear.bias = torch.nn.Parameter(self.bias.detach())
+        linear.train(self.training)
+        return linear
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6c720e9b6094f0f44ef74d3f50f5c282f1738c7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/linear.py
@@ -0,0 +1,303 @@
+from collections.abc import Iterable
+import torch
+
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+from torch.nn.utils.fusion import fuse_linear_bn_weights
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+from typing import Optional
+
+from .utils import _quantize_weight, _hide_packed_params_repr, WeightedQuantizedModule
+
+__all__ = ['LinearPackedParams', 'Linear']
+
+
+class LinearPackedParams(torch.nn.Module):
+    _version = 3
+
+    def __init__(self, dtype=torch.qint8):
+        super().__init__()
+        self.dtype = dtype
+        if self.dtype == torch.qint8:
+            wq = torch._empty_affine_quantized([1, 1], scale=1.0, zero_point=0, dtype=torch.qint8)
+        elif self.dtype == torch.float16:
+            wq = torch.zeros([1, 1], dtype=torch.float)
+        self.set_weight_bias(wq, None)  # type: ignore[possibly-undefined]
+
+    @torch.jit.export
+    def set_weight_bias(self, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> None:
+        if self.dtype == torch.qint8:
+            self._packed_params = torch.ops.quantized.linear_prepack(weight, bias)
+        elif self.dtype == torch.float16:
+            self._packed_params = torch.ops.quantized.linear_prepack_fp16(weight, bias)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear!')
+
+
+    @torch.jit.export
+    def _weight_bias(self):
+        if self.dtype == torch.qint8:
+            return torch.ops.quantized.linear_unpack(self._packed_params)
+        elif self.dtype == torch.float16:
+            return torch.ops.quantized.linear_unpack_fp16(self._packed_params)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear!')
+
+    def forward(self, x):
+        return x
+
+    # Version 1
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- dtype : torch.dtype
+    #
+    # Version 3
+    #   self
+    #   |--- _packed_params : (Tensor, Tensor) representing (weight, bias)
+    #                         of LinearPackedParams
+    #   |--- dtype : torch.dtype
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'dtype'] = self.dtype
+        destination[prefix + '_packed_params'] = self._weight_bias()
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        if version is None or version < 2:
+            self.dtype = torch.qint8
+        else:
+            self.dtype = state_dict[prefix + 'dtype']
+            state_dict.pop(prefix + 'dtype')
+
+        if version is None or version < 3:
+            self.set_weight_bias(state_dict[prefix + 'weight'], state_dict[prefix + 'bias'])
+            state_dict.pop(prefix + 'weight')
+            state_dict.pop(prefix + 'bias')
+
+        if version == 3:
+            weight, bias = state_dict[prefix + '_packed_params']
+            state_dict.pop(prefix + '_packed_params')
+            self.set_weight_bias(weight, bias)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+
+    def __repr__(self):
+        return self._weight_bias().__repr__()
+
+
+class Linear(WeightedQuantizedModule):
+    r"""
+    A quantized linear module with quantized tensor as inputs and outputs.
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation.
+
+    Similar to :class:`~torch.nn.Linear`, attributes will be randomly
+    initialized at module creation time and will be overwritten later
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{out\_features}, \text{in\_features})`.
+        bias (Tensor): the non-learnable bias of the module of shape :math:`(\text{out\_features})`.
+                If :attr:`bias` is ``True``, the values are initialized to zero.
+        scale: `scale` parameter of output Quantized Tensor, type: double
+        zero_point: `zero_point` parameter for output Quantized Tensor, type: long
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> m = nn.quantized.Linear(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> # xdoctest: +SKIP
+        >>> input = torch.quantize_per_tensor(input, 1.0, 0, torch.quint8)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _version = 3
+    _FLOAT_MODULE = (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear)
+
+    def __init__(self, in_features, out_features, bias_=True,
+                 dtype=torch.qint8):
+        super().__init__()
+        # We don't muck around with buffers or attributes or anything here
+        # to keep the module simple. *everything* is simply a Python attribute.
+        # Serialization logic is explicitly handled in the below serialization and
+        # deserialization modules
+        self.in_features = in_features
+        self.out_features = out_features
+        bias = None
+        if bias_:
+            bias = torch.zeros(out_features, dtype=torch.float)
+
+        if dtype == torch.qint8:
+            qweight = torch._empty_affine_quantized(
+                [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8)
+        elif dtype == torch.float16:
+            qweight = torch.zeros([out_features, in_features], dtype=torch.float)
+        else:
+            raise RuntimeError('Unsupported dtype specified for quantized Linear!')
+
+        self._packed_params = LinearPackedParams(dtype)
+        self._packed_params.set_weight_bias(qweight, bias)
+        self.scale = 1.0
+        self.zero_point = 0
+
+    def _get_name(self):
+        return 'QuantizedLinear'
+
+    def extra_repr(self):
+        return 'in_features={}, out_features={}, scale={}, zero_point={}, qscheme={}'.format(
+            self.in_features, self.out_features, self.scale, self.zero_point, self.weight().qscheme()
+        )
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear(
+            x, self._packed_params._packed_params, self.scale, self.zero_point)
+
+    # ===== Serialization methods =====
+    # The special consideration here is that we have to unpack the weights into their
+    # regular QTensor form for serialization. Packed weights should not live
+    # outside the process in which they were created, rather they should be derived
+    # from the QTensor weight.
+    #
+    # Version 1
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- _packed_params : Module
+    #        |--- weight : Tensor
+    #        |--- bias : Tensor
+    #
+    # Version 3
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- _packed_params : Module
+    #        |--- _packed_params : (Tensor, Tensor) representing weight, bias
+    #                              of LinearPackedParams C++ struct
+    #
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'scale'] = torch.tensor(self.scale)
+        destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
+
+    # ===== Deserialization methods =====
+    # Counterpart to the serialization methods, we must pack the serialized QTensor
+    # weight into its packed format for use by the FBGEMM ops.
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.scale = float(state_dict[prefix + 'scale'])
+        state_dict.pop(prefix + 'scale')
+
+        self.zero_point = int(state_dict[prefix + 'zero_point'])
+        state_dict.pop(prefix + 'zero_point')
+
+        version = local_metadata.get('version', None)
+
+        if version is None or version == 1:
+            # We moved the parameters into a LinearPackedParameters submodule
+            weight = state_dict.pop(prefix + 'weight')
+            bias = state_dict.pop(prefix + 'bias')
+            state_dict.update({prefix + '_packed_params.weight': weight,
+                               prefix + '_packed_params.bias': bias})
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs)
+
+    # Function rather than property to make sure that JIT serialization doesn't
+    # register this as an attribute
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        self._packed_params.set_weight_bias(w, b)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized module from an observed float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+        """
+        if hasattr(mod, 'weight_fake_quant'):
+            if type_before_parametrizations(mod) == nniqat.LinearBn1d:
+                mod.weight, mod.bias = fuse_linear_bn_weights(
+                    mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var,
+                    mod.bn.eps, mod.bn.weight, mod.bn.bias)
+            weight_post_process = mod.weight_fake_quant
+            activation_post_process = mod.activation_post_process
+        else:
+            # This function does not participate in JIT, so it is OK to ignore
+            # the type mismatch in assignment. Also, mypy has an issue with
+            # iterables not being implemented, so we are ignoring those too.
+            if not isinstance(cls._FLOAT_MODULE, Iterable):
+                cls._FLOAT_MODULE = [cls._FLOAT_MODULE]  # type: ignore[assignment]
+            supported_modules = ', '.join([float_mod.__name__ for float_mod in cls._FLOAT_MODULE])  # type: ignore[attr-defined]
+            error_msg = f'nnq.{cls.__name__}.from_float only works for {supported_modules}, but got: {type(mod)}'
+            assert type_before_parametrizations(mod) in cls._FLOAT_MODULE, error_msg.format()  # type: ignore[attr-defined]
+            assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+            activation_post_process = mod.activation_post_process
+            if type_before_parametrizations(mod) == nni.LinearReLU:
+                mod = mod[0]
+            weight_post_process = mod.qconfig.weight()
+        weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear = cls(mod.in_features,
+                      mod.out_features,
+                      dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias)
+        qlinear.scale = float(act_scale)
+        qlinear.zero_point = int(act_zp)
+        return qlinear
+
+    @classmethod
+    def from_reference(cls, ref_qlinear, output_scale, output_zero_point):
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+
+        Args:
+            ref_qlinear (Module): a reference quantized linear module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qlinear = cls(
+            ref_qlinear.in_features,
+            ref_qlinear.out_features)
+        qweight = ref_qlinear.get_quantized_weight()
+        qlinear.set_weight_bias(qweight, ref_qlinear.bias)
+
+        qlinear.scale = float(output_scale)
+        qlinear.zero_point = int(output_zero_point)
+        return qlinear
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/rnn.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb14856a9ef92a56c5988f1534df4330952133f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/rnn.py
@@ -0,0 +1,51 @@
+import torch
+
+__all__ = [
+    "LSTM",
+]
+
+class LSTM(torch.ao.nn.quantizable.LSTM):
+    r"""A quantized long short-term memory (LSTM).
+
+    For the description and the argument types, please, refer to :class:`~torch.nn.LSTM`
+
+    Attributes:
+        layers : instances of the `_LSTMLayer`
+
+    .. note::
+        To access the weights and biases, you need to access them per layer.
+        See examples in :class:`~torch.ao.nn.quantizable.LSTM`
+
+    Examples::
+        >>> # xdoctest: +SKIP
+        >>> custom_module_config = {
+        ...     'float_to_observed_custom_module_class': {
+        ...         nn.LSTM: nn.quantizable.LSTM,
+        ...     },
+        ...     'observed_to_quantized_custom_module_class': {
+        ...         nn.quantizable.LSTM: nn.quantized.LSTM,
+        ...     }
+        ... }
+        >>> tq.prepare(model, prepare_custom_module_class=custom_module_config)
+        >>> tq.convert(model, convert_custom_module_class=custom_module_config)
+    """
+    _FLOAT_MODULE = torch.ao.nn.quantizable.LSTM  # type: ignore[assignment]
+
+    def _get_name(self):
+        return 'QuantizedLSTM'
+
+    @classmethod
+    def from_float(cls, *args, **kwargs):
+        # The whole flow is float -> observed -> quantized
+        # This class does observed -> quantized only
+        raise NotImplementedError("It looks like you are trying to convert a "
+                                  "non-observed LSTM module. Please, see "
+                                  "the examples on quantizable LSTMs.")
+
+    @classmethod
+    def from_observed(cls, other):
+        assert type(other) == cls._FLOAT_MODULE  # type: ignore[has-type]
+        converted = torch.ao.quantization.convert(other, inplace=False,
+                                                  remove_qconfig=True)
+        converted.__class__ = cls
+        return converted
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc844bc56bf6863094a458443f363dc2142659e8
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e37ee9d99226a35ede2db90ccbfea07be2e2bb6e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cca0620b821037fb4f2f24a2317cf88754ecff3
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f09d86a7745c6f2c6642b036b2d31a27583944dc
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35e588fcebd095a2bc7a2d2f1e4d586047533885
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/INSTALLER
new file mode 100644
index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/RECORD
new file mode 100644
index 0000000000000000000000000000000000000000..46495911f5842491de81ee47a0b00a96fa463edc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/RECORD
@@ -0,0 +1,7 @@
+__pycache__/typing_extensions.cpython-311.pyc,,
+typing_extensions-4.9.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+typing_extensions-4.9.0.dist-info/LICENSE,sha256=Oy-B_iHRgcSZxZolbI4ZaEVdZonSaaqFNzv7avQdo78,13936
+typing_extensions-4.9.0.dist-info/METADATA,sha256=ebx5L9BIL5U8F_82bApE9QHP5izlymctyyI4Ey1bTck,2966
+typing_extensions-4.9.0.dist-info/RECORD,,
+typing_extensions-4.9.0.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
+typing_extensions.py,sha256=R1TPIKi5cxfmdVZfNaDB7WjKgEY4deP5D2CS3XR3hcQ,110125