diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d3d1e7b3d74b9917ce1864a3ccead9f9872f588 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71824cc2e8d5612f0eb2937d4accf83c862a5a90 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8846ae14b6651773d26126e576c7693edf8dc92 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py new file mode 100644 index 0000000000000000000000000000000000000000..f7ff08c0f508ad7077eb6ed1990898840c952b3a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from typing import Any + + +class Timeout(TimeoutError): # noqa: N818 + """Raised when the lock could not be acquired in *timeout* seconds.""" + + def 
__init__(self, lock_file: str) -> None: + super().__init__() + self._lock_file = lock_file + + def __reduce__(self) -> str | tuple[Any, ...]: + return self.__class__, (self._lock_file,) # Properly pickle the exception + + def __str__(self) -> str: + return f"The file lock '{self._lock_file}' could not be acquired." + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.lock_file!r})" + + @property + def lock_file(self) -> str: + """:return: The path of the file lock.""" + return self._lock_file + + +__all__ = [ + "Timeout", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py new file mode 100644 index 0000000000000000000000000000000000000000..28c67f74cc82b8f55e47afd6a71972cc1fb95eb6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import os +import sys +from contextlib import suppress +from errno import EACCES, EEXIST +from pathlib import Path + +from ._api import BaseFileLock +from ._util import ensure_directory_exists, raise_on_not_writable_file + + +class SoftFileLock(BaseFileLock): + """Simply watches the existence of the lock file.""" + + def _acquire(self) -> None: + raise_on_not_writable_file(self.lock_file) + ensure_directory_exists(self.lock_file) + # first check for exists and read-only mode as the open will mask this case as EEXIST + flags = ( + os.O_WRONLY # open for writing only + | os.O_CREAT + | os.O_EXCL # together with above raise EEXIST if the file specified by filename exists + | os.O_TRUNC # truncate the file to zero byte + ) + try: + file_handler = os.open(self.lock_file, flags, self._context.mode) + except OSError as exception: # re-raise unless expected exception + if not ( + exception.errno == EEXIST # lock already exist + or (exception.errno == EACCES and sys.platform == "win32") # has no 
access to this lock + ): # pragma: win32 no cover + raise + else: + self._context.lock_file_fd = file_handler + + def _release(self) -> None: + assert self._context.lock_file_fd is not None # noqa: S101 + os.close(self._context.lock_file_fd) # the lock file is definitely not None + self._context.lock_file_fd = None + with suppress(OSError): # the file is already deleted and that's what we want + Path(self.lock_file).unlink() + + +__all__ = [ + "SoftFileLock", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py new file mode 100644 index 0000000000000000000000000000000000000000..cc9fc1550b3b64cc4ff85291e33b6cb0a745af97 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py @@ -0,0 +1,16 @@ +# file generated by setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] 
+else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '3.13.1' +__version_tuple__ = version_tuple = (3, 13, 1) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2270130fdf3c2a11c116488da9c93859844df79 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2dca1351ab88b1aec13d16c1b9620b53e30f773 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..505a4694d4a018f04bf02cba50e7439b32bc64bb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..565cde39ec49ebce7deb8a65ae1005bd9d862fb3 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3711e6bdfacb381a0cd04b6aaa854f565acf272 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36df0352ed9ee1ea5c902438ab75b2a3b640106b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py new file mode 100644 index 0000000000000000000000000000000000000000..851ab81ee581e74cac41c64c83ef0af75826d6b0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py @@ -0,0 +1,587 @@ +from hashlib import md5 +from itertools import product + +import pytest + +from fsspec.implementations.local import make_path_posix +from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS + + +class AbstractGetTests: + def test_get_file_to_existing_directory( + self, + fs, + fs_join, 
+ fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1a + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + assert local_fs.isdir(target) + + target_file2 = local_join(target, "file2") + target_subfile1 = local_join(target, "subfile1") + + # Copy from source directory + fs.get(fs_join(source, "file2"), target) + assert local_fs.isfile(target_file2) + + # Copy from sub directory + fs.get(fs_join(source, "subdir", "subfile1"), target) + assert local_fs.isfile(target_subfile1) + + # Remove copied files + local_fs.rm([target_file2, target_subfile1]) + assert not local_fs.exists(target_file2) + assert not local_fs.exists(target_subfile1) + + # Repeat with trailing slash on target + fs.get(fs_join(source, "file2"), target + "/") + assert local_fs.isdir(target) + assert local_fs.isfile(target_file2) + + fs.get(fs_join(source, "subdir", "subfile1"), target + "/") + assert local_fs.isfile(target_subfile1) + + def test_get_file_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1b + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + fs.get( + fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/") + ) # Note trailing slash + + assert local_fs.isdir(target) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + + def test_get_file_to_file_in_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1c + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile")) + assert local_fs.isfile(local_join(target, "newfile")) + + def test_get_file_to_file_in_new_directory( + self, + fs, + fs_join, 
+ fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1d + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + fs.get( + fs_join(source, "subdir", "subfile1"), + local_join(target, "newdir", "newfile"), + ) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "newfile")) + + def test_get_directory_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1e + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + assert local_fs.isdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = target + "/" if target_slash else target + + # Without recursive does nothing + fs.get(s, t) + assert local_fs.ls(target) == [] + + # With recursive + fs.get(s, t, recursive=True) + if source_slash: + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert local_fs.isdir(local_join(target, "nesteddir")) + assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + local_join(target, "nesteddir"), + ], + recursive=True, + ) + else: + assert local_fs.isdir(local_join(target, "subdir")) + assert local_fs.isfile(local_join(target, "subdir", "subfile1")) + assert local_fs.isfile(local_join(target, "subdir", "subfile2")) + assert local_fs.isdir(local_join(target, "subdir", "nesteddir")) + assert local_fs.isfile( + local_join(target, "subdir", "nesteddir", "nestedfile") + ) + + local_fs.rm(local_join(target, "subdir"), recursive=True) + assert local_fs.ls(target) == [] + + # Limit recursive by maxdepth + fs.get(s, 
t, recursive=True, maxdepth=1) + if source_slash: + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert not local_fs.exists(local_join(target, "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + ], + recursive=True, + ) + else: + assert local_fs.isdir(local_join(target, "subdir")) + assert local_fs.isfile(local_join(target, "subdir", "subfile1")) + assert local_fs.isfile(local_join(target, "subdir", "subfile2")) + assert not local_fs.exists(local_join(target, "subdir", "nesteddir")) + + local_fs.rm(local_join(target, "subdir"), recursive=True) + assert local_fs.ls(target) == [] + + def test_get_directory_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1f + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = local_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive does nothing + fs.get(s, t) + assert local_fs.ls(target) == [] + + # With recursive + fs.get(s, t, recursive=True) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert local_fs.isdir(local_join(target, "newdir", "nesteddir")) + assert local_fs.isfile( + local_join(target, "newdir", "nesteddir", "nestedfile") + ) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert local_fs.ls(target) == [] + + # Limit recursive by maxdepth + fs.get(s, t, recursive=True, maxdepth=1) + assert local_fs.isdir(local_join(target, "newdir")) + 
assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert not local_fs.exists(local_join(target, "newdir", "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert not local_fs.exists(local_join(target, "newdir")) + + def test_get_glob_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1g + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + # Without recursive + fs.get(fs_join(source, "subdir", "*"), t) + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert not local_fs.isdir(local_join(target, "nesteddir")) + assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.get(fs_join(source, "subdir", glob), t, recursive=recursive) + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert local_fs.isdir(local_join(target, "nesteddir")) + assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + local_join(target, "nesteddir"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + # Limit recursive by maxdepth + fs.get( + fs_join(source, "subdir", glob), t, recursive=recursive, 
maxdepth=1 + ) + assert local_fs.isfile(local_join(target, "subfile1")) + assert local_fs.isfile(local_join(target, "subfile2")) + assert not local_fs.exists(local_join(target, "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + + local_fs.rm( + [ + local_join(target, "subfile1"), + local_join(target, "subfile2"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + def test_get_glob_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1h + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + for target_slash in [False, True]: + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive + fs.get(fs_join(source, "subdir", "*"), t) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert not local_fs.exists(local_join(target, "newdir", "nesteddir")) + assert not local_fs.exists( + local_join(target, "newdir", "nesteddir", "nestedfile") + ) + assert not local_fs.exists(local_join(target, "subdir")) + assert not local_fs.exists(local_join(target, "newdir", "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert local_fs.ls(target) == [] + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.get(fs_join(source, "subdir", glob), t, recursive=recursive) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert local_fs.isdir(local_join(target, "newdir", "nesteddir")) + assert local_fs.isfile( + local_join(target, "newdir", "nesteddir", "nestedfile") + ) + assert not local_fs.exists(local_join(target, "subdir")) + assert not local_fs.exists(local_join(target, 
"newdir", "subdir")) + + local_fs.rm(local_join(target, "newdir"), recursive=True) + assert not local_fs.exists(local_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.get( + fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1 + ) + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + assert local_fs.isfile(local_join(target, "newdir", "subfile2")) + assert not local_fs.exists(local_join(target, "newdir", "nesteddir")) + assert not local_fs.exists(local_join(target, "subdir")) + assert not local_fs.exists(local_join(target, "newdir", "subdir")) + + local_fs.rm(local_fs.ls(target, detail=False), recursive=True) + assert not local_fs.exists(local_join(target, "newdir")) + + @pytest.mark.parametrize( + GLOB_EDGE_CASES_TESTS["argnames"], + GLOB_EDGE_CASES_TESTS["argvalues"], + ) + def test_get_glob_edge_cases( + self, + path, + recursive, + maxdepth, + expected, + fs, + fs_join, + fs_glob_edge_cases_files, + local_fs, + local_join, + local_target, + ): + # Copy scenario 1g + source = fs_glob_edge_cases_files + + target = local_target + + for new_dir, target_slash in product([True, False], [True, False]): + local_fs.mkdir(target) + + t = local_join(target, "newdir") if new_dir else target + t = t + "/" if target_slash else t + + fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth) + + output = local_fs.find(target) + if new_dir: + prefixed_expected = [ + make_path_posix(local_join(target, "newdir", p)) for p in expected + ] + else: + prefixed_expected = [ + make_path_posix(local_join(target, p)) for p in expected + ] + assert sorted(output) == sorted(prefixed_expected) + + try: + local_fs.rm(target, recursive=True) + except FileNotFoundError: + pass + + def test_get_list_of_files_to_existing_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 2a + source = 
fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + source_files = [ + fs_join(source, "file1"), + fs_join(source, "file2"), + fs_join(source, "subdir", "subfile1"), + ] + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + fs.get(source_files, t) + assert local_fs.isfile(local_join(target, "file1")) + assert local_fs.isfile(local_join(target, "file2")) + assert local_fs.isfile(local_join(target, "subfile1")) + + local_fs.rm( + [ + local_join(target, "file1"), + local_join(target, "file2"), + local_join(target, "subfile1"), + ], + recursive=True, + ) + assert local_fs.ls(target) == [] + + def test_get_list_of_files_to_new_directory( + self, + fs, + fs_join, + fs_bulk_operations_scenario_0, + local_fs, + local_join, + local_target, + ): + # Copy scenario 2b + source = fs_bulk_operations_scenario_0 + + target = local_target + local_fs.mkdir(target) + + source_files = [ + fs_join(source, "file1"), + fs_join(source, "file2"), + fs_join(source, "subdir", "subfile1"), + ] + + fs.get(source_files, local_join(target, "newdir") + "/") # Note trailing slash + assert local_fs.isdir(local_join(target, "newdir")) + assert local_fs.isfile(local_join(target, "newdir", "file1")) + assert local_fs.isfile(local_join(target, "newdir", "file2")) + assert local_fs.isfile(local_join(target, "newdir", "subfile1")) + + def test_get_directory_recursive( + self, fs, fs_join, fs_path, local_fs, local_join, local_target + ): + # https://github.com/fsspec/filesystem_spec/issues/1062 + # Recursive cp/get/put of source directory into non-existent target directory. 
+ src = fs_join(fs_path, "src") + src_file = fs_join(src, "file") + fs.mkdir(src) + fs.touch(src_file) + + target = local_target + + # get without slash + assert not local_fs.exists(target) + for loop in range(2): + fs.get(src, target, recursive=True) + assert local_fs.isdir(target) + + if loop == 0: + assert local_fs.isfile(local_join(target, "file")) + assert not local_fs.exists(local_join(target, "src")) + else: + assert local_fs.isfile(local_join(target, "file")) + assert local_fs.isdir(local_join(target, "src")) + assert local_fs.isfile(local_join(target, "src", "file")) + + local_fs.rm(target, recursive=True) + + # get with slash + assert not local_fs.exists(target) + for loop in range(2): + fs.get(src + "/", target, recursive=True) + assert local_fs.isdir(target) + assert local_fs.isfile(local_join(target, "file")) + assert not local_fs.exists(local_join(target, "src")) + + def test_get_directory_without_files_with_same_name_prefix( + self, + fs, + fs_join, + local_fs, + local_join, + local_target, + fs_dir_and_file_with_same_name_prefix, + ): + # Create the test dirs + source = fs_dir_and_file_with_same_name_prefix + target = local_target + + # Test without glob + fs.get(fs_join(source, "subdir"), target, recursive=True) + + assert local_fs.isfile(local_join(target, "subfile.txt")) + assert not local_fs.isfile(local_join(target, "subdir.txt")) + + local_fs.rm([local_join(target, "subfile.txt")]) + assert local_fs.ls(target) == [] + + # Test with glob + fs.get(fs_join(source, "subdir*"), target, recursive=True) + + assert local_fs.isdir(local_join(target, "subdir")) + assert local_fs.isfile(local_join(target, "subdir", "subfile.txt")) + assert local_fs.isfile(local_join(target, "subdir.txt")) + + def test_get_with_source_and_destination_as_list( + self, + fs, + fs_join, + local_fs, + local_join, + local_target, + fs_10_files_with_hashed_names, + ): + # Create the test dir + source = fs_10_files_with_hashed_names + target = local_target + + # Create list of 
files for source and destination + source_files = [] + destination_files = [] + for i in range(10): + hashed_i = md5(str(i).encode("utf-8")).hexdigest() + source_files.append(fs_join(source, f"{hashed_i}.txt")) + destination_files.append( + make_path_posix(local_join(target, f"{hashed_i}.txt")) + ) + + # Copy and assert order was kept + fs.get(rpath=source_files, lpath=destination_files) + + for i in range(10): + file_content = local_fs.cat(destination_files[i]).decode("utf-8") + assert file_content == str(i) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py new file mode 100644 index 0000000000000000000000000000000000000000..9fc349977f0384d9fc86126498be5c6ad99a21d3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py @@ -0,0 +1,591 @@ +from hashlib import md5 +from itertools import product + +import pytest + +from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS + + +class AbstractPutTests: + def test_put_file_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 1a + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + fs.touch(fs_join(target, "dummy")) + assert fs.isdir(target) + + target_file2 = fs_join(target, "file2") + target_subfile1 = fs_join(target, "subfile1") + + # Copy from source directory + fs.put(local_join(source, "file2"), target) + assert fs.isfile(target_file2) + + # Copy from sub directory + fs.put(local_join(source, "subdir", "subfile1"), target) + assert fs.isfile(target_subfile1) + + # Remove copied files + fs.rm([target_file2, target_subfile1]) + assert not fs.exists(target_file2) + assert not 
fs.exists(target_subfile1) + + # Repeat with trailing slash on target + fs.put(local_join(source, "file2"), target + "/") + assert fs.isdir(target) + assert fs.isfile(target_file2) + + fs.put(local_join(source, "subdir", "subfile1"), target + "/") + assert fs.isfile(target_subfile1) + + def test_put_file_to_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 1b + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + fs.put( + local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/") + ) # Note trailing slash + assert fs.isdir(target) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + + def test_put_file_to_file_in_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + supports_empty_directories, + local_bulk_operations_scenario_0, + ): + # Copy scenario 1c + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + fs.touch(fs_join(target, "dummy")) + assert fs.isdir(target) + + fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile")) + assert fs.isfile(fs_join(target, "newfile")) + + def test_put_file_to_file_in_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 1d + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + fs.put( + local_join(source, "subdir", "subfile1"), + fs_join(target, "newdir", "newfile"), + ) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "newfile")) + + def test_put_directory_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 1e + source = local_bulk_operations_scenario_0 + + target = 
fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = target + "/" if target_slash else target + + # Without recursive does nothing + fs.put(s, t) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # With recursive + fs.put(s, t, recursive=True) + if source_slash: + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert fs.isdir(fs_join(target, "nesteddir")) + assert fs.isfile(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + fs_join(target, "nesteddir"), + ], + recursive=True, + ) + else: + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile1")) + assert fs.isfile(fs_join(target, "subdir", "subfile2")) + assert fs.isdir(fs_join(target, "subdir", "nesteddir")) + assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile")) + + fs.rm(fs_join(target, "subdir"), recursive=True) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # Limit recursive by maxdepth + fs.put(s, t, recursive=True, maxdepth=1) + if source_slash: + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.exists(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + else: + assert fs.isdir(fs_join(target, "subdir")) + assert fs.isfile(fs_join(target, "subdir", "subfile1")) + assert fs.isfile(fs_join(target, "subdir", 
"subfile2")) + assert not fs.exists(fs_join(target, "subdir", "nesteddir")) + + fs.rm(fs_join(target, "subdir"), recursive=True) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_put_directory_to_new_directory( + self, + fs, + fs_join, + fs_target, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 1f + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + for source_slash, target_slash in zip([False, True], [False, True]): + s = fs_join(source, "subdir") + if source_slash: + s += "/" + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive does nothing + fs.put(s, t) + if supports_empty_directories: + assert fs.ls(target) == [] + else: + with pytest.raises(FileNotFoundError): + fs.ls(target) + + # With recursive + fs.put(s, t, recursive=True) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert fs.isdir(fs_join(target, "newdir", "nesteddir")) + assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.put(s, t, recursive=True, maxdepth=1) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + def test_put_glob_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + supports_empty_directories, + local_bulk_operations_scenario_0, + ): + # Copy scenario 1g 
+ source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + # Without recursive + fs.put(local_join(source, "subdir", "*"), t) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.isdir(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.put(local_join(source, "subdir", glob), t, recursive=recursive) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert fs.isdir(fs_join(target, "nesteddir")) + assert fs.isfile(fs_join(target, "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + fs_join(target, "nesteddir"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + # Limit recursive by maxdepth + fs.put( + local_join(source, "subdir", glob), + t, + recursive=recursive, + maxdepth=1, + ) + assert fs.isfile(fs_join(target, "subfile1")) + assert fs.isfile(fs_join(target, "subfile2")) + assert not fs.exists(fs_join(target, "nesteddir")) + assert not fs.exists(fs_join(target, "subdir")) + + fs.rm( + [ + fs_join(target, "subfile1"), + fs_join(target, "subfile2"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if 
supports_empty_directories else [dummy] + ) + + def test_put_glob_to_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 1h + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + for target_slash in [False, True]: + t = fs_join(target, "newdir") + if target_slash: + t += "/" + + # Without recursive + fs.put(local_join(source, "subdir", "*"), t) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # With recursive + for glob, recursive in zip(["*", "**"], [True, False]): + fs.put(local_join(source, "subdir", glob), t, recursive=recursive) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert fs.isdir(fs_join(target, "newdir", "nesteddir")) + assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile")) + assert not fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + # Limit recursive by maxdepth + fs.put( + local_join(source, "subdir", glob), + t, + recursive=recursive, + maxdepth=1, + ) + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + assert fs.isfile(fs_join(target, "newdir", "subfile2")) + assert not fs.exists(fs_join(target, "newdir", "nesteddir")) + assert not 
fs.exists(fs_join(target, "subdir")) + assert not fs.exists(fs_join(target, "newdir", "subdir")) + + fs.rm(fs_join(target, "newdir"), recursive=True) + assert not fs.exists(fs_join(target, "newdir")) + + @pytest.mark.parametrize( + GLOB_EDGE_CASES_TESTS["argnames"], + GLOB_EDGE_CASES_TESTS["argvalues"], + ) + def test_put_glob_edge_cases( + self, + path, + recursive, + maxdepth, + expected, + fs, + fs_join, + fs_target, + local_glob_edge_cases_files, + local_join, + fs_sanitize_path, + ): + # Copy scenario 1g + source = local_glob_edge_cases_files + + target = fs_target + + for new_dir, target_slash in product([True, False], [True, False]): + fs.mkdir(target) + + t = fs_join(target, "newdir") if new_dir else target + t = t + "/" if target_slash else t + + fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth) + + output = fs.find(target) + if new_dir: + prefixed_expected = [ + fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected + ] + else: + prefixed_expected = [ + fs_sanitize_path(fs_join(target, p)) for p in expected + ] + assert sorted(output) == sorted(prefixed_expected) + + try: + fs.rm(target, recursive=True) + except FileNotFoundError: + pass + + def test_put_list_of_files_to_existing_directory( + self, + fs, + fs_join, + fs_target, + local_join, + local_bulk_operations_scenario_0, + supports_empty_directories, + ): + # Copy scenario 2a + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + if not supports_empty_directories: + # Force target directory to exist by adding a dummy file + dummy = fs_join(target, "dummy") + fs.touch(dummy) + assert fs.isdir(target) + + source_files = [ + local_join(source, "file1"), + local_join(source, "file2"), + local_join(source, "subdir", "subfile1"), + ] + + for target_slash in [False, True]: + t = target + "/" if target_slash else target + + fs.put(source_files, t) + assert fs.isfile(fs_join(target, "file1")) + assert fs.isfile(fs_join(target, "file2")) + 
assert fs.isfile(fs_join(target, "subfile1")) + + fs.rm( + [ + fs_join(target, "file1"), + fs_join(target, "file2"), + fs_join(target, "subfile1"), + ], + recursive=True, + ) + assert fs.ls(target, detail=False) == ( + [] if supports_empty_directories else [dummy] + ) + + def test_put_list_of_files_to_new_directory( + self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0 + ): + # Copy scenario 2b + source = local_bulk_operations_scenario_0 + + target = fs_target + fs.mkdir(target) + + source_files = [ + local_join(source, "file1"), + local_join(source, "file2"), + local_join(source, "subdir", "subfile1"), + ] + + fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash + assert fs.isdir(fs_join(target, "newdir")) + assert fs.isfile(fs_join(target, "newdir", "file1")) + assert fs.isfile(fs_join(target, "newdir", "file2")) + assert fs.isfile(fs_join(target, "newdir", "subfile1")) + + def test_put_directory_recursive( + self, fs, fs_join, fs_target, local_fs, local_join, local_path + ): + # https://github.com/fsspec/filesystem_spec/issues/1062 + # Recursive cp/get/put of source directory into non-existent target directory. 
+ src = local_join(local_path, "src") + src_file = local_join(src, "file") + local_fs.mkdir(src) + local_fs.touch(src_file) + + target = fs_target + + # put without slash + assert not fs.exists(target) + for loop in range(2): + fs.put(src, target, recursive=True) + assert fs.isdir(target) + + if loop == 0: + assert fs.isfile(fs_join(target, "file")) + assert not fs.exists(fs_join(target, "src")) + else: + assert fs.isfile(fs_join(target, "file")) + assert fs.isdir(fs_join(target, "src")) + assert fs.isfile(fs_join(target, "src", "file")) + + fs.rm(target, recursive=True) + + # put with slash + assert not fs.exists(target) + for loop in range(2): + fs.put(src + "/", target, recursive=True) + assert fs.isdir(target) + assert fs.isfile(fs_join(target, "file")) + assert not fs.exists(fs_join(target, "src")) + + def test_put_directory_without_files_with_same_name_prefix( + self, + fs, + fs_join, + fs_target, + local_join, + local_dir_and_file_with_same_name_prefix, + supports_empty_directories, + ): + # Create the test dirs + source = local_dir_and_file_with_same_name_prefix + target = fs_target + + # Test without glob + fs.put(local_join(source, "subdir"), fs_target, recursive=True) + + assert fs.isfile(fs_join(fs_target, "subfile.txt")) + assert not fs.isfile(fs_join(fs_target, "subdir.txt")) + + fs.rm([fs_join(target, "subfile.txt")]) + if supports_empty_directories: + assert fs.ls(target) == [] + else: + assert not fs.exists(target) + + # Test with glob + fs.put(local_join(source, "subdir*"), fs_target, recursive=True) + + assert fs.isdir(fs_join(fs_target, "subdir")) + assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt")) + assert fs.isfile(fs_join(fs_target, "subdir.txt")) + + def test_copy_with_source_and_destination_as_list( + self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names + ): + # Create the test dir + source = local_10_files_with_hashed_names + target = fs_target + + # Create list of files for source and destination + 
source_files = [] + destination_files = [] + for i in range(10): + hashed_i = md5(str(i).encode("utf-8")).hexdigest() + source_files.append(local_join(source, f"{hashed_i}.txt")) + destination_files.append(fs_join(target, f"{hashed_i}.txt")) + + # Copy and assert order was kept + fs.put(lpath=source_files, rpath=destination_files) + + for i in range(10): + file_content = fs.cat(destination_files[i]).decode("utf-8") + assert file_content == str(i) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aff35e592d80e510b08ff18a4c3571202b653d6e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from torch._functorch.deprecated import ( + combine_state_for_ensemble, + functionalize, + grad, + grad_and_value, + hessian, + jacfwd, + jacrev, + jvp, + make_functional, + make_functional_with_buffers, + vjp, + vmap, +) + +# utilities. Maybe these should go in their own namespace in the future? +from torch._functorch.make_functional import ( + FunctionalModule, + FunctionalModuleWithBuffers, +) + +# Top-level APIs. 
Please think carefully before adding something to the +# top-level namespace: +# - private helper functions should go into torch._functorch +# - very experimental things should go into functorch.experimental +# - compilation related things should go into functorch.compile + +# Was never documented +from torch._functorch.python_key import make_fx + +__version__ = torch.__version__ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..236bad2830b7beabcb777696e7e30459429613a0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44827bc40e40795ca52391a98242140f6d4aa366 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..e24fc6142820013002f6cbc1d6f85e7e132aade8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py @@ -0,0 +1,8 @@ +from torch import cond # noqa: F401 +from torch._higher_order_ops.cond import 
UnsupportedAliasMutationException # noqa: F401 + +from torch._higher_order_ops.map import ( # noqa: F401 + _stack_pytree, + _unstack_pytree, + map, +) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..bea7df4573aff2fa5b0d0029ce9d40a7ebe2de46 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h @@ -0,0 +1,78 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef CUDAPROFILERTYPEDEFS_H +#define CUDAPROFILERTYPEDEFS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cudaProfiler.h + */ +#define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000 +#define PFN_cuProfilerStart PFN_cuProfilerStart_v4000 +#define PFN_cuProfilerStop PFN_cuProfilerStop_v4000 + + +/** + * Type definitions for functions defined in cudaProfiler.h + */ +typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode); +typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void); +typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h new file mode 100644 index 0000000000000000000000000000000000000000..97de57ae494d62ae176fc02ad3c0c3f4d43e1526 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h @@ -0,0 +1,282 @@ +/* + * Copyright 2010-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef CUDAVDPAU_H +#define CUDAVDPAU_H + +#ifdef CUDA_FORCE_API_VERSION +#error "CUDA_FORCE_API_VERSION is no longer supported." +#endif + +#define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \defgroup CUDA_VDPAU VDPAU Interoperability + * \ingroup CUDA_DRIVER + * + * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the VDPAU interoperability functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets the CUDA device associated with a VDPAU device + * + * Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if + * applicable. + * + * \param pDevice - Device associated with vdpDevice + * \param vdpDevice - A VdpDevice handle + * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface, + * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource, + * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources, + * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray, + * ::cudaVDPAUGetDevice + */ +CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress); + +/** + * \brief Create a CUDA context for interoperability with VDPAU + * + * Creates a new CUDA context, initializes VDPAU interoperability, and + * associates the CUDA context with the calling thread. It must be called + * before performing any other VDPAU interoperability operations. It may fail + * if the needed VDPAU driver facilities are not available. For usage of the + * \p flags parameter, see ::cuCtxCreate(). 
+ * + * \param pCtx - Returned CUDA context + * \param flags - Options for CUDA context creation + * \param device - Device on which to create the context + * \param vdpDevice - The VdpDevice to interop with + * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface, + * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource, + * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources, + * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray, + * ::cuVDPAUGetDevice + */ +CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress); + +/** + * \brief Registers a VDPAU VdpVideoSurface object + * + * Registers the VdpVideoSurface specified by \p vdpSurface for access by + * CUDA. A handle to the registered object is returned as \p pCudaResource. + * The surface's intended usage is specified using \p flags, as follows: + * + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA + * will not write to this resource. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * + * The VdpVideoSurface is presented as an array of subresources that may be + * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray. 
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface + * format. The mapping is shown in the table below. \p mipLevel must be 0. + * + * \htmlonly + * + * + * + * + * + * + * + * + * + * + *
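+ * <table>
+ * <tr><th>VdpChromaType</th><th>arrayIndex</th><th>Size</th><th>Format</th><th>Content</th></tr>
+ * <tr><td>VDP_CHROMA_TYPE_420</td><td>0</td><td>w x h/2</td><td>R8</td><td>Top-field luma</td></tr>
+ * <tr><td></td><td>1</td><td>w x h/2</td><td>R8</td><td>Bottom-field luma</td></tr>
+ * <tr><td></td><td>2</td><td>w/2 x h/4</td><td>R8G8</td><td>Top-field chroma</td></tr>
+ * <tr><td></td><td>3</td><td>w/2 x h/4</td><td>R8G8</td><td>Bottom-field chroma</td></tr>
+ * <tr><td>VDP_CHROMA_TYPE_422</td><td>0</td><td>w x h/2</td><td>R8</td><td>Top-field luma</td></tr>
+ * <tr><td></td><td>1</td><td>w x h/2</td><td>R8</td><td>Bottom-field luma</td></tr>
+ * <tr><td></td><td>2</td><td>w/2 x h/2</td><td>R8G8</td><td>Top-field chroma</td></tr>
+ * <tr><td></td><td>3</td><td>w/2 x h/2</td><td>R8G8</td><td>Bottom-field chroma</td></tr>
+ * </table>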
+ * \endhtmlonly + * + * \latexonly + * \begin{tabular}{|l|l|l|l|l|} + * \hline + * VdpChromaType & arrayIndex & Size & Format & Content \\ + * \hline + * VDP\_CHROMA\_TYPE\_420 & 0 & w x h/2 & R8 & Top-field luma \\ + * & 1 & w x h/2 & R8 & Bottom-field luma \\ + * & 2 & w/2 x h/4 & R8G8 & Top-field chroma \\ + * & 3 & w/2 x h/4 & R8G8 & Bottom-field chroma \\ + * \hline + * VDP\_CHROMA\_TYPE\_422 & 0 & w x h/2 & R8 & Top-field luma \\ + * & 1 & w x h/2 & R8 & Bottom-field luma \\ + * & 2 & w/2 x h/2 & R8G8 & Top-field chroma \\ + * & 3 & w/2 x h/2 & R8G8 & Bottom-field chroma \\ + * \hline + * \end{tabular} + * \endlatexonly + * + * \param pCudaResource - Pointer to the returned object handle + * \param vdpSurface - The VdpVideoSurface to be registered + * \param flags - Map flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, + * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource, + * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources, + * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray, + * ::cuVDPAUGetDevice, + * ::cudaGraphicsVDPAURegisterVideoSurface + */ +CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags); + +/** + * \brief Registers a VDPAU VdpOutputSurface object + * + * Registers the VdpOutputSurface specified by \p vdpSurface for access by + * CUDA. A handle to the registered object is returned as \p pCudaResource. + * The surface's intended usage is specified using \p flags, as follows: + * + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. 
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA + * will not write to this resource. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * + * The VdpOutputSurface is presented as an array of subresources that may be + * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray. + * The exact number of valid \p arrayIndex values depends on the VDPAU surface + * format. The mapping is shown in the table below. \p mipLevel must be 0. + * + * \htmlonly + * + * + * + * + *
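+ * <table>
+ * <tr><th>VdpRGBAFormat</th><th>arrayIndex</th><th>Size</th><th>Format</th><th>Content</th></tr>
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8</td><td>0</td><td>w x h</td><td>ARGB8</td><td>Entire surface</td></tr>
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0</td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
+ * </table>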
+ * \endhtmlonly + * + * \latexonly + * \begin{tabular}{|l|l|l|l|l|} + * \hline + * VdpRGBAFormat & arrayIndex & Size & Format & Content \\ + * \hline + * VDP\_RGBA\_FORMAT\_B8G8R8A8 & 0 & w x h & ARGB8 & Entire surface \\ + * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0 & w x h & A2BGR10 & Entire surface \\ + * \hline + * \end{tabular} + * \endlatexonly + * + * \param pCudaResource - Pointer to the returned object handle + * \param vdpSurface - The VdpOutputSurface to be registered + * \param flags - Map flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, + * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource, + * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources, + * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray, + * ::cuVDPAUGetDevice, + * ::cudaGraphicsVDPAURegisterOutputSurface + */ +CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags); + +/** @} */ /* END CUDA_VDPAU */ + + +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuVDPAUCtxCreate + + CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress); +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#ifdef __cplusplus +}; +#endif + +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..12b74c94deef2bdea5bd14c9247814427308870b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h @@ -0,0 +1,100 @@ +/* + * 
Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__) +#define __SM_20_ATOMIC_FUNCTIONS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + + +#ifdef __CUDA_ARCH__ +extern "C" +{ +extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val); +} +#endif /* __CUDA_ARCH__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __SM_20_ATOMIC_FUNCTIONS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include 
"sm_20_atomic_functions.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ac4aa9bfc6b8d5d4d240e05a2fd557889f30c47f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp @@ -0,0 +1,85 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__) +#define __SM_20_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) +{ + return __fAtomicAdd(address, val); +} + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_20_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..da1e823a24171ed1ca9414955c6c68159a4411f5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h @@ -0,0 +1,116 @@ +/* + + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + + * + + * NOTICE TO LICENSEE: + + * + + * This source code and/or documentation ("Licensed Deliverables") are + + * subject to NVIDIA intellectual property rights under U.S. and + + * international Copyright laws. 
+ + * + + * These Licensed Deliverables contained herein is PROPRIETARY and + + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + + * conditions of a form of NVIDIA software license agreement by and + + * between NVIDIA and Licensee ("License Agreement") or electronically + + * accepted by Licensee. Notwithstanding any terms or conditions to + + * the contrary in the License Agreement, reproduction or disclosure + + * of the Licensed Deliverables to any third party without the express + + * written consent of NVIDIA is prohibited. + + * + + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + + * OF THESE LICENSED DELIVERABLES. + + * + + * U.S. Government End Users. These Licensed Deliverables are a + + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + + * 1995), consisting of "commercial computer software" and "commercial + + * computer software documentation" as such terms are used in 48 + + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + + * 48 C.F.R. 
227.7202-1 through 227.7202-4 (JUNE 1995), all + + * U.S. Government End Users acquire the Licensed Deliverables with + + * only those rights set forth herein. + + * + + * Any use of the Licensed Deliverables in individual and commercial + + * software must include, in the user documentation and internal + + * comments to the code, the above Disclaimer and U.S. Government End + + * Users Notice. + + */ + + + +#if !defined(__SM_35_INTRINSICS_H__) + +#define __SM_35_INTRINSICS_H__ + + + +/********************************************************************************** + +* All sm_35 intrinsics are supported by sm_32 so simply include its header file * + +**********************************************************************************/ + +#include "sm_32_intrinsics.h" + + + +#endif /* !__SM_35_INTRINSICS_H__ */ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ab69cf38045a7c44dae67e7149d49ac4c6148747 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp @@ -0,0 +1,316 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__VECTOR_FUNCTIONS_HPP__) +#define __VECTOR_FUNCTIONS_HPP__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#if defined(__CUDACC_RTC__) +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#endif /* __CUDACC_RTC__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x) +{ + char1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x) +{ + uchar1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y) +{ + char2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y) +{ + uchar2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z) +{ + char3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) +{ + uchar3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w) +{ + char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) +{ + uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x) +{ + short1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x) +{ + ushort1 t; t.x = x; return t; +} + 
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y) +{ + short2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y) +{ + ushort2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z) +{ + short3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) +{ + ushort3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w) +{ + short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) +{ + ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x) +{ + int1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x) +{ + uint1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y) +{ + int2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y) +{ + uint2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z) +{ + int3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) +{ + uint3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w) +{ + int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) +{ + uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x) +{ + long1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned 
long int x) +{ + ulong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y) +{ + long2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y) +{ + ulong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z) +{ + long3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) +{ + ulong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w) +{ + long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) +{ + ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x) +{ + float1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y) +{ + float2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z) +{ + float3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w) +{ + float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x) +{ + longlong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x) +{ + ulonglong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y) +{ + longlong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y) +{ + ulonglong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ 
longlong3 make_longlong3(long long int x, long long int y, long long int z) +{ + longlong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z) +{ + ulonglong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w) +{ + longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w) +{ + ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x) +{ + double1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y) +{ + double2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z) +{ + double3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w) +{ + double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +#undef __VECTOR_FUNCTIONS_DECL__ + +#endif /* !__VECTOR_FUNCTIONS_HPP__ */ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50238b16212e1b5a84f2de7739346a07399bc09f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4af83dc8e1c77ff0c084e00a6d7cbb158c4a16f6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h new file mode 100644 index 0000000000000000000000000000000000000000..2f1d6c07ffbce6289c4dba773ee73a52bcc99059 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h @@ -0,0 +1,540 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn_adv_train : cuDNN's advanced and experimental features. + +*/ + +#if !defined(CUDNN_ADV_TRAIN_H_) +#define CUDNN_ADV_TRAIN_H_ + +#include +#include + +#include "cudnn_version.h" +#include "cudnn_ops_infer.h" +#include "cudnn_ops_train.h" +#include "cudnn_adv_infer.h" + +/* These version numbers are autogenerated, do not edit manually. 
*/ +#define CUDNN_ADV_TRAIN_MAJOR 8 +#define CUDNN_ADV_TRAIN_MINOR 7 +#define CUDNN_ADV_TRAIN_PATCH 0 + +#if (CUDNN_ADV_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_TRAIN_MINOR != CUDNN_MINOR) || \ + (CUDNN_ADV_TRAIN_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN ADV TRAIN!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef enum { + CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */ + CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */ +} cudnnWgradMode_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardTraining(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const cudnnTensorDescriptor_t *dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnTensorDescriptor_t *dxDesc, + void *dx, + const cudnnTensorDescriptor_t dhxDesc, + void *dhx, + const cudnnTensorDescriptor_t dcxDesc, + void *dcx, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t 
reserveSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t yDesc, + const void *y, + const void *dy, + cudnnRNNDataDescriptor_t xDesc, + void *dx, + cudnnTensorDescriptor_t hDesc, + const void *hx, + const void *dhy, + void *dhx, + cudnnTensorDescriptor_t cDesc, + const void *cx, + const void *dcy, + void *dcx, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const void *workSpace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnWgradMode_t addGrad, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnTensorDescriptor_t hDesc, + const void *hx, + cudnnRNNDataDescriptor_t yDesc, + const void *y, + size_t weightSpaceSize, + void *dweightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +/* RNN EX API */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardTrainingEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnRNNDataDescriptor_t yDesc, + void *y, + const 
cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardDataEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t yDesc, + const void *y, + const cudnnRNNDataDescriptor_t dyDesc, + const void *dy, + const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ + const void *dcAttn, /* reserved, should pass NULL */ + const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnRNNDataDescriptor_t dxDesc, + void *dx, + const cudnnTensorDescriptor_t dhxDesc, + void *dhx, + const cudnnTensorDescriptor_t dcxDesc, + void *dcx, + const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ + void *dkeys, /* reserved, should pass NULL */ + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeightsEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, 
+ const cudnnRNNDataDescriptor_t yDesc, + const void *y, + void *workSpace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* RNN FIND API */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const cudnnTensorDescriptor_t *dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnTensorDescriptor_t *dxDesc, + void *dx, + const 
cudnnTensorDescriptor_t dhxDesc, + void *dhx, + const cudnnTensorDescriptor_t dcxDesc, + void *dcx, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + const void *workspace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsDQDO[], + const int devSeqLengthsDKDV[], + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + const cudnnSeqDataDescriptor_t dqDesc, + void *dqueries, + const void *queries, + const cudnnSeqDataDescriptor_t dkDesc, + void *dkeys, + const void *keys, + const cudnnSeqDataDescriptor_t dvDesc, + void *dvalues, + const void *values, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, + const 
cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + size_t weightSizeInBytes, + const void *weights, + void *dweights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* +* CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions +*/ +/* Input normalization mode for loss function */ +typedef enum { + CUDNN_LOSS_NORMALIZATION_NONE = 0, + CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1, +} cudnnLossNormalizationMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode, + int maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc); + +/* 
return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int hostLabels[], /* labels, in CPU memory */ + const int hostLabelLengths[], /* the length of each label, in CPU memory */ + const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes); /* size of the workspace */ + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int labels[], /* labels, in GPU memory */ + const int labelLengths[], /* the length of each label, in GPU memory */ + const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* 
Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + size_t workSpaceSizeInBytes, /* size of the workspace */ + void *workspace); /* pointer to the workspace, in GPU memory */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. 
Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnAdvTrainVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_ADV_TRAIN_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h new file mode 100644 index 0000000000000000000000000000000000000000..bfebee101195e52a789815eaf94ea0f581072ac4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h @@ -0,0 +1,600 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDNN_BACKEND_H_ +#define _CUDNN_BACKEND_H_ + +/* + * The content in this header file is under development to be included in cudnn.h in the future + * Production code should have all include of this header file remove. 
+ */ + +#include "cudnn_ops_infer.h" +#include "cudnn_cnn_infer.h" + +/* NOTE: definition in extern "C" to be copied later to public header */ +#if defined(__cplusplus) +extern "C" { +#endif + +typedef void *cudnnBackendDescriptor_t; + +typedef struct cudnnFractionStruct { + int64_t numerator; + int64_t denominator; +} cudnnFraction_t; + +typedef enum { + CUDNN_POINTWISE_ADD = 0, + CUDNN_POINTWISE_ADD_SQUARE = 5, + CUDNN_POINTWISE_DIV = 6, + CUDNN_POINTWISE_MAX = 3, + CUDNN_POINTWISE_MIN = 2, + CUDNN_POINTWISE_MOD = 7, + CUDNN_POINTWISE_MUL = 1, + CUDNN_POINTWISE_POW = 8, + CUDNN_POINTWISE_SUB = 9, + + CUDNN_POINTWISE_ABS = 10, + CUDNN_POINTWISE_CEIL = 11, + CUDNN_POINTWISE_COS = 12, + CUDNN_POINTWISE_EXP = 13, + CUDNN_POINTWISE_FLOOR = 14, + CUDNN_POINTWISE_LOG = 15, + CUDNN_POINTWISE_NEG = 16, + CUDNN_POINTWISE_RSQRT = 17, + CUDNN_POINTWISE_SIN = 18, + CUDNN_POINTWISE_SQRT = 4, + CUDNN_POINTWISE_TAN = 19, + CUDNN_POINTWISE_ERF = 20, + CUDNN_POINTWISE_IDENTITY = 21, + + CUDNN_POINTWISE_RELU_FWD = 100, + CUDNN_POINTWISE_TANH_FWD = 101, + CUDNN_POINTWISE_SIGMOID_FWD = 102, + CUDNN_POINTWISE_ELU_FWD = 103, + CUDNN_POINTWISE_GELU_FWD = 104, + CUDNN_POINTWISE_SOFTPLUS_FWD = 105, + CUDNN_POINTWISE_SWISH_FWD = 106, + CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107, + + CUDNN_POINTWISE_RELU_BWD = 200, + CUDNN_POINTWISE_TANH_BWD = 201, + CUDNN_POINTWISE_SIGMOID_BWD = 202, + CUDNN_POINTWISE_ELU_BWD = 203, + CUDNN_POINTWISE_GELU_BWD = 204, + CUDNN_POINTWISE_SOFTPLUS_BWD = 205, + CUDNN_POINTWISE_SWISH_BWD = 206, + CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207, + + CUDNN_POINTWISE_CMP_EQ = 300, + CUDNN_POINTWISE_CMP_NEQ = 301, + CUDNN_POINTWISE_CMP_GT = 302, + CUDNN_POINTWISE_CMP_GE = 303, + CUDNN_POINTWISE_CMP_LT = 304, + CUDNN_POINTWISE_CMP_LE = 305, + + CUDNN_POINTWISE_LOGICAL_AND = 400, + CUDNN_POINTWISE_LOGICAL_OR = 401, + CUDNN_POINTWISE_LOGICAL_NOT = 402, + + CUDNN_POINTWISE_GEN_INDEX = 501, + + CUDNN_POINTWISE_BINARY_SELECT = 601, +} cudnnPointwiseMode_t; + +typedef enum { + 
CUDNN_RESAMPLE_NEAREST = 0, + CUDNN_RESAMPLE_BILINEAR = 1, + CUDNN_RESAMPLE_AVGPOOL = 2, + CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, + CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, + CUDNN_RESAMPLE_MAXPOOL = 3, +} cudnnResampleMode_t; + +typedef enum { + CUDNN_SIGNAL_SET = 0, + CUDNN_SIGNAL_WAIT = 1, +} cudnnSignalMode_t; + +typedef enum { + CUDNN_GENSTATS_SUM_SQSUM = 0, +} cudnnGenStatsMode_t; + +typedef enum { + CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, + CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, +} cudnnBnFinalizeStatsMode_t; + +typedef enum { + CUDNN_RNG_DISTRIBUTION_BERNOULLI, + CUDNN_RNG_DISTRIBUTION_UNIFORM, + CUDNN_RNG_DISTRIBUTION_NORMAL, +} cudnnRngDistribution_t; + +typedef enum { + CUDNN_ATTR_POINTWISE_MODE = 0, + CUDNN_ATTR_POINTWISE_MATH_PREC = 1, + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, + CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, + CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, + CUDNN_ATTR_POINTWISE_AXIS = 9, + + CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, + CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, + CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, + + CUDNN_ATTR_ENGINEHEUR_MODE = 200, + CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, + CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, + + CUDNN_ATTR_ENGINECFG_ENGINE = 300, + CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, + + CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400, + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, + CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, + CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, + + 
CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, + CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, + + CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, + CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, + + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, + + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, + CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, + CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, + CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, + CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, + + CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, + CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, + + 
CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, + CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, + CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, + CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, + + CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800, + CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, + + CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, + CUDNN_ATTR_TENSOR_DATA_TYPE = 901, + CUDNN_ATTR_TENSOR_DIMENSIONS = 902, + CUDNN_ATTR_TENSOR_STRIDES = 903, + CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, + CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, + CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, + CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, + CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, + CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, + + CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, + CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, + CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, + + CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, + CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, + + CUDNN_ATTR_KNOB_INFO_TYPE = 1200, + CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, + CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, + CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, + + 
CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, + CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, + CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, + CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, + + CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, + + CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, + CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, + CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, + CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, + CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, + + CUDNN_ATTR_REDUCTION_OPERATOR = 1600, + CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, + + CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, + CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, + CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, + + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, + + CUDNN_ATTR_RESAMPLE_MODE = 1700, + CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, + CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703, + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704, + CUDNN_ATTR_RESAMPLE_STRIDES = 1705, + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, + CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, + + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 
1710, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, + + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, + + CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, + CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801, + CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, + CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, + + CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, + CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, + CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, + CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, + CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, + + CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, + CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, + CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, + + CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, + 
CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, + CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, + CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, + + CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, + CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, + + CUDNN_ATTR_RNG_DISTRIBUTION = 2300, + CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, + CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, + CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, + CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, + + CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, + CUDNN_ATTR_OPERATION_RNG_SEED = 2311, + CUDNN_ATTR_OPERATION_RNG_DESC = 2312, + +} cudnnBackendAttributeName_t; + +typedef enum { + CUDNN_TYPE_HANDLE = 0, + CUDNN_TYPE_DATA_TYPE, + CUDNN_TYPE_BOOLEAN, + CUDNN_TYPE_INT64, + CUDNN_TYPE_FLOAT, + CUDNN_TYPE_DOUBLE, + CUDNN_TYPE_VOID_PTR, + CUDNN_TYPE_CONVOLUTION_MODE, + CUDNN_TYPE_HEUR_MODE, + CUDNN_TYPE_KNOB_TYPE, + CUDNN_TYPE_NAN_PROPOGATION, + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_TYPE_LAYOUT_TYPE, + CUDNN_TYPE_ATTRIB_NAME, + CUDNN_TYPE_POINTWISE_MODE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + CUDNN_TYPE_GENSTATS_MODE, + CUDNN_TYPE_BN_FINALIZE_STATS_MODE, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_TYPE_TENSOR_REORDERING_MODE, + CUDNN_TYPE_RESAMPLE_MODE, + CUDNN_TYPE_PADDING_MODE, + CUDNN_TYPE_INT32, + CUDNN_TYPE_CHAR, + CUDNN_TYPE_SIGNAL_MODE, + CUDNN_TYPE_FRACTION, + CUDNN_TYPE_NORM_MODE, + CUDNN_TYPE_NORM_FWD_PHASE, + CUDNN_TYPE_RNG_DISTRIBUTION +} cudnnBackendAttributeType_t; + +typedef enum { + CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, + CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, + CUDNN_BACKEND_ENGINE_DESCRIPTOR, + 
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, + CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, + CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, + CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR, + CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR, + CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR, + CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR, + CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, + CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, + CUDNN_BACKEND_TENSOR_DESCRIPTOR, + CUDNN_BACKEND_MATMUL_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR, + CUDNN_BACKEND_REDUCTION_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR, + CUDNN_BACKEND_RESAMPLE_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR, + CUDNN_BACKEND_RNG_DESCRIPTOR, + CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR +} cudnnBackendDescriptorType_t; + +typedef enum { + CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS, + CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION, + CUDNN_NUMERICAL_NOTE_FFT, + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC, + CUDNN_NUMERICAL_NOTE_WINOGRAD, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, +} cudnnBackendNumericalNote_t; + +typedef enum { + CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, + 
CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, + CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, +} cudnnBackendBehaviorNote_t; + +typedef enum { + CUDNN_KNOB_TYPE_SPLIT_K = 0, + CUDNN_KNOB_TYPE_SWIZZLE = 1, + CUDNN_KNOB_TYPE_TILE_SIZE = 2, + CUDNN_KNOB_TYPE_USE_TEX = 3, + CUDNN_KNOB_TYPE_EDGE = 4, + CUDNN_KNOB_TYPE_KBLOCK = 5, + CUDNN_KNOB_TYPE_LDGA = 6, + CUDNN_KNOB_TYPE_LDGB = 7, + CUDNN_KNOB_TYPE_CHUNK_K = 8, + CUDNN_KNOB_TYPE_SPLIT_H = 9, + CUDNN_KNOB_TYPE_WINO_TILE = 10, + CUDNN_KNOB_TYPE_MULTIPLY = 11, + CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, + CUDNN_KNOB_TYPE_TILEK = 13, + CUDNN_KNOB_TYPE_STAGES = 14, + CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, + CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16, + CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, + CUDNN_KNOB_TYPE_IDX_MODE = 18, + CUDNN_KNOB_TYPE_SLICED = 19, + CUDNN_KNOB_TYPE_SPLIT_RS = 20, + CUDNN_KNOB_TYPE_SINGLEBUFFER = 21, + CUDNN_KNOB_TYPE_LDGC = 22, + CUDNN_KNOB_TYPE_SPECFILT = 23, + CUDNN_KNOB_TYPE_KERNEL_CFG = 24, + CUDNN_KNOB_TYPE_WORKSPACE = 25, + CUDNN_KNOB_TYPE_TILE_CGA = 26, + CUDNN_KNOB_TYPE_TILE_CGA_M = 27, + CUDNN_KNOB_TYPE_TILE_CGA_N = 28, + + CUDNN_KNOB_TYPE_COUNTS, +} cudnnBackendKnobType_t; + +typedef enum { + CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, + CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, + CUDNN_LAYOUT_TYPE_COUNT = 4, +} cudnnBackendLayoutType_t; + +typedef enum { + CUDNN_HEUR_MODE_INSTANT = 0, + CUDNN_HEUR_MODE_B = 1, + CUDNN_HEUR_MODE_FALLBACK = 2, + CUDNN_HEUR_MODE_A = 3, + CUDNN_HEUR_MODES_COUNT = 4, +} cudnnBackendHeurMode_t; + +typedef enum { + CUDNN_TENSOR_REORDERING_NONE = 0, + CUDNN_TENSOR_REORDERING_INT8x32 = 1, +} cudnnBackendTensorReordering_t; + +typedef enum { + CUDNN_ZERO_PAD = 0, + CUDNN_NEG_INF_PAD = 1, + CUDNN_EDGE_VAL_PAD = 2, +} cudnnPaddingMode_t; + +typedef enum { + CUDNN_LAYER_NORM = 0, + CUDNN_INSTANCE_NORM = 1, + CUDNN_BATCH_NORM = 2, + CUDNN_GROUP_NORM = 3, +} 
cudnnBackendNormMode_t; + +typedef enum { + CUDNN_NORM_FWD_INFERENCE = 0, + CUDNN_NORM_FWD_TRAINING = 1, +} cudnnBackendNormFwdPhase_t; + +cudnnStatus_t CUDNNWINAPI +cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack); + +#if defined(__cplusplus) +} +#endif + +#endif /* _CUDNN_BACKEND_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h new file mode 100644 index 0000000000000000000000000000000000000000..88335dfb027c5193c7cfc888ed84dc352dc5de13 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h @@ -0,0 +1,1183 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. 
and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops_infer : cuDNN's basic definitions and inference operations. + */ + +#if !defined(CUDNN_OPS_INFER_H_) +#define CUDNN_OPS_INFER_H_ + +#include +#include + +#include "cudnn_version.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_OPS_INFER_MAJOR 8 +#define CUDNN_OPS_INFER_MINOR 7 +#define CUDNN_OPS_INFER_PATCH 0 + +#if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \ + (CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS INFER!!! +#endif + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */ +#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__)) +/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */ +#define CUDNN_DEPRECATED __attribute__((deprecated)) +#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER) +/* Microsoft Visual C++ */ +#define CUDNN_DEPRECATED __declspec(deprecated) +#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L) +/* C++14 compilers */ +#define CUDNN_DEPRECATED [[deprecated]] +#else +/* No support for the deprecated attribute */ +#define CUDNN_DEPRECATED +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI +cudnnGetVersion(void); + +size_t CUDNNWINAPI +cudnnGetMaxDeviceVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI 
+cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum { + CUDNN_STATUS_SUCCESS = 0, + CUDNN_STATUS_NOT_INITIALIZED = 1, + CUDNN_STATUS_ALLOC_FAILED = 2, + CUDNN_STATUS_BAD_PARAM = 3, + CUDNN_STATUS_INTERNAL_ERROR = 4, + CUDNN_STATUS_INVALID_VALUE = 5, + CUDNN_STATUS_ARCH_MISMATCH = 6, + CUDNN_STATUS_MAPPING_ERROR = 7, + CUDNN_STATUS_EXECUTION_FAILED = 8, + CUDNN_STATUS_NOT_SUPPORTED = 9, + CUDNN_STATUS_LICENSE_ERROR = 10, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13, + CUDNN_STATUS_VERSION_MISMATCH = 14, +} cudnnStatus_t; + +/* human-readable error messages */ +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t; + +typedef enum { + CUDNN_ERRQUERY_RAWCODE = 0, + CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); + +#ifndef __LIBRARY_TYPES_H__ + +typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType; + +#endif + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; +typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t; +typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t; +typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t; 
+typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t; +typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t; +typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t; +typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t; +typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t; +/* + * CUDNN data type + */ +typedef enum { + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 = 5, + CUDNN_DATA_UINT8 = 6, + CUDNN_DATA_UINT8x4 = 7, + CUDNN_DATA_INT8x32 = 8, + CUDNN_DATA_BFLOAT16 = 9, + CUDNN_DATA_INT64 = 10, + CUDNN_DATA_BOOLEAN = 11, + CUDNN_DATA_FP8_E4M3 = 12, + CUDNN_DATA_FP8_E5M2 = 13, + CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14, +} cudnnDataType_t; + +/* + * CUDNN math type + */ +typedef enum { + CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2, + CUDNN_FMA_MATH = 3, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum { + CUDNN_NOT_PROPAGATE_NAN = 0, + CUDNN_PROPAGATE_NAN = 1, +} cudnnNanPropagation_t; + +/* + * CUDNN Determinism + */ +typedef enum { + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + +typedef enum { + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */ +} cudnnTensorFormat_t; + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + 
cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride : 1 
+ + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); + +/* Fold/unfold transforms */ +typedef enum { + CUDNN_TRANSFORM_FOLD = 0U, + CUDNN_TRANSFORM_UNFOLD = 1U, +} cudnnFoldingDirection_t; + +/** Create a destination descriptor for cudnnTransformTensor */ +cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes); + +/** Create an empty tensor transform descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc); + +/** Initialize a previously created tensor transform descriptor. */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction); + +/** + * Retrieves the values stored in a previously initialized tensor transform + * descriptor. + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction); + +/** + * Destroys a previously created tensor transform descriptor. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc); + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); + +/* Tensor Bias addition : C = alpha * A + beta * C */ +cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN OpTensor op type + */ +typedef enum { + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt); + +cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. 
*/ +cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN ReduceTensor op type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* + * CUDNN ReduceTensor indices type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t; + +/* + * CUDNN tensor indices type size (all unsigned) + * Currently not supported, default is 32 bit unsigned. 
+ */ +typedef enum { + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc); + +/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and + * output tensors */ +cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output + * tensors */ +cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. 
*/ +/* The indices space is ignored for reduce ops other than min or max. */ +cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha); + +/* Create an instance of FilterStruct */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w); /* width of each input filter */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w); /* width of each input filter */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const 
cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]); +cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size); + +cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc); + +/* + * softmax algorithm + */ +typedef enum { + CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum { + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * pooling mode + */ +typedef enum { + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t; + +/* Create an instance of pooling descriptor */ +cudnnStatus_t CUDNNWINAPI 
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w); + +/* Destroy an instance of pooling descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ +cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t 
poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * activation mode + */ +typedef enum { + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4, + CUDNN_ACTIVATION_IDENTITY = 5, + CUDNN_ACTIVATION_SWISH = 6 +} cudnnActivationMode_t; + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef); /* ceiling for clipped RELU, alpha for ELU */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef); /* ceiling for clipped RELU, alpha for ELU */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta); + +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * Create an instance of LRN (Local Response Normalization) descriptor + * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper + */ 
+cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum { + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* + * Uses a window [center-lookBehind, center+lookAhead], where + * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. + * Values of double parameters cast to tensor data type. + */ +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK); +/* + * Retrieve the settings currently stored in an LRN layer descriptor + * Any of the provided pointers can be NULL (no corresponding value will be returned) + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. 
Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t; + +#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* + * Derives a tensor descriptor from layer data descriptor for BatchNormalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + +typedef enum { + CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */ + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */ + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */ +} cudnnBatchNormOps_t; + +/* + * Performs Batch Normalization during Inference: + * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] + * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed + * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining + * above for notes on function arguments. + */ +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_NORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_NORM_PER_CHANNEL = 1, +} cudnnNormMode_t; + +typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t; + +/* + * Derives a tensor descriptor from layer data descriptor for Normalization + * scale, invVariance, bnBias, bnScale tensors. 
Use this tensor desc for + * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions. + */ +cudnnStatus_t CUDNNWINAPI +cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, + cudnnTensorDescriptor_t derivedNormMeanVarDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnNormMode_t mode, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +typedef enum { + CUDNN_NORM_OPS_NORM = 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */ +} cudnnNormOps_t; + +/* + * Performs Normalization during Inference: + * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k] + * with normScale, normBias, runningMean, runningInvVariance tensors indexed + * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining + * above for notes on function arguments. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardInference(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + const cudnnTensorDescriptor_t normMeanVarDesc, + const void *estimatedMean, + const void *estimatedVariance, + const cudnnTensorDescriptor_t zDesc, + const void *z, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + double epsilon, + int groupCnt); /* Place hold for future work*/ + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR = 0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc); + 
+cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +/* Restores the dropout descriptor to a previously saved-off state */ +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* TODO: remove */ + +typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t; +typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t; + +/* TODO: move these enums out to the appropriate submodule */ +typedef enum { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + 
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef enum { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2, + CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3, + CUDNN_RNN_ALGO_COUNT = 4, +} cudnnRNNAlgo_t; + +typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t; + +/* TODO: remove */ +typedef struct cudnnAlgorithmUnionStruct { + union Algorithm { + cudnnConvolutionFwdAlgo_t convFwdAlgo; + cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo; + cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo; + cudnnRNNAlgo_t RNNAlgo; + cudnnCTCLossAlgo_t CTCLossAlgo; + } algo; +} cudnnAlgorithm_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm); + 
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, + float time, + size_t memory); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, + cudnnStatus_t *status, + float *time, + size_t *memory); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSaveAlgorithm(cudnnHandle_t handle, + cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, + size_t algoSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRestoreAlgorithm(cudnnHandle_t handle, + void *algoSpace, + size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc); + +typedef enum { + CUDNN_SEV_FATAL = 0, + CUDNN_SEV_ERROR = 1, + CUDNN_SEV_WARNING = 2, + CUDNN_SEV_INFO = 3, +} cudnnSeverity_t; + +/* Message masks to be used with cudnnSetCallback() */ +#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR) +#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING) +#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO) + +/* 
struct containing useful informaiton for each API call */ +typedef struct cudnnDebugStruct { + unsigned cudnn_version; + cudnnStatus_t cudnnStatus; + unsigned time_sec; /* epoch time in seconds */ + unsigned time_usec; /* microseconds part of epoch time */ + unsigned time_delta; /* time since start in seconds */ + cudnnHandle_t handle; /* cudnn handle */ + cudaStream_t stream; /* cuda stream ID */ + unsigned long long pid; /* process ID */ + unsigned long long tid; /* thread ID */ + int cudaDeviceId; /* CUDA device ID */ + int reserved[15]; /* reserved for future use */ +} cudnnDebug_t; + +typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsInferVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_INFER_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h new file mode 100644 index 0000000000000000000000000000000000000000..88335dfb027c5193c7cfc888ed84dc352dc5de13 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h @@ -0,0 +1,1183 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops_infer : cuDNN's basic definitions and inference operations. + */ + +#if !defined(CUDNN_OPS_INFER_H_) +#define CUDNN_OPS_INFER_H_ + +#include +#include + +#include "cudnn_version.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_OPS_INFER_MAJOR 8 +#define CUDNN_OPS_INFER_MINOR 7 +#define CUDNN_OPS_INFER_PATCH 0 + +#if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \ + (CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS INFER!!! 
+#endif + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */ +#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__)) +/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */ +#define CUDNN_DEPRECATED __attribute__((deprecated)) +#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER) +/* Microsoft Visual C++ */ +#define CUDNN_DEPRECATED __declspec(deprecated) +#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L) +/* C++14 compilers */ +#define CUDNN_DEPRECATED [[deprecated]] +#else +/* No support for the deprecated attribute */ +#define CUDNN_DEPRECATED +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI +cudnnGetVersion(void); + +size_t CUDNNWINAPI +cudnnGetMaxDeviceVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI +cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum { + CUDNN_STATUS_SUCCESS = 0, + CUDNN_STATUS_NOT_INITIALIZED = 1, + CUDNN_STATUS_ALLOC_FAILED = 2, + CUDNN_STATUS_BAD_PARAM = 3, + CUDNN_STATUS_INTERNAL_ERROR = 4, + CUDNN_STATUS_INVALID_VALUE = 5, + CUDNN_STATUS_ARCH_MISMATCH = 6, + CUDNN_STATUS_MAPPING_ERROR = 7, + CUDNN_STATUS_EXECUTION_FAILED = 8, + CUDNN_STATUS_NOT_SUPPORTED = 9, + CUDNN_STATUS_LICENSE_ERROR = 10, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13, + CUDNN_STATUS_VERSION_MISMATCH = 14, +} cudnnStatus_t; + +/* human-readable error messages */ +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t; + +typedef enum { + CUDNN_ERRQUERY_RAWCODE = 0, + 
CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); + +#ifndef __LIBRARY_TYPES_H__ + +typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType; + +#endif + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; +typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t; +typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t; +typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t; +typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t; +typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t; +typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t; +typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t; +/* + * CUDNN data type + */ +typedef enum { + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 = 5, + CUDNN_DATA_UINT8 = 6, + CUDNN_DATA_UINT8x4 = 7, + CUDNN_DATA_INT8x32 = 8, + CUDNN_DATA_BFLOAT16 = 9, + CUDNN_DATA_INT64 = 10, + CUDNN_DATA_BOOLEAN = 11, + CUDNN_DATA_FP8_E4M3 = 12, + CUDNN_DATA_FP8_E5M2 = 13, + CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14, +} cudnnDataType_t; + +/* + * CUDNN 
math type + */ +typedef enum { + CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2, + CUDNN_FMA_MATH = 3, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum { + CUDNN_NOT_PROPAGATE_NAN = 0, + CUDNN_PROPAGATE_NAN = 1, +} cudnnNanPropagation_t; + +/* + * CUDNN Determinism + */ +typedef enum { + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + +typedef enum { + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */ +} cudnnTensorFormat_t; + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int 
*nStride, + int *cStride, + int *hStride, + int *wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride : 1 + + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); + +/* Fold/unfold transforms */ +typedef enum { + CUDNN_TRANSFORM_FOLD = 0U, + CUDNN_TRANSFORM_UNFOLD = 1U, +} cudnnFoldingDirection_t; + +/** Create a destination descriptor for cudnnTransformTensor */ +cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t 
destDesc, + size_t *destSizeInBytes); + +/** Create an empty tensor transform descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc); + +/** Initialize a previously created tensor transform descriptor. */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction); + +/** + * Retrieves the values stored in a previously initialized tensor transform + * descriptor. + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction); + +/** + * Destroys a previously created tensor transform descriptor. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc); + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); + +/* Tensor Bias addition : C = alpha * A + beta * C */ +cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN OpTensor op type + */ +typedef enum { + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt); + +cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. 
*/ +cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN ReduceTensor op type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* + * CUDNN ReduceTensor indices type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t; + +/* + * CUDNN tensor indices type size (all unsigned) + * Currently not supported, default is 32 bit unsigned. 
+ */ +typedef enum { + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType); + +cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc); + +/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and + * output tensors */ +cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output + * tensors */ +cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. 
*/ +/* The indices space is ignored for reduce ops other than min or max. */ +cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha); + +/* Create an instance of FilterStruct */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w); /* width of each input filter */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w); /* width of each input filter */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const 
cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]); +cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size); + +cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc); + +/* + * softmax algorithm + */ +typedef enum { + CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum { + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * pooling mode + */ +typedef enum { + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t; + +/* Create an instance of pooling descriptor */ +cudnnStatus_t CUDNNWINAPI 
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w); + +/* Destroy an instance of pooling descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ +cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t 
poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * activation mode + */ +typedef enum { + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4, + CUDNN_ACTIVATION_IDENTITY = 5, + CUDNN_ACTIVATION_SWISH = 6 +} cudnnActivationMode_t; + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef); /* ceiling for clipped RELU, alpha for ELU */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef); /* ceiling for clipped RELU, alpha for ELU */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta); + +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * Create an instance of LRN (Local Response Normalization) descriptor + * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper + */ 
+cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum { + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* + * Uses a window [center-lookBehind, center+lookAhead], where + * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. + * Values of double parameters cast to tensor data type. + */ +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK); +/* + * Retrieve the settings currently stored in an LRN layer descriptor + * Any of the provided pointers can be NULL (no corresponding value will be returned) + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. 
Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t; + +#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* + * Derives a tensor descriptor from layer data descriptor for BatchNormalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + +typedef enum { + CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */ + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */ + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */ +} cudnnBatchNormOps_t; + +/* + * Performs Batch Normalization during Inference: + * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] + * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed + * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining + * above for notes on function arguments. + */ +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_NORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_NORM_PER_CHANNEL = 1, +} cudnnNormMode_t; + +typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t; + +/* + * Derives a tensor descriptor from layer data descriptor for Normalization + * scale, invVariance, bnBias, bnScale tensors. 
Use this tensor desc for + * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions. + */ +cudnnStatus_t CUDNNWINAPI +cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, + cudnnTensorDescriptor_t derivedNormMeanVarDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnNormMode_t mode, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +typedef enum { + CUDNN_NORM_OPS_NORM = 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */ +} cudnnNormOps_t; + +/* + * Performs Normalization during Inference: + * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k] + * with normScale, normBias, runningMean, runningInvVariance tensors indexed + * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining + * above for notes on function arguments. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardInference(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + const cudnnTensorDescriptor_t normMeanVarDesc, + const void *estimatedMean, + const void *estimatedVariance, + const cudnnTensorDescriptor_t zDesc, + const void *z, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + double epsilon, + int groupCnt); /* Place hold for future work*/ + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR = 0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc); + 
+cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +/* Restores the dropout descriptor to a previously saved-off state */ +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* TODO: remove */ + +typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t; +typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t; + +/* TODO: move these enums out to the appropriate submodule */ +typedef enum { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + 
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef enum { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2, + CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3, + CUDNN_RNN_ALGO_COUNT = 4, +} cudnnRNNAlgo_t; + +typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t; + +/* TODO: remove */ +typedef struct cudnnAlgorithmUnionStruct { + union Algorithm { + cudnnConvolutionFwdAlgo_t convFwdAlgo; + cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo; + cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo; + cudnnRNNAlgo_t RNNAlgo; + cudnnCTCLossAlgo_t CTCLossAlgo; + } algo; +} cudnnAlgorithm_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm); + 
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, + float time, + size_t memory); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, + cudnnStatus_t *status, + float *time, + size_t *memory); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSaveAlgorithm(cudnnHandle_t handle, + cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, + size_t algoSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRestoreAlgorithm(cudnnHandle_t handle, + void *algoSpace, + size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc); + +typedef enum { + CUDNN_SEV_FATAL = 0, + CUDNN_SEV_ERROR = 1, + CUDNN_SEV_WARNING = 2, + CUDNN_SEV_INFO = 3, +} cudnnSeverity_t; + +/* Message masks to be used with cudnnSetCallback() */ +#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR) +#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING) +#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO) + +/* 
struct containing useful informaiton for each API call */ +typedef struct cudnnDebugStruct { + unsigned cudnn_version; + cudnnStatus_t cudnnStatus; + unsigned time_sec; /* epoch time in seconds */ + unsigned time_usec; /* microseconds part of epoch time */ + unsigned time_delta; /* time since start in seconds */ + cudnnHandle_t handle; /* cudnn handle */ + cudaStream_t stream; /* cuda stream ID */ + unsigned long long pid; /* process ID */ + unsigned long long tid; /* thread ID */ + int cudaDeviceId; /* CUDA device ID */ + int reserved[15]; /* reserved for future use */ +} cudnnDebug_t; + +typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsInferVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_INFER_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h new file mode 100644 index 0000000000000000000000000000000000000000..b16897b7626ebc9d22fd8031932800eb023e65df --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h @@ -0,0 +1,501 @@ +/* + * Copyright 2017-2022 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops_train : cuDNN's basic training operations and algorithms. + */ + +#if !defined(CUDNN_OPS_TRAIN_H_) +#define CUDNN_OPS_TRAIN_H_ + +#include +#include + +#include "cudnn_version.h" +#include "cudnn_ops_infer.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_OPS_TRAIN_MAJOR 8 +#define CUDNN_OPS_TRAIN_MINOR 7 +#define CUDNN_OPS_TRAIN_PATCH 0 + +#if (CUDNN_OPS_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_TRAIN_MINOR != CUDNN_MINOR) || \ + (CUDNN_OPS_TRAIN_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS TRAIN!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Function to perform backward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxBackward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward pooling */ +cudnnStatus_t CUDNNWINAPI +cudnnPoolingBackward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward activation */ +cudnnStatus_t CUDNNWINAPI +cudnnActivationBackward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* LRN cross-channel backward computation. 
Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans); /* output means differential, can be NULL */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI 
+cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes); + +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. 
Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance); + +/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */ +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + + cudnnActivationDescriptor_t activationDesc, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* Performs backward pass of Batch Normalization layer. 
Returns x gradient, +* bnScale gradient and bnBias gradient */ +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackward(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance); + +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, + void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t 
workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +/* Computes y = relu(Norm(x) + z). 
Also accumulates moving averages of mean and inverse variances */ +cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardTraining(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + double exponentialAverageFactor, + const cudnnTensorDescriptor_t normMeanVarDesc, + void *resultRunningMean, + void *resultRunningVariance, + /* Has to be >= 0. Should be the same in forward and backward functions. */ + double epsilon, + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnNormalizationBackward(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const void *normScaleData, + const void *normBiasData, /* needed 
if there is activation */ + void *dNormScaleData, + void *dNormBiasData, + double epsilon, /* Same epsilon as forward pass */ + const cudnnTensorDescriptor_t normMeanVarDesc, + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void *dy, + const cudnnTensorDescriptor_t dxdesc, + void *dx, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsTrainVersionCheck(void); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_TRAIN_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ab66649fe4f20f3a1507d880cfd815a311d70d8 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h new file mode 100644 index 0000000000000000000000000000000000000000..3d11c6a2579a7dba4e61dda86be5a2541d7d21b7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h @@ -0,0 +1,322 @@ + /* Copyright 2005-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/*! 
+* \file cufft.h +* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT) +*/ + +#ifndef _CUFFT_H_ +#define _CUFFT_H_ + + +#include "cuComplex.h" +#include "driver_types.h" +#include "library_types.h" + +#ifndef CUFFTAPI +#ifdef _WIN32 +#define CUFFTAPI __stdcall +#elif __GNUC__ >= 4 +#define CUFFTAPI __attribute__ ((visibility ("default"))) +#else +#define CUFFTAPI +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define CUFFT_VER_MAJOR 10 +#define CUFFT_VER_MINOR 9 +#define CUFFT_VER_PATCH 0 +#define CUFFT_VER_BUILD 58 + +// cuFFT library version +// +// CUFFT_VERSION / 1000 - major version +// CUFFT_VERSION / 100 % 100 - minor version +// CUFFT_VERSION % 100 - patch level +#define CUFFT_VERSION 10900 + +// CUFFT API function return values +typedef enum cufftResult_t { + CUFFT_SUCCESS = 0x0, + CUFFT_INVALID_PLAN = 0x1, + CUFFT_ALLOC_FAILED = 0x2, + CUFFT_INVALID_TYPE = 0x3, + CUFFT_INVALID_VALUE = 0x4, + CUFFT_INTERNAL_ERROR = 0x5, + CUFFT_EXEC_FAILED = 0x6, + CUFFT_SETUP_FAILED = 0x7, + CUFFT_INVALID_SIZE = 0x8, + CUFFT_UNALIGNED_DATA = 0x9, + CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA, + CUFFT_INVALID_DEVICE = 0xB, + CUFFT_PARSE_ERROR = 0xC, + CUFFT_NO_WORKSPACE = 0xD, + CUFFT_NOT_IMPLEMENTED = 0xE, + CUFFT_LICENSE_ERROR = 0x0F, + CUFFT_NOT_SUPPORTED = 0x10 + +} cufftResult; + +#define MAX_CUFFT_ERROR 0x11 + + +// CUFFT defines and supports the following data types + + +// cufftReal is a single-precision, floating-point real data type. +// cufftDoubleReal is a double-precision, real data type. +typedef float cufftReal; +typedef double cufftDoubleReal; + +// cufftComplex is a single-precision, floating-point complex data type that +// consists of interleaved real and imaginary components. +// cufftDoubleComplex is the double-precision equivalent. 
+typedef cuComplex cufftComplex; +typedef cuDoubleComplex cufftDoubleComplex; + +// CUFFT transform directions +#define CUFFT_FORWARD -1 // Forward FFT +#define CUFFT_INVERSE 1 // Inverse FFT + +// CUFFT supports the following transform types +typedef enum cufftType_t { + CUFFT_R2C = 0x2a, // Real to Complex (interleaved) + CUFFT_C2R = 0x2c, // Complex (interleaved) to Real + CUFFT_C2C = 0x29, // Complex to Complex, interleaved + CUFFT_D2Z = 0x6a, // Double to Double-Complex + CUFFT_Z2D = 0x6c, // Double-Complex to Double + CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex +} cufftType; + +// CUFFT supports the following data layouts +typedef enum cufftCompatibility_t { + CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 // The default value +} cufftCompatibility; + +#define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING + +// +// structure definition used by the shim between old and new APIs +// +#define MAX_SHIM_RANK 3 + +// cufftHandle is a handle type used to store and access CUFFT plans. 
+typedef int cufftHandle; + + +cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, + int nx, + cufftType type, + int batch); + +cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, + int nx, int ny, + cufftType type); + +cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, + int nx, int ny, int nz, + cufftType type); + +cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan, + int rank, + int *n, + int *inembed, int istride, int idist, + int *onembed, int ostride, int odist, + cufftType type, + int batch); + +cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan, + int nx, + cufftType type, + int batch, + size_t *workSize); + +cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan, + int nx, int ny, + cufftType type, + size_t *workSize); + +cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan, + int nx, int ny, int nz, + cufftType type, + size_t *workSize); + +cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan, + int rank, + int *n, + int *inembed, int istride, int idist, + int *onembed, int ostride, int odist, + cufftType type, + int batch, + size_t *workSize); + +cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan, + int rank, + long long int *n, + long long int *inembed, + long long int istride, + long long int idist, + long long int *onembed, + long long int ostride, long long int odist, + cufftType type, + long long int batch, + size_t * workSize); + +cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan, + int rank, + long long int *n, + long long int *inembed, + long long int istride, long long int idist, + long long int *onembed, + long long int ostride, long long int odist, + cufftType type, + long long int batch, + size_t *workSize); + + + + +cufftResult CUFFTAPI cufftEstimate1d(int nx, + cufftType type, + int batch, + size_t *workSize); + +cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny, + cufftType type, + size_t *workSize); + +cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz, + cufftType type, + size_t *workSize); + 
+cufftResult CUFFTAPI cufftEstimateMany(int rank, + int *n, + int *inembed, int istride, int idist, + int *onembed, int ostride, int odist, + cufftType type, + int batch, + size_t *workSize); + +cufftResult CUFFTAPI cufftCreate(cufftHandle * handle); + +cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle, + int nx, + cufftType type, + int batch, + size_t *workSize ); + +cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle, + int nx, int ny, + cufftType type, + size_t *workSize); + +cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle, + int nx, int ny, int nz, + cufftType type, + size_t *workSize); + +cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle, + int rank, int *n, + int *inembed, int istride, int idist, + int *onembed, int ostride, int odist, + cufftType type, int batch, size_t *workArea); + +cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize); + +cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea); + +cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate); + +cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, + cufftComplex *idata, + cufftComplex *odata, + int direction); + +cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, + cufftReal *idata, + cufftComplex *odata); + +cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, + cufftComplex *idata, + cufftReal *odata); + +cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan, + cufftDoubleComplex *idata, + cufftDoubleComplex *odata, + int direction); + +cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, + cufftDoubleReal *idata, + cufftDoubleComplex *odata); + +cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, + cufftDoubleComplex *idata, + cufftDoubleReal *odata); + + +// utility functions +cufftResult CUFFTAPI cufftSetStream(cufftHandle plan, + cudaStream_t stream); + +cufftResult CUFFTAPI cufftDestroy(cufftHandle plan); + +cufftResult CUFFTAPI cufftGetVersion(int *version); + +cufftResult CUFFTAPI 
cufftGetProperty(libraryPropertyType type, + int *value); + +#ifdef __cplusplus +} +#endif + +#endif /* _CUFFT_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ec44e1cd050b033dfa40719340dfe3c80b3e679 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6eef99ad56755b346739013ba70ac95be304afcb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h new file mode 100644 index 0000000000000000000000000000000000000000..7e194487a0e2ec02abcb1dd8634c42141a148d84 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h @@ -0,0 +1,87 @@ + /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(CURANDDISCRETE_H_) +#define CURANDDISCRETE_H_ + +struct curandDistributionShift_st { + curandDistribution_t probability; + curandDistribution_t host_probability; + unsigned int shift; + unsigned int length; + unsigned int host_gen; +}; + +struct curandHistogramM2_st { + curandHistogramM2V_t V; + curandHistogramM2V_t host_V; + curandHistogramM2K_t K; + curandHistogramM2K_t host_K; + unsigned int host_gen; +}; + + +struct curandDistributionM2Shift_st { + curandHistogramM2_t histogram; + curandHistogramM2_t host_histogram; + unsigned int shift; + unsigned int length; + unsigned int host_gen; +}; + +struct curandDiscreteDistribution_st { + curandDiscreteDistribution_t self_host_ptr; + curandDistributionM2Shift_t M2; + curandDistributionM2Shift_t host_M2; + double stddev; + double mean; + curandMethod_t method; + unsigned int host_gen; +}; + +#endif // !defined(CURANDDISCRETE_H_) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h new file mode 100644 index 0000000000000000000000000000000000000000..ac827749840488f66d71f492c14dbec80b57a3d1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h @@ -0,0 +1,253 @@ + + /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + + +#if !defined(CURAND_DISCRETE_H_) +#define CURAND_DISCRETE_H_ + +/** + * \defgroup DEVICE Device API + * + * @{ + */ + +#ifndef __CUDACC_RTC__ +#include <math.h> /* NOTE(review): header name was missing in this copy; <math.h> assumed - confirm against the shipped header */ +#endif // __CUDACC_RTC__ + +#include "curand_mrg32k3a.h" +#include "curand_mtgp32_kernel.h" +#include "curand_philox4x32_x.h" + + +/* + * Convert the variate \p x into a discrete sample. + * + * When the distribution's method is CURAND_M2 the result comes from + * _curand_M2_double(); otherwise _curand_normal_icdf_double(x) is scaled by + * stddev, shifted by mean, and rounded to nearest by adding 0.5 before the + * cast to unsigned int. + */ +template <typename T> +QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){ + if (discrete_distribution->method == CURAND_M2){ + return _curand_M2_double(x, discrete_distribution->M2); + } + return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5); +} + + +/* + * Draw one discrete sample using the generator state \p state. + * + * Shared implementation behind the per-generator curand_discrete() overloads: + * CURAND_M2 distributions go through curand_M2_double(); any other method + * scales curand_normal_double(state) by stddev, adds mean and rounds. + */ +template <typename STATE> +QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){ + if (discrete_distribution->method == CURAND_M2){ + return curand_M2_double(state, discrete_distribution->M2); + } + return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest +} + +/* + * Draw four discrete samples at once using the generator state \p state. + * + * CURAND_M2 distributions go through curand_M2_double4(); any other method + * converts each component of curand_normal4_double(state) the same way as + * the single-value variant. + */ +template <typename STATE> +QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){ + if (discrete_distribution->method == CURAND_M2){ + return curand_M2_double4(state, discrete_distribution->M2); + } + double4 _res; + uint4 result; + _res = curand_normal4_double(state); + result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest + result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest + result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + 
discrete_distribution->mean + 0.5); //Round to nearest + result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest + return result; +} + +/* + * \brief Return a discrete distributed unsigned int from a XORWOW generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the XORWOW generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator. + * + * Return four single discrete distributed unsigned ints derived from a + * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state, + * increment position of generator by one. 
+ * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete4(state, discrete_distribution); +} +/* + * \brief Return a discrete distributed unsigned int from a MRG32k3a generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return a discrete distributed unsigned int from a MTGP32 generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the MTGP32 generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return a discrete distributed unsigned int from a Sobol32 generator. 
+ * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the Sobol32 generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return a discrete distributed unsigned int from a Sobol64 generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the Sobol64 generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. 
+ */ +QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +/* + * \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator. + * + * Return a single discrete distributed unsigned int derived from a + * distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state, + * increment position of generator by one. + * + * \param state - Pointer to state to update + * \param discrete_distribution - ancillary structure for discrete distribution + * + * \return unsigned int distributed by distribution defined by \p discrete_distribution. + */ +QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution) +{ + return curand__discrete(state, discrete_distribution); +} + +#endif // !defined(CURAND_DISCRETE_H_) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h new file mode 100644 index 0000000000000000000000000000000000000000..ab8a5a5b78629ba86dbb996a7561b415e90dfb53 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h @@ -0,0 +1,210 @@ +/* + * Copyright 2010-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CURAND_MTGP32_H +#define CURAND_MTGP32_H +/* + * @file curand_mtgp32.h + * + * @brief Mersenne Twister for Graphic Processors (mtgp32), which + * generates 32-bit unsigned integers and single precision floating + * point numbers based on IEEE 754 format. + * + * @author Mutsuo Saito (Hiroshima University) + * @author Makoto Matsumoto (Hiroshima University) + * + */ +/* + * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima + * University. All rights reserved. + * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima + * University and University of Tokyo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of the Hiroshima University nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#define MTGPDC_MEXP 11213 +#define MTGPDC_N 351 +#define MTGPDC_FLOOR_2P 256 +#define MTGPDC_CEIL_2P 512 +#define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213 +#define MTGP32_STATE_SIZE 1024 +#define MTGP32_STATE_MASK 1023 +#define CURAND_NUM_MTGP32_PARAMS 200 +#define MEXP 11213 +#define THREAD_NUM MTGPDC_FLOOR_2P +#define LARGE_SIZE (THREAD_NUM * 3) +#define TBL_SIZE 16 + +/** + * \addtogroup DEVICE Device API + * + * @{ + */ + +/* + * \struct MTGP32_PARAMS_FAST_T + * MTGP32 parameters. + * Some element is redundant to keep structure simple. + * + * \b pos is a pick up position which is selected to have good + * performance on graphic processors. 3 < \b pos < Q, where Q is a + * maximum number such that the size of status array - Q is a power of + * 2. For example, when \b mexp is 44497, size of 32-bit status array + * is 696, and Q is 184, then \b pos is between 4 and 183. This means + * 512 parallel calculations is allowed when \b mexp is 44497. + * + * \b poly_sha1 is SHA1 digest of the characteristic polynomial of + * state transition function. SHA1 is calculated based on printing + * form of the polynomial. This is important when we use parameters + * generated by the dynamic creator which + * + * \b mask This is a mask to make the dimension of state space have + * just Mersenne Prime. This is redundant. + */ + +struct mtgp32_params_fast; + +struct mtgp32_params_fast { + int mexp; /*< Mersenne exponent. 
This is redundant. */ + int pos; /*< pick up position. */ + int sh1; /*< shift value 1. 0 < sh1 < 32. */ + int sh2; /*< shift value 2. 0 < sh2 < 32. */ + unsigned int tbl[16]; /*< a small matrix. */ + unsigned int tmp_tbl[16]; /*< a small matrix for tempering. */ + unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and + converting to float. */ + unsigned int mask; /*< This is a mask for state space */ + unsigned char poly_sha1[21]; /*< SHA1 digest */ +}; + +/** \cond UNHIDE_TYPEDEFS */ +typedef struct mtgp32_params_fast mtgp32_params_fast_t; +/** \endcond */ + +/* + * Generator Parameters. + */ +struct mtgp32_kernel_params; +struct mtgp32_kernel_params { + unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS]; + unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; + unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; + unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; + unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS]; + unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS]; + unsigned int mask[1]; +}; + +/** \cond UNHIDE_TYPEDEFS */ +typedef struct mtgp32_kernel_params mtgp32_kernel_params_t; +/** \endcond */ + + + +/* + * kernel I/O + * This structure must be initialized before first use. + */ + +/* MTGP (Mersenne Twister) RNG */ +/* This generator uses the Mersenne Twister algorithm of + * http://arxiv.org/abs/1005.4973v2 + * Has period 2^11213. 
+*/ + +/** + * CURAND MTGP32 state + */ +struct curandStateMtgp32; + +struct curandStateMtgp32 { + unsigned int s[MTGP32_STATE_SIZE]; + int offset; + int pIdx; + mtgp32_kernel_params_t * k; +}; + +/* + * CURAND MTGP32 state + */ +/** \cond UNHIDE_TYPEDEFS */ +typedef struct curandStateMtgp32 curandStateMtgp32_t; +/** \endcond */ + +/** @} */ + +#endif + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fe8ad5fe1fd3078a5f7a4bc5bcf872c68792d07c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h @@ -0,0 +1,385 @@ +/* + * Copyright 2010-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * curand_mtgp32_kernel.h + * + * + * MTGP32-11213 + * + * Mersenne Twister RNG for the GPU + * + * The period of generated integers is 2^11213-1. + * + * This code generates 32-bit unsigned integers, and + * single precision floating point numbers uniformly distributed + * in the range [1, 2). (float r; 1.0 <= r < 2.0) + */ + +/* + * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima + * University. All rights reserved.
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima + * University and University of Tokyo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of the Hiroshima University nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#if !defined CURAND_MTGP32_KERNEL_H +#define CURAND_MTGP32_KERNEL_H + +#if !defined(QUALIFIERS) +#define QUALIFIERS static __forceinline__ __device__ +#endif + +#ifndef __CUDACC_RTC__ +#include +#include +#include +#include +#endif // ifndef __CUDACC_RTC__ +#include "curand.h" +#include "curand_mtgp32.h" + +/** + * \addtogroup DEVICE Device API + * + * @{ + */ + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host compatibility call +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + + +/* + * The function of the recursion formula calculation. + * @param[in] k the parameter tables; mask, sh1_tbl, sh2_tbl and param_tbl are read. + * @param[in] X1 the farthest part of state array. + * @param[in] X2 the second farthest part of state array. + * @param[in] Y a part of state array. + * @param[in] bid block id; used as the row index into the tables of k. + * @return output + */ +QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) { + unsigned int X = (X1 & k->mask[0]) ^ X2; + unsigned int MAT; + + X ^= X << k->sh1_tbl[bid]; + Y = X ^ (Y >> k->sh2_tbl[bid]); + MAT = k->param_tbl[bid][Y & 0x0f]; + return Y ^ MAT; +} + +/* + * The tempering function. + * @param[in] k the parameter tables; temper_tbl is read. + * @param[in] V the output value to be tempered. + * @param[in] T the tempering helper value. + * @param[in] bid block id; used as the row index into temper_tbl. + * @return the tempered value. + */ +QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) { + unsigned int MAT; + + T ^= T >> 16; + T ^= T >> 8; + MAT = k->temper_tbl[bid][T & 0x0f]; + return V ^ MAT; +} + +/* + * The tempering and converting function. + * By using the preset table, converting to IEEE format + * and tempering are done simultaneously. + * @param[in] k the parameter tables; single_temper_tbl is read. + * @param[in] V the output value to be tempered. + * @param[in] T the tempering helper value. + * @param[in] bid block id; used as the row index into single_temper_tbl. + * @return the tempered and converted value.
+ */ +QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) { + unsigned int MAT; + unsigned int r; + + T ^= T >> 16; + T ^= T >> 8; + MAT = k->single_temper_tbl[bid][T & 0x0f]; + r = (V >> 9) ^ MAT; + return r; +} + +/** + * \brief Return 32-bits of pseudorandomness from a mtgp32 generator. + * + * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state, + * increment position of generator by the number of threads in the block. + * Note the number of threads in the block can not exceed 256. The function executes __syncthreads() in device code, so it must be reached by every thread of the block. NOTE(review): the z term of the thread index is scaled by blockDim.z * blockDim.y rather than blockDim.x * blockDim.y - confirm this is intended for 3D blocks. + * + * \param state - Pointer to state to update + * + * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use. + */ +QUALIFIERS unsigned int curand(curandStateMtgp32_t *state) +{ + unsigned int t; + unsigned int d; + int pos = state->k->pos_tbl[state->pIdx]; + unsigned int r; + unsigned int o; + + d = blockDim.z * blockDim.y * blockDim.x; + //assert( d <= 256 ); + t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x; + r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], + state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], + state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], + state->pIdx); + + state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r; + o = temper(state->k, r, + state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], + state->pIdx); +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + if (t == 0) + { + state->offset = (state->offset + d) & MTGP32_STATE_MASK; + } +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + return o; + +} +/** + * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator. + * + * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state, + * increment position of generator by \p n positions, which must be the total number of positions + * updated in the state by the thread block, for this invocation.
+ * + * Note : + * Thread indices must range from 0...\p n - 1. + * The number of positions updated may not exceed 256. NOTE(review): \p n is an unsigned char, so at most 255 can actually be passed - confirm the intended limit. + * A thread block may update more than one state, but a given state may not be updated by more than one thread block. + * + * \param state - Pointer to state to update + * \param index - Index (0..255) of the position within the state to draw from and update + * \param n - The total number of positions in this state that are being updated by this invocation + * + * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use. + */ +QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n) +{ + unsigned int t; + int pos = state->k->pos_tbl[state->pIdx]; + unsigned int r; + unsigned int o; + + t = index; + r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], + state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], + state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], + state->pIdx); + + state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r; + o = temper(state->k, r, + state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], + state->pIdx); +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + if (index == 0) + { + state->offset = (state->offset + n) & MTGP32_STATE_MASK; + } +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + return o; +} +/** + * \brief Return a uniformly distributed float from a mtgp32 generator. + * + * Return a uniformly distributed float between \p 0.0f and \p 1.0f + * from the mtgp32 generator in \p state, increment position of generator. + * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating + * point outputs are never returned. NOTE(review): the result is the raw bit pattern from temper_single() and the file header describes floats in [1, 2); confirm the range stated here.
+ * + * Note: This alternate derivation of a uniform float is provided for completeness + * with the original source. NOTE(review): unlike curand(), the new word is stored at state->s[t] (no offset, MTGPDC_N or MTGP32_STATE_MASK), t is blockDim.z * blockDim.y + threadIdx.x, and the offset is advanced when threadIdx.x == 0 - confirm this is intended. + * + * \param state - Pointer to state to update + * + * \return uniformly distributed float between \p 0.0f and \p 1.0f + */ +QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state) +{ + unsigned int t; + unsigned int d; + int pos = state->k->pos_tbl[state->pIdx]; + unsigned int r; + unsigned int o_u; + float o_f; + + + t = blockDim.z * blockDim.y; + d = t * blockDim.x; + //assert( d <= 256 ); + t += threadIdx.x; + r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], + state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], + state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], + state->pIdx); + + state->s[t] = r; + o_u = temper_single(state->k, r, + state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], + state->pIdx); +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + if (threadIdx.x == 0) + { + state->offset = (state->offset + d) & MTGP32_STATE_MASK; + } +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + memcpy(&o_f, &o_u, sizeof(o_u)); + return o_f; +} + +/** + * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator. + * + * Return a uniformly distributed float between \p 0.0f and \p 1.0f + * from position \p index of the mtgp32 generator in \p state, and + * increment position of generator by \p n positions, which must be the total number of positions + * updated in the state by the thread block, for this invocation. + * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating + * point outputs are never returned. + * + * Note 1: + * Thread indices must range from 0...\p n - 1. + * The number of positions updated may not exceed 256. + * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
+ * + * Note 2: This alternate derivation of a uniform float is provided for completeness + * with the original source. NOTE(review): unlike curand_mtgp32_specific(), the new word is stored at state->s[t] and the offset is advanced when threadIdx.x == 0 rather than when \p index == 0 - confirm this is intended. + * + * \param state - Pointer to state to update + * \param index - Index (0..255) of the position within the state to draw from and update + * \param n - The total number of positions in this state that are being updated by this invocation + * + * \return uniformly distributed float between \p 0.0f and \p 1.0f + */ +QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n) +{ + unsigned int t; + int pos = state->k->pos_tbl[state->pIdx]; + unsigned int r; + unsigned int o_u; + float o_f; + + t = index; + r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], + state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], + state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], + state->pIdx); + + state->s[t] = r; + o_u = temper_single(state->k, r, + state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], + state->pIdx); +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + if (threadIdx.x == 0) + { + state->offset = (state->offset + n) & MTGP32_STATE_MASK; + } +#if __CUDA_ARCH__ != 0 + __syncthreads(); +#endif + memcpy(&o_f, &o_u, sizeof(o_u)); + return o_f; +} + +/** @} */ + +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h new file mode 100644 index 0000000000000000000000000000000000000000..193cfc5acb0abc446e287097c048bea02675103a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h @@ -0,0 +1,837 @@ + + /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S.
and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + + +#if !defined(CURAND_NORMAL_H_) +#define CURAND_NORMAL_H_ + +/** + * \defgroup DEVICE Device API + * + * @{ + */ + +#ifndef __CUDACC_RTC__ +#include +#endif // __CUDACC_RTC__ + +#include "curand_mrg32k3a.h" +#include "curand_mtgp32_kernel.h" +#include "curand_philox4x32_x.h" +#include "curand_normal_static.h" +/* Box-Muller transform on two 32-bit draws: u and v are x and y scaled by CURAND_2POW32_INV and CURAND_2POW32_INV_2PI plus half a step; returns (s * sin(v), s * cos(v)) with s = sqrtf(-2 * logf(u)). */ +QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y) +{ + float2 result; + float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2); + float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2); +#if __CUDA_ARCH__ > 0 + float s = sqrtf(-2.0f * logf(u)); + __sincosf(v, &result.x, &result.y); +#else + float s = sqrtf(-2.0f * logf(u)); + result.x = sinf(v); + result.y = cosf(v); +#endif + result.x *= s; + result.y *= s; + return result; +} +/* Box-Muller pair from an MRG32k3a state: x = curand_uniform(state), y = curand_uniform(state) * CURAND_2PI; returns (s * sin(y), s * cos(y)) with s = sqrtf(-2 * logf(x)). */ +QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state) +{ + float x, y; + x = curand_uniform(state); + y = curand_uniform(state) * CURAND_2PI; + float2 result; +#if __CUDA_ARCH__ > 0 + float s = sqrtf(-2.0f * logf(x)); + __sincosf(y, &result.x, &result.y); +#else + float s = sqrtf(-2.0f * logf(x)); + result.x = sinf(y); + result.y = cosf(y); +#endif + result.x *= s; + result.y *= s; + return result; +} +/* Double-precision Box-Muller: zx = x0 ^ (x1 << 21) and zy = y0 ^ (y1 << 21) are scaled by CURAND_2POW53_INV_DOUBLE (zy by twice that) and offset; returns (s * sin(pi * v), s * cos(pi * v)) with s = sqrt(-2 * log(u)). */ +QUALIFIERS double2 +_curand_box_muller_double(unsigned int x0, unsigned int x1, + unsigned int y0, unsigned int y1) +{ + double2 result; + unsigned long long zx = (unsigned long long)x0 ^ + ((unsigned long long)x1 << (53 - 32)); + double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0); + unsigned long long zy = (unsigned long long)y0 ^ + ((unsigned long long)y1 << (53 - 32)); + double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) +
CURAND_2POW53_INV_DOUBLE; + double s = sqrt(-2.0 * log(u)); + +#if __CUDA_ARCH__ > 0 + sincospi(v, &result.x, &result.y); +#else + result.x = sin(v*CURAND_PI_DOUBLE); + result.y = cos(v*CURAND_PI_DOUBLE); +#endif + result.x *= s; + result.y *= s; + + return result; +} +/* Double-precision Box-Muller pair from an MRG32k3a state, using two curand_uniform_double() draws; the second is doubled and used as a multiple of pi. */ +QUALIFIERS double2 +curand_box_muller_mrg_double(curandStateMRG32k3a_t * state) +{ + double x, y; + double2 result; + x = curand_uniform_double(state); + y = curand_uniform_double(state) * 2.0; + + double s = sqrt(-2.0 * log(x)); +#if __CUDA_ARCH__ > 0 + sincospi(y, &result.x, &result.y); +#else + result.x = sin(y*CURAND_PI_DOUBLE); + result.y = cos(y*CURAND_PI_DOUBLE); +#endif + result.x *= s; + result.y *= s; + return result; +} +/* Box-Muller float pair from two curand(state) draws. */ +template +QUALIFIERS float2 curand_box_muller(R *state) +{ + float2 result; + unsigned int x = curand(state); + unsigned int y = curand(state); + result = _curand_box_muller(x, y); + return result; +} +/* Four floats from one curand4(state) draw: (x, y) and (z, w) each feed one Box-Muller transform. */ +template +QUALIFIERS float4 curand_box_muller4(R *state) +{ + float4 result; + float2 _result; + uint4 x = curand4(state); + //unsigned int y = curand(state); + _result = _curand_box_muller(x.x, x.y); + result.x = _result.x; + result.y = _result.y; + _result = _curand_box_muller(x.z, x.w); + result.z = _result.x; + result.w = _result.y; + return result; +} +/* Box-Muller double pair from four curand(state) draws. */ +template +QUALIFIERS double2 curand_box_muller_double(R *state) +{ + double2 result; + unsigned int x0 = curand(state); + unsigned int x1 = curand(state); + unsigned int y0 = curand(state); + unsigned int y1 = curand(state); + result = _curand_box_muller_double(x0, x1, y0, y1); + return result; +} +/* Box-Muller double pair from one curand4(state) draw. */ +template +QUALIFIERS double2 curand_box_muller2_double(R *state) +{ + double2 result; + uint4 _x; + _x = curand4(state); + result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w); + return result; +} + +/* Four doubles from two curand4(state) draws, one Box-Muller pair per draw. */ +template +QUALIFIERS double4 curand_box_muller4_double(R *state) +{ + double4 result; + double2 _res1; + double2 _res2; + uint4 _x; + uint4 _y; + _x = curand4(state); + _y = curand4(state); + _res1 =
_curand_box_muller_double(_x.x, _x.y, _x.z, _x.w); + _res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w); + result.x = _res1.x; + result.y = _res1.y; + result.z = _res2.x; + result.w = _res2.y; + return result; +} +/* NOTE(review): the _curand_normal_icdf* definitions below are commented out; the callers later in this file presumably get them from curand_normal_static.h - confirm. */ +//QUALIFIERS float _curand_normal_icdf(unsigned int x) +//{ +//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF) +// float s = CURAND_SQRT2; +// // Mirror to avoid loss of precision +// if(x > 0x80000000UL) { +// x = 0xffffffffUL - x; +// s = -s; +// } +// float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f); +// // p is in (0, 0.5], 2p is in (0, 1] +// return s * erfcinvf(2.0f * p); +//#else +// x++; //suppress warnings +// return 0.0f; +//#endif +//} +// +//QUALIFIERS float _curand_normal_icdf(unsigned long long x) +//{ +//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF) +// unsigned int t = (unsigned int)(x >> 32); +// float s = CURAND_SQRT2; +// // Mirror to avoid loss of precision +// if(t > 0x80000000UL) { +// t = 0xffffffffUL - t; +// s = -s; +// } +// float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f); +// // p is in (0, 0.5], 2p is in (0, 1] +// return s * erfcinvf(2.0f * p); +//#else +// x++; +// return 0.0f; +//#endif +//} +// +//QUALIFIERS double _curand_normal_icdf_double(unsigned int x) +//{ +//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF) +// double s = CURAND_SQRT2_DOUBLE; +// // Mirror to avoid loss of precision +// if(x > 0x80000000UL) { +// x = 0xffffffffUL - x; +// s = -s; +// } +// double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0); +// // p is in (0, 0.5], 2p is in (0, 1] +// return s * erfcinv(2.0 * p); +//#else +// x++; +// return 0.0; +//#endif +//} +// +//QUALIFIERS double _curand_normal_icdf_double(unsigned long long x) +//{ +//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF) +// double s = CURAND_SQRT2_DOUBLE; +// x >>= 11; +// // Mirror to avoid loss of precision +// if(x > 0x10000000000000UL) { +// x = 0x1fffffffffffffUL - x; +// s = -s; +// } +// double p = x *
CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0); +// // p is in (0, 0.5], 2p is in (0, 1] +// return s * erfcinv(2.0 * p); +//#else +// x++; +// return 0.0; +//#endif +//} +// + +/** + * \brief Return a normally distributed float from an XORWOW generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the XORWOW generator in \p state, + * increment position of generator by one on average: a call with no cached value makes two curand() draws and stores the second result in the state for the next call. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results, then returns them one at a time. + * See ::curand_normal2() for a more efficient version that returns + * both results at once. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateXORWOW_t *state) +{ + if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) { + unsigned int x, y; + x = curand(state); + y = curand(state); + float2 v = _curand_box_muller(x, y); + state->boxmuller_extra = v.y; + state->boxmuller_flag = EXTRA_FLAG_NORMAL; + return v.x; + } + state->boxmuller_flag = 0; + return state->boxmuller_extra; +} + +/** + * \brief Return a normally distributed float from a Philox4_32_10 generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state, + * increment position of generator by one on average: a call with no cached value makes two curand() draws and stores the second result in the state for the next call. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results, then returns them one at a time. + * See ::curand_normal2() for a more efficient version that returns + * both results at once.
+ * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ + +QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state) +{ + if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) { + unsigned int x, y; + x = curand(state); + y = curand(state); + float2 v = _curand_box_muller(x, y); + state->boxmuller_extra = v.y; + state->boxmuller_flag = EXTRA_FLAG_NORMAL; + return v.x; + } + state->boxmuller_flag = 0; + return state->boxmuller_extra; +} + + + +/** + * \brief Return a normally distributed float from an MRG32k3a generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the MRG32k3a generator in \p state, + * increment position of generator by one on average: a call with no cached value runs curand_box_muller_mrg() once and stores the second result in the state for the next call. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results, then returns them one at a time. + * See ::curand_normal2() for a more efficient version that returns + * both results at once. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state) +{ + if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) { + float2 v = curand_box_muller_mrg(state); + state->boxmuller_extra = v.y; + state->boxmuller_flag = EXTRA_FLAG_NORMAL; + return v.x; + } + state->boxmuller_flag = 0; + return state->boxmuller_extra; +} + +/** + * \brief Return two normally distributed floats from an XORWOW generator. + * + * Return two normally distributed floats with mean \p 0.0f and + * standard deviation \p 1.0f from the XORWOW generator in \p state, + * increment position of generator by two. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results.
+ * + * \param state - Pointer to state to update + * + * \return Normally distributed float2 where each element is from a + * distribution with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state) +{ + return curand_box_muller(state); +} +/** + * \brief Return two normally distributed floats from a Philox4_32_10 generator. + * + * Return two normally distributed floats with mean \p 0.0f and + * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state, + * increment position of generator by two. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float2 where each element is from a + * distribution with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state) +{ + return curand_box_muller(state); +} + +/** + * \brief Return four normally distributed floats from a Philox4_32_10 generator. + * + * Return four normally distributed floats with mean \p 0.0f and + * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state, + * increment position of generator by four. + * + * The implementation applies the Box-Muller transform twice to generate four + * normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float4 where each element is from a + * distribution with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state) +{ + return curand_box_muller4(state); +} + + + +/** + * \brief Return two normally distributed floats from an MRG32k3a generator. + * + * Return two normally distributed floats with mean \p 0.0f and + * standard deviation \p 1.0f from the MRG32k3a generator in \p state, + * increment position of generator by two.
+ * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float2 where each element is from a + * distribution with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state) +{ + return curand_box_muller_mrg(state); +} + +/** + * \brief Return a normally distributed float from an MTGP32 generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the MTGP32 generator in \p state, + * increment position of generator. The draw is made with curand() on the MTGP32 state, which executes __syncthreads() in device code, so every thread of the block must reach this call. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. NOTE(review): _curand_normal_icdf() is not defined in this file (only a commented-out copy exists); presumably it comes from curand_normal_static.h - confirm. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateMtgp32_t *state) +{ + return _curand_normal_icdf(curand(state)); +} +/** + * \brief Return a normally distributed float from a Sobol32 generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the Sobol32 generator in \p state, + * increment position of generator by one. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateSobol32_t *state) +{ + return _curand_normal_icdf(curand(state)); +} + +/** + * \brief Return a normally distributed float from a scrambled Sobol32 generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state, + * increment position of generator by one.
+ * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state) +{ + return _curand_normal_icdf(curand(state)); +} + +/** + * \brief Return a normally distributed float from a Sobol64 generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the Sobol64 generator in \p state, + * increment position of generator by one. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateSobol64_t *state) +{ + return _curand_normal_icdf(curand(state)); +} + +/** + * \brief Return a normally distributed float from a scrambled Sobol64 generator. + * + * Return a single normally distributed float with mean \p 0.0f and + * standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state, + * increment position of generator by one. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f + */ +QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state) +{ + return _curand_normal_icdf(curand(state)); +} + +/** + * \brief Return a normally distributed double from an XORWOW generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the XORWOW generator in \p state, + * increment position of generator (four curand() draws on a call with no cached value; the second result is stored in the state for the next call).
+ * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results, then returns them one at a time. + * See ::curand_normal2_double() for a more efficient version that returns + * both results at once. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state) +{ + if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) { + unsigned int x0, x1, y0, y1; + x0 = curand(state); + x1 = curand(state); + y0 = curand(state); + y1 = curand(state); + double2 v = _curand_box_muller_double(x0, x1, y0, y1); + state->boxmuller_extra_double = v.y; + state->boxmuller_flag_double = EXTRA_FLAG_NORMAL; + return v.x; + } + state->boxmuller_flag_double = 0; + return state->boxmuller_extra_double; +} + +/** + * \brief Return a normally distributed double from a Philox4_32_10 generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state, + * increment position of generator (one curand4() draw on a call with no cached value; the second result is stored in the state for the next call). + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results, then returns them one at a time. + * See ::curand_normal2_double() for a more efficient version that returns + * both results at once.
+ * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ + +QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state) +{ + if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) { + uint4 _x; + _x = curand4(state); + double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w); + state->boxmuller_extra_double = v.y; + state->boxmuller_flag_double = EXTRA_FLAG_NORMAL; + return v.x; + } + state->boxmuller_flag_double = 0; + return state->boxmuller_extra_double; +} + + +/** + * \brief Return a normally distributed double from an MRG32k3a generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the MRG32k3a generator in \p state, + * increment position of generator. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results, then returns them one at a time. + * See ::curand_normal2_double() for a more efficient version that returns + * both results at once. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state) +{ + if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) { + double2 v = curand_box_muller_mrg_double(state); + state->boxmuller_extra_double = v.y; + state->boxmuller_flag_double = EXTRA_FLAG_NORMAL; + return v.x; + } + state->boxmuller_flag_double = 0; + return state->boxmuller_extra_double; +} + +/** + * \brief Return two normally distributed doubles from an XORWOW generator. + * + * Return two normally distributed doubles with mean \p 0.0 and + * standard deviation \p 1.0 from the XORWOW generator in \p state, + * increment position of generator by 2. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results.
+ * + * \param state - Pointer to state to update + * + * \return Normally distributed double2 where each element is from a + * distribution with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state) +{ + return curand_box_muller_double(state); +} + +/** + * \brief Return two normally distributed doubles from an Philox4_32_10 generator. + * + * Return two normally distributed doubles with mean \p 0.0 and + * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state, + * increment position of generator by 2. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double2 where each element is from a + * distribution with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state) +{ + uint4 _x; + double2 result; + + _x = curand4(state); + double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w); + result.x = v1.x; + result.y = v1.y; + + return result; +} + + // not a part of API +QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state) +{ + uint4 _x; + uint4 _y; + double4 result; + + _x = curand4(state); + _y = curand4(state); + double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w); + double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w); + result.x = v1.x; + result.y = v1.y; + result.z = v2.x; + result.w = v2.y; + + return result; +} + + +/** + * \brief Return two normally distributed doubles from an MRG32k3a generator. + * + * Return two normally distributed doubles with mean \p 0.0 and + * standard deviation \p 1.0 from the MRG32k3a generator in \p state, + * increment position of generator. + * + * The implementation uses a Box-Muller transform to generate two + * normally distributed results. 
+ * + * \param state - Pointer to state to update + * + * \return Normally distributed double2 where each element is from a + * distribution with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state) +{ + return curand_box_muller_mrg_double(state); +} + +/** + * \brief Return a normally distributed double from an MTGP32 generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the MTGP32 generator in \p state, + * increment position of generator. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state) +{ + return _curand_normal_icdf_double(curand(state)); +} + +/** + * \brief Return a normally distributed double from an Sobol32 generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the Sobol32 generator in \p state, + * increment position of generator by one. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateSobol32_t *state) +{ + return _curand_normal_icdf_double(curand(state)); +} + +/** + * \brief Return a normally distributed double from a scrambled Sobol32 generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state, + * increment position of generator by one. 
+ * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state) +{ + return _curand_normal_icdf_double(curand(state)); +} + +/** + * \brief Return a normally distributed double from a Sobol64 generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the Sobol64 generator in \p state, + * increment position of generator by one. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. + * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateSobol64_t *state) +{ + return _curand_normal_icdf_double(curand(state)); +} + +/** + * \brief Return a normally distributed double from a scrambled Sobol64 generator. + * + * Return a single normally distributed double with mean \p 0.0 and + * standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state, + * increment position of generator by one. + * + * The implementation uses the inverse cumulative distribution function + * to generate normally distributed results. 
+ * + * \param state - Pointer to state to update + * + * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0 + */ +QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state) +{ + return _curand_normal_icdf_double(curand(state)); +} +#endif // !defined(CURAND_NORMAL_H_) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c892505e17734560543bbfbd47aa535fade55899 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8376b63625d3d9b2f14e371a45bdb6fe8e2b84a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc differ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..bdc6bd4c72d74ef0f1fe5c2c8b4c49b196927fa7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h @@ -0,0 +1,469 @@ +/* This file was procedurally generated! Do not modify this file by hand. */ + +/* +* Copyright 2009-2016 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO USER: +* +* This source code is subject to NVIDIA ownership rights under U.S. and +* international Copyright laws. +* +* This software and the information contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions +* of a form of NVIDIA software license agreement. +* +* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +* OR PERFORMANCE OF THIS SOURCE CODE. +* +* U.S. 
Government End Users. This source code is a "commercial item" as +* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +* "commercial computer software" and "commercial computer software +* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +* and is provided to the U.S. Government only as a commercial end item. +* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +* source code with only those rights set forth herein. +* +* Any use of this source code in individual and commercial software must +* include, in the user documentation and internal comments to the code, +* the above Disclaimer and U.S. Government End Users Notice. +*/ + +#ifndef NVTX_IMPL_GUARD +#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined). +#endif + +/* ---- Include required platform headers ---- */ + +#if defined(_WIN32) + +#include + +#else +#include + +#if defined(__ANDROID__) +#include +#endif + +#if defined(__linux__) || defined(__CYGWIN__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#endif + +/* ---- Define macros used in this file ---- */ + +#define NVTX_INIT_STATE_FRESH 0 +#define NVTX_INIT_STATE_STARTED 1 +#define NVTX_INIT_STATE_COMPLETE 2 + +#ifdef NVTX_DEBUG_PRINT +#ifdef __ANDROID__ +#include +#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__); +#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__); +#else +#include +#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__) +#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__) +#endif +#else /* !defined(NVTX_DEBUG_PRINT) */ +#define NVTX_ERR(...) +#define NVTX_INFO(...) 
+#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifdef __GNUC__ +#pragma GCC visibility push(hidden) +#endif + +/* ---- Forward declare all functions referenced in globals ---- */ + +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void); +NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)( + NvtxCallbackModule module, + NvtxFunctionTable* out_table, + unsigned int* out_size); +NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)( + uint32_t version); +NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)( + uint32_t exportTableId); + +#include "nvtxInitDecls.h" + +/* ---- Define all globals ---- */ + +typedef struct nvtxGlobals_t +{ + volatile unsigned int initState; + NvtxExportTableCallbacks etblCallbacks; + NvtxExportTableVersionInfo etblVersionInfo; + + /* Implementation function pointers */ + nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr; + nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr; + nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr; + nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr; + nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr; + nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr; + nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr; + nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr; + nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr; + nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr; + nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr; + nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr; + nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr; + nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr; + nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr; + + nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr; + nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr; + 
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr; + nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr; + nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr; + nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr; + nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr; + nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr; + + nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr; + nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr; + nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr; + nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr; + nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr; + nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr; + nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr; + nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr; + nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr; + nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr; + nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr; + nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr; + nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr; + nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr; + + nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr; + nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr; + nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr; + nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr; + nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr; + nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr; + + nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr; + nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr; + nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr; + 
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr; + nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr; + nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr; + nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr; + nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr; + nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr; + nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr; + nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr; + nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr; + nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr; + nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr; + nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr; + + nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr; + nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr; + nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr; + nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr; + nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr; + nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr; + + /* Tables of function pointers -- Extra null added to the end to ensure + * a crash instead of silent corruption if a tool reads off the end. 
*/ + NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1]; + NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1]; + NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1]; + NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1]; + NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1]; + NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1]; +} nvtxGlobals_t; + +NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) = +{ + NVTX_INIT_STATE_FRESH, + + { + sizeof(NvtxExportTableCallbacks), + NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable) + }, + { + sizeof(NvtxExportTableVersionInfo), + NVTX_VERSION, + 0, + NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion) + }, + + /* Implementation function pointers */ + NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init), + + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init), + 
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init), + + NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init), + + NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init), + + NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init), + 
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init), + + NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init), + NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init), + + /* Tables of function pointers */ + { + 0, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr, + 
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr, + 0 + }, + { + 0, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr, + 0 + }, + { + 0, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr, + 
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr, + 0 + }, + { + 0, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr, + 0 + }, + { + 0, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr, + 
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr, + 0 + }, + { + 0, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr, + (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr, + 0 + } +}; + +/* ---- Define static inline implementations of core API functions ---- */ + +#include "nvtxImplCore.h" + +/* ---- Define implementations of export table functions ---- */ + +NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)( + NvtxCallbackModule module, + NvtxFunctionTable* out_table, + unsigned int* out_size) +{ + unsigned int bytes = 0; + NvtxFunctionTable table = (NvtxFunctionTable)0; + + switch (module) + { + case NVTX_CB_MODULE_CORE: + table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE; + bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE); + break; + case NVTX_CB_MODULE_CUDA: + table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA; + bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA); + break; + 
case NVTX_CB_MODULE_OPENCL: + table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL; + bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL); + break; + case NVTX_CB_MODULE_CUDART: + table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART; + bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART); + break; + case NVTX_CB_MODULE_CORE2: + table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2; + bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2); + break; + case NVTX_CB_MODULE_SYNC: + table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC; + bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC); + break; + default: return 0; + } + + if (out_size) + *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1; + + if (out_table) + *out_table = table; + + return 1; +} + +NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId) +{ + switch (exportTableId) + { + case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks; + case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo; + default: return 0; + } +} + +NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version) +{ + /* Reserved for custom implementations to resolve problems with tools */ + (void)version; +} + +/* ---- Define implementations of init versions of all API functions ---- */ + +#include "nvtxInitDefs.h" + +/* ---- Define implementations of initialization functions ---- */ + +#include "nvtxInit.h" + +#ifdef __GNUC__ +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ce3282d395d3c5d71d173eae68b324d59e90d2f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 new file mode 100644 index 0000000000000000000000000000000000000000..4338353fb6b12c216bd3c4f3919230931aab4aeb Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cufft_cu11-10.9.0.58.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cufft_cu11-10.9.0.58.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cufft_cu11-10.9.0.58.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..680500f28b6b1cb9f9bf1b1c350711563f34e50f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD @@ -0,0 +1,22 @@ +nvidia/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nvidia/__pycache__/__init__.cpython-311.pyc,, +nvidia/cusolver/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nvidia/cusolver/__pycache__/__init__.cpython-311.pyc,, +nvidia/cusolver/include/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc,, +nvidia/cusolver/include/cusolverDn.h,sha256=8KUcqUxWPr8jpz3ZVpTB6I3IXMme1ok7E7vi9XXKRzk,147406 +nvidia/cusolver/include/cusolverMg.h,sha256=N8989nnS2BleeMyuftbQgBDJ4sMAkLPSnmy_S_7fxng,11549 +nvidia/cusolver/include/cusolverRf.h,sha256=7BZfWeuMJ8w1Pz4iZeGmwvDZbDNNq0ivG5MHtiATtls,14292 +nvidia/cusolver/include/cusolverSp.h,sha256=8fev0XawDBd0xrOxUlQ3WhclKlUuVAT64zKxwnP8iT0,32561 +nvidia/cusolver/include/cusolverSp_LOWLEVEL_PREVIEW.h,sha256=rTuS0rxwGV3bAz50ua59WVPQ9SvlijORj732oPejoCk,37495 +nvidia/cusolver/include/cusolver_common.h,sha256=oTUxSnGYIrH8ESAV-s0BgSSMyq5u9hsPtFcCbODMRM4,8825 +nvidia/cusolver/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nvidia/cusolver/lib/__pycache__/__init__.cpython-311.pyc,, +nvidia/cusolver/lib/libcusolver.so.11,sha256=EugAsYUnRSYliOioPoeYlG3NtytSNUvnLJTjkkwNYIg,302736312 +nvidia/cusolver/lib/libcusolverMg.so.11,sha256=XQ8dThn3AEDlKEn-5-Q_ncv3R47DlpQpReMymoa4dXU,184721832 +nvidia_cusolver_cu11-11.4.1.48.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 
+nvidia_cusolver_cu11-11.4.1.48.dist-info/License.txt,sha256=rW9YU_ugyg0VnQ9Y1JrkmDDC-Mk_epJki5zpCttMbM0,59262 +nvidia_cusolver_cu11-11.4.1.48.dist-info/METADATA,sha256=9ErGf3O-O5QWTmfl8S-pAEt1JQvSEDHuBS8Aq81nfs4,1552 +nvidia_cusolver_cu11-11.4.1.48.dist-info/RECORD,, +nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL,sha256=-kQi_VMfvRQozZJT7HUPMfY-5vLo0LVTmAylNJ3Ft98,106 +nvidia_cusolver_cu11-11.4.1.48.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..06e355fe0e3ed7077903f119ae6928a17da8eb6f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia_cusolver_cu11-11.4.1.48.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.37.1) +Root-Is-Purelib: true +Tag: py3-none-manylinux1_x86_64 + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging-24.2.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging-24.2.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..1479c8694bfbd583a896dbe9bd33cdb6d7e7371e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/packaging-24.2.dist-info/METADATA @@ -0,0 +1,102 @@ +Metadata-Version: 2.3 +Name: packaging +Version: 24.2 +Summary: Core utilities for Python packages +Author-email: Donald Stufft +Requires-Python: >=3.8 +Description-Content-Type: text/x-rst +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: Apache Software License +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: Python 
+Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Programming Language :: Python :: Implementation :: PyPy +Classifier: Typing :: Typed +Project-URL: Documentation, https://packaging.pypa.io/ +Project-URL: Source, https://github.com/pypa/packaging + +packaging +========= + +.. start-intro + +Reusable core utilities for various Python Packaging +`interoperability specifications `_. + +This library provides utilities that implement the interoperability +specifications which have clearly one correct behaviour (eg: :pep:`440`) +or benefit greatly from having a single shared implementation (eg: :pep:`425`). + +.. end-intro + +The ``packaging`` project includes the following: version handling, specifiers, +markers, requirements, tags, utilities. + +Documentation +------------- + +The `documentation`_ provides information and the API for the following: + +- Version Handling +- Specifiers +- Markers +- Requirements +- Tags +- Utilities + +Installation +------------ + +Use ``pip`` to install these utilities:: + + pip install packaging + +The ``packaging`` library uses calendar-based versioning (``YY.N``). + +Discussion +---------- + +If you run into bugs, you can file them in our `issue tracker`_. + +You can also join ``#pypa`` on Freenode to ask questions or get involved. + + +.. _`documentation`: https://packaging.pypa.io/ +.. 
_`issue tracker`: https://github.com/pypa/packaging/issues + + +Code of Conduct +--------------- + +Everyone interacting in the packaging project's codebases, issue trackers, chat +rooms, and mailing lists is expected to follow the `PSF Code of Conduct`_. + +.. _PSF Code of Conduct: https://github.com/pypa/.github/blob/main/CODE_OF_CONDUCT.md + +Contributing +------------ + +The ``CONTRIBUTING.rst`` file outlines how to contribute to this project as +well as how to report a potential security issue. The documentation for this +project also covers information about `project development`_ and `security`_. + +.. _`project development`: https://packaging.pypa.io/en/latest/development/ +.. _`security`: https://packaging.pypa.io/en/latest/security/ + +Project History +--------------- + +Please review the ``CHANGELOG.rst`` file or the `Changelog documentation`_ for +recent changes and project history. + +.. _`Changelog documentation`: https://packaging.pypa.io/en/latest/changelog/ + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Tools.cmake b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Tools.cmake new file mode 100644 index 0000000000000000000000000000000000000000..bed5e08039abfffb59f1861573a6b835fc4d5a99 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Tools.cmake @@ -0,0 +1,239 @@ +# tools/pybind11Tools.cmake -- Build system for the pybind11 modules +# +# Copyright (c) 2020 Wenzel Jakob +# +# All rights reserved. Use of this source code is governed by a +# BSD-style license that can be found in the LICENSE file. 
+ +# include_guard(global) (pre-CMake 3.10) +if(TARGET pybind11::python_headers) + return() +endif() + +# Built-in in CMake 3.5+ +include(CMakeParseArguments) + +if(pybind11_FIND_QUIETLY) + set(_pybind11_quiet QUIET) +else() + set(_pybind11_quiet "") +endif() + +# If this is the first run, PYTHON_VERSION can stand in for PYBIND11_PYTHON_VERSION +if(NOT DEFINED PYBIND11_PYTHON_VERSION AND DEFINED PYTHON_VERSION) + message(WARNING "Set PYBIND11_PYTHON_VERSION to search for a specific version, not " + "PYTHON_VERSION (which is an output). Assuming that is what you " + "meant to do and continuing anyway.") + set(PYBIND11_PYTHON_VERSION + "${PYTHON_VERSION}" + CACHE STRING "Python version to use for compiling modules") + unset(PYTHON_VERSION) + unset(PYTHON_VERSION CACHE) +elseif(DEFINED PYBIND11_PYTHON_VERSION) + # If this is set as a normal variable, promote it + set(PYBIND11_PYTHON_VERSION + "${PYBIND11_PYTHON_VERSION}" + CACHE STRING "Python version to use for compiling modules") +else() + # Make an empty cache variable. 
+ set(PYBIND11_PYTHON_VERSION + "" + CACHE STRING "Python version to use for compiling modules") +endif() + +# A user can set versions manually too +set(Python_ADDITIONAL_VERSIONS + "3.12;3.11;3.10;3.9;3.8;3.7" + CACHE INTERNAL "") + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") +find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED ${_pybind11_quiet}) +list(REMOVE_AT CMAKE_MODULE_PATH -1) + +# Makes a normal variable a cached variable +macro(_PYBIND11_PROMOTE_TO_CACHE NAME) + set(_tmp_ptc "${${NAME}}") + # CMake 3.21 complains if a cached variable is shadowed by a normal one + unset(${NAME}) + set(${NAME} + "${_tmp_ptc}" + CACHE INTERNAL "") +endmacro() + +# Cache variables so pybind11_add_module can be used in parent projects +_pybind11_promote_to_cache(PYTHON_INCLUDE_DIRS) +_pybind11_promote_to_cache(PYTHON_LIBRARIES) +_pybind11_promote_to_cache(PYTHON_MODULE_PREFIX) +_pybind11_promote_to_cache(PYTHON_MODULE_EXTENSION) +_pybind11_promote_to_cache(PYTHON_MODULE_DEBUG_POSTFIX) +_pybind11_promote_to_cache(PYTHON_VERSION_MAJOR) +_pybind11_promote_to_cache(PYTHON_VERSION_MINOR) +_pybind11_promote_to_cache(PYTHON_VERSION) +_pybind11_promote_to_cache(PYTHON_IS_DEBUG) + +if(PYBIND11_MASTER_PROJECT) + if(PYTHON_MODULE_EXTENSION MATCHES "pypy") + if(NOT DEFINED PYPY_VERSION) + execute_process( + COMMAND ${PYTHON_EXECUTABLE} -c + [=[import sys; sys.stdout.write(".".join(map(str, sys.pypy_version_info[:3])))]=] + OUTPUT_VARIABLE pypy_version) + set(PYPY_VERSION + ${pypy_version} + CACHE INTERNAL "") + endif() + message(STATUS "PYPY ${PYPY_VERSION} (Py ${PYTHON_VERSION})") + else() + message(STATUS "PYTHON ${PYTHON_VERSION}") + endif() +endif() + +# Only add Python for build - must be added during the import for config since +# it has to be re-discovered. +# +# This needs to be an target to it is included after the local pybind11 +# directory, just in case there are multiple versions of pybind11, we want the +# one we expect. 
+add_library(pybind11::python_headers INTERFACE IMPORTED) +set_property(TARGET pybind11::python_headers PROPERTY INTERFACE_INCLUDE_DIRECTORIES + "$") +set_property( + TARGET pybind11::pybind11 + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python_headers) + +set(pybind11_INCLUDE_DIRS + "${pybind11_INCLUDE_DIR}" "${PYTHON_INCLUDE_DIRS}" + CACHE INTERNAL "Directories where pybind11 and possibly Python headers are located") + +# Python debug libraries expose slightly different objects before 3.8 +# https://docs.python.org/3.6/c-api/intro.html#debugging-builds +# https://stackoverflow.com/questions/39161202/how-to-work-around-missing-pymodule-create2-in-amd64-win-python35-d-lib +if(PYTHON_IS_DEBUG) + set_property( + TARGET pybind11::pybind11 + APPEND + PROPERTY INTERFACE_COMPILE_DEFINITIONS Py_DEBUG) +endif() + +# The <3.11 code here does not support release/debug builds at the same time, like on vcpkg +if(CMAKE_VERSION VERSION_LESS 3.11) + set_property( + TARGET pybind11::module + APPEND + PROPERTY + INTERFACE_LINK_LIBRARIES + pybind11::python_link_helper + "$<$,$>:$>" + ) + + set_property( + TARGET pybind11::embed + APPEND + PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11 $) +else() + # The IMPORTED INTERFACE library here is to ensure that "debug" and "release" get processed outside + # of a generator expression - https://gitlab.kitware.com/cmake/cmake/-/issues/18424, as they are + # target_link_library keywords rather than real libraries. 
+ add_library(pybind11::_ClassicPythonLibraries IMPORTED INTERFACE) + target_link_libraries(pybind11::_ClassicPythonLibraries INTERFACE ${PYTHON_LIBRARIES}) + target_link_libraries( + pybind11::module + INTERFACE + pybind11::python_link_helper + "$<$,$>:pybind11::_ClassicPythonLibraries>") + + target_link_libraries(pybind11::embed INTERFACE pybind11::pybind11 + pybind11::_ClassicPythonLibraries) +endif() + +function(pybind11_extension name) + # The prefix and extension are provided by FindPythonLibsNew.cmake + set_target_properties( + ${name} + PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}" + DEBUG_POSTFIX "${PYTHON_MODULE_DEBUG_POSTFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}") +endfunction() + +# Build a Python extension module: +# pybind11_add_module( [MODULE | SHARED] [EXCLUDE_FROM_ALL] +# [NO_EXTRAS] [THIN_LTO] [OPT_SIZE] source1 [source2 ...]) +# +function(pybind11_add_module target_name) + set(options "MODULE;SHARED;EXCLUDE_FROM_ALL;NO_EXTRAS;SYSTEM;THIN_LTO;OPT_SIZE") + cmake_parse_arguments(ARG "${options}" "" "" ${ARGN}) + + if(ARG_MODULE AND ARG_SHARED) + message(FATAL_ERROR "Can't be both MODULE and SHARED") + elseif(ARG_SHARED) + set(lib_type SHARED) + else() + set(lib_type MODULE) + endif() + + if(ARG_EXCLUDE_FROM_ALL) + set(exclude_from_all EXCLUDE_FROM_ALL) + else() + set(exclude_from_all "") + endif() + + add_library(${target_name} ${lib_type} ${exclude_from_all} ${ARG_UNPARSED_ARGUMENTS}) + + target_link_libraries(${target_name} PRIVATE pybind11::module) + + if(ARG_SYSTEM) + message( + STATUS + "Warning: this does not have an effect - use NO_SYSTEM_FROM_IMPORTED if using imported targets" + ) + endif() + + pybind11_extension(${target_name}) + + # -fvisibility=hidden is required to allow multiple modules compiled against + # different pybind versions to work properly, and for some features (e.g. + # py::module_local). 
We force it on everything inside the `pybind11` + # namespace; also turning it on for a pybind module compilation here avoids + # potential warnings or issues from having mixed hidden/non-hidden types. + if(NOT DEFINED CMAKE_CXX_VISIBILITY_PRESET) + set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden") + endif() + + if(NOT DEFINED CMAKE_CUDA_VISIBILITY_PRESET) + set_target_properties(${target_name} PROPERTIES CUDA_VISIBILITY_PRESET "hidden") + endif() + + if(ARG_NO_EXTRAS) + return() + endif() + + if(NOT DEFINED CMAKE_INTERPROCEDURAL_OPTIMIZATION) + if(ARG_THIN_LTO) + target_link_libraries(${target_name} PRIVATE pybind11::thin_lto) + else() + target_link_libraries(${target_name} PRIVATE pybind11::lto) + endif() + endif() + + if(DEFINED CMAKE_BUILD_TYPE) # see https://github.com/pybind/pybind11/issues/4454 + # Use case-insensitive comparison to match the result of $ + string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) + if(NOT MSVC AND NOT "${uppercase_CMAKE_BUILD_TYPE}" MATCHES DEBUG|RELWITHDEBINFO) + pybind11_strip(${target_name}) + endif() + endif() + + if(MSVC) + target_link_libraries(${target_name} PRIVATE pybind11::windows_extras) + endif() + + if(ARG_OPT_SIZE) + target_link_libraries(${target_name} PRIVATE pybind11::opt_size) + endif() +endfunction() + +# Provide general way to call common Python commands in "common" file. 
+set(_Python + PYTHON + CACHE INTERNAL "" FORCE) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport3.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport3.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94217e9fd621ac98ff5825293af6fbdc8b176b9b Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport3.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyximport.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyximport.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae4a76d2a9d1d7bd57a6871b07669fc75fe089f0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyximport.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/pyximport.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/pyximport.py new file mode 100644 index 0000000000000000000000000000000000000000..9d575815afb9dc59834f058b1861dcfa38834ada --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/pyximport.py @@ -0,0 +1,11 @@ +from __future__ import absolute_import +import sys + +if sys.version_info < (3, 5): + # _pyximport3 module requires at least Python 3.5 + from pyximport._pyximport2 import install, uninstall, show_docs +else: + from pyximport._pyximport3 import install, uninstall, show_docs + +if __name__ == '__main__': + show_docs() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..d381712b4a356e4ad2c066c3b87abead312f4b81 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/__init__.py @@ -0,0 +1,2038 @@ + +r""" +The torch package contains data structures for multi-dimensional +tensors and defines mathematical operations over these tensors. +Additionally, it provides many utilities for efficient serialization of +Tensors and arbitrary types, and other useful utilities. + +It has a CUDA counterpart, that enables you to run your tensor computations +on an NVIDIA GPU with compute capability >= 3.0. +""" + +import math +import os +import sys +import platform +import textwrap +import ctypes +import inspect +import threading + +# multipy/deploy is setting this import before importing torch, this is the most +# reliable way we have to detect if we're running within deploy. +# https://github.com/pytorch/multipy/blob/d60f34ad38c371e441fe7ffdb77a3c3dda5a5d19/multipy/runtime/interpreter/interpreter_impl.cpp#L134-L137 +def _running_with_deploy(): + return sys.modules.get("torch._meta_registrations", None) is object + +from ._utils import _import_dotted_name, classproperty +from ._utils import _functionalize_sync as _sync +from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ + USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS + +# TODO(torch_deploy) figure out how to freeze version.py in fbcode build +if _running_with_deploy(): + __version__ = "torch-deploy-1.8" +else: + from .torch_version import __version__ as __version__ + +from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, TYPE_CHECKING, Union, List +import builtins + +__all__ = [ + 'typename', 'is_tensor', 'is_storage', + 'set_default_tensor_type', 'set_default_device', 'get_default_device', + 'set_rng_state', 'get_rng_state', 'manual_seed', 'initial_seed', 'seed', + 'save', 'load', 'set_printoptions', 'chunk', 'split', 'stack', 'matmul', + 'no_grad', 'enable_grad', 'rand', 
'randn', 'inference_mode', + 'DoubleStorage', 'FloatStorage', 'LongStorage', 'IntStorage', + 'ShortStorage', 'CharStorage', 'ByteStorage', 'BoolStorage', + 'TypedStorage', 'UntypedStorage', + 'DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor', + 'ShortTensor', 'CharTensor', 'ByteTensor', 'BoolTensor', 'Tensor', + 'lobpcg', 'use_deterministic_algorithms', + 'are_deterministic_algorithms_enabled', + 'is_deterministic_algorithms_warn_only_enabled', + 'set_deterministic_debug_mode', 'get_deterministic_debug_mode', + 'set_float32_matmul_precision', 'get_float32_matmul_precision', + 'set_warn_always', 'is_warn_always_enabled', 'SymInt', 'SymFloat', + 'SymBool', 'sym_not', 'unravel_index', + 'sym_int', 'sym_float', 'sym_max', 'sym_min', 'sym_ite', 'compile', 'vmap', + 'export', 'autocast', 'cond', 'GradScaler', +] + +################################################################################ +# Load the extension module +################################################################################ + +if sys.platform == 'win32': + pfiles_path = os.getenv('ProgramFiles', 'C:\\Program Files') + py_dll_path = os.path.join(sys.exec_prefix, 'Library', 'bin') + th_dll_path = os.path.join(os.path.dirname(__file__), 'lib') + + # When users create a virtualenv that inherits the base environment, + # we will need to add the corresponding library directory into + # DLL search directories. Otherwise, it will rely on `PATH` which + # is dependent on user settings. 
+ if sys.exec_prefix != sys.base_exec_prefix: + base_py_dll_path = os.path.join(sys.base_exec_prefix, 'Library', 'bin') + else: + base_py_dll_path = '' + + dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path])) + + if all(not os.path.exists(os.path.join(p, 'nvToolsExt64_1.dll')) for p in dll_paths): + nvtoolsext_dll_path = os.path.join( + os.getenv('NVTOOLSEXT_PATH', os.path.join(pfiles_path, 'NVIDIA Corporation', 'NvToolsExt')), 'bin', 'x64') + else: + nvtoolsext_dll_path = '' + + from .version import cuda as cuda_version + import glob + if cuda_version and all(not glob.glob(os.path.join(p, 'cudart64*.dll')) for p in dll_paths): + cuda_version_1 = cuda_version.replace('.', '_') + cuda_path_var = 'CUDA_PATH_V' + cuda_version_1 + default_path = os.path.join(pfiles_path, 'NVIDIA GPU Computing Toolkit', 'CUDA', 'v' + cuda_version) + cuda_path = os.path.join(os.getenv(cuda_path_var, default_path), 'bin') + else: + cuda_path = '' + + dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path])) + + kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True) + with_load_library_flags = hasattr(kernel32, 'AddDllDirectory') + prev_error_mode = kernel32.SetErrorMode(0x0001) + + kernel32.LoadLibraryW.restype = ctypes.c_void_p + if with_load_library_flags: + kernel32.LoadLibraryExW.restype = ctypes.c_void_p + + for dll_path in dll_paths: + os.add_dll_directory(dll_path) + + try: + ctypes.CDLL('vcruntime140.dll') + ctypes.CDLL('msvcp140.dll') + ctypes.CDLL('vcruntime140_1.dll') + except OSError: + print('''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure. 
+ It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe''') + + dlls = glob.glob(os.path.join(th_dll_path, '*.dll')) + path_patched = False + for dll in dlls: + is_loaded = False + if with_load_library_flags: + res = kernel32.LoadLibraryExW(dll, None, 0x00001100) + last_error = ctypes.get_last_error() + if res is None and last_error != 126: + err = ctypes.WinError(last_error) + err.strerror += f' Error loading "{dll}" or one of its dependencies.' + raise err + elif res is not None: + is_loaded = True + if not is_loaded: + if not path_patched: + os.environ['PATH'] = ';'.join(dll_paths + [os.environ['PATH']]) + path_patched = True + res = kernel32.LoadLibraryW(dll) + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += f' Error loading "{dll}" or one of its dependencies.' + raise err + + kernel32.SetErrorMode(prev_error_mode) + + +def _preload_cuda_deps(lib_folder, lib_name): + """Preloads cuda deps if they could not be found otherwise.""" + # Should only be called on Linux if default path resolution have failed + assert platform.system() == 'Linux', 'Should only be called on Linux' + import glob + lib_path = None + for path in sys.path: + nvidia_path = os.path.join(path, 'nvidia') + if not os.path.exists(nvidia_path): + continue + candidate_lib_paths = glob.glob(os.path.join(nvidia_path, lib_folder, 'lib', lib_name)) + if candidate_lib_paths and not lib_path: + lib_path = candidate_lib_paths[0] + if lib_path: + break + if not lib_path: + raise ValueError(f"{lib_name} not found in the system path {sys.path}") + ctypes.CDLL(lib_path) + + +# See Note [Global dependencies] +def _load_global_deps() -> None: + if _running_with_deploy() or platform.system() == 'Windows': + return + + lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') + here = os.path.abspath(__file__) + lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name) + + try: + ctypes.CDLL(lib_path, 
mode=ctypes.RTLD_GLOBAL) + except OSError as err: + # Can only happen for wheel with cuda libs as PYPI deps + # As PyTorch is not purelib, but nvidia-*-cu12 is + cuda_libs: Dict[str, str] = { + 'cublas': 'libcublas.so.*[0-9]', + 'cudnn': 'libcudnn.so.*[0-9]', + 'cuda_nvrtc': 'libnvrtc.so.*[0-9]', + 'cuda_runtime': 'libcudart.so.*[0-9]', + 'cuda_cupti': 'libcupti.so.*[0-9]', + 'cufft': 'libcufft.so.*[0-9]', + 'curand': 'libcurand.so.*[0-9]', + 'cusolver': 'libcusolver.so.*[0-9]', + 'cusparse': 'libcusparse.so.*[0-9]', + 'nccl': 'libnccl.so.*[0-9]', + 'nvtx': 'libnvToolsExt.so.*[0-9]', + } + is_cuda_lib_err = [lib for lib in cuda_libs.values() if lib.split('.')[0] in err.args[0]] + if not is_cuda_lib_err: + raise err + for lib_folder, lib_name in cuda_libs.items(): + _preload_cuda_deps(lib_folder, lib_name) + ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL) + + +if (USE_RTLD_GLOBAL_WITH_LIBTORCH or os.getenv('TORCH_USE_RTLD_GLOBAL')) and \ + (_running_with_deploy() or platform.system() != 'Windows'): + # Do it the hard way. You might want to load libtorch with RTLD_GLOBAL in a + # few circumstances: + # + # 1. You're in a build environment (e.g., fbcode) where + # libtorch_global_deps is not available, but you still need + # to get mkl to link in with RTLD_GLOBAL or it will just + # not work. + # + # 2. You're trying to run PyTorch under UBSAN and you need + # to ensure that only one copy of libtorch is loaded, so + # vptr checks work properly + # + # If you're using this setting, you must verify that all the libraries + # you load consistently use the same libstdc++, or you may have + # mysterious segfaults. + # + old_flags = sys.getdlopenflags() + sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_LAZY) + from torch._C import * # noqa: F403 + sys.setdlopenflags(old_flags) + del old_flags + +else: + # Easy way. 
You want this most of the time, because it will prevent + # C++ symbols from libtorch clobbering C++ symbols from other + # libraries, leading to mysterious segfaults. + # + # If building in an environment where libtorch_global_deps isn't available + # like parts of fbsource, but where RTLD_GLOBAL causes segfaults, you will + # want USE_RTLD_GLOBAL_WITH_LIBTORCH = False and USE_GLOBAL_DEPS = False + # + # See Note [Global dependencies] + if USE_GLOBAL_DEPS: + _load_global_deps() + from torch._C import * # noqa: F403 + +# Appease the type checker; ordinarily this binding is inserted by the +# torch._C module initialization code in C +if TYPE_CHECKING: + from . import _C as _C + +class SymInt: + """ + Like an int (including magic methods), but redirects all operations on the + wrapped node. This is used in particular to symbolically record operations + in the symbolic shape workflow. + """ + + def __init__(self, node): + # This field MUST be named node; C++ binding code assumes that this + # class has a field named node that stores SymNode + self.node = node + + def __bool__(self): + return builtins.bool(self != 0) + + def __int__(self): + return self.node.int_() + + def __index__(self): + return self.node.int_() + + # Magic methods installed by torch.fx.experimental.sym_node + + def __eq__(self, other: object) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __lt__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __gt__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __le__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __ge__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __add__(self, other) -> "SymInt": + raise AssertionError("type stub not overridden") + + def __mul__(self, other) -> "SymInt": + raise AssertionError("type stub not overridden") + + def __sym_max__(self, 
other): + raise AssertionError("type stub not overridden") + + def __sym_min__(self, other): + raise AssertionError("type stub not overridden") + + def __sym_float__(self): + raise AssertionError("type stub not overridden") + + def __neg__(self): + raise AssertionError("type stub not overridden") + + def __repr__(self): + return str(self.node) + + def __hash__(self) -> builtins.int: + if self.node.is_nested_int(): + return hash(self.node.nested_int()) + else: + # We could support constant SymInts as well, but not doing it for now + raise TypeError("unhashable type: non-nested SymInt") + +class SymFloat: + """ + Like an float (including magic methods), but redirects all operations on the + wrapped node. This is used in particular to symbolically record operations + in the symbolic shape workflow. + """ + + def __init__(self, node): + # This field MUST be named node; C++ binding code assumes that this + # class has a field named node that stores SymNode + self.node = node + + def __bool__(self): + return self.node.bool_() + + # Magic methods installed by torch.fx.experimental.sym_node + + def __eq__(self, other: object) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __lt__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __gt__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __le__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __ge__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __sym_max__(self, other): + raise AssertionError("type stub not overridden") + + def __sym_min__(self, other): + raise AssertionError("type stub not overridden") + + def __sym_int__(self): + raise AssertionError("type stub not overridden") + + def is_integer(self): + """Return True if the float is an integer.""" + raise AssertionError("type stub not overridden") + + def __repr__(self): + return 
self.node.str() + +class SymBool: + """ + Like an bool (including magic methods), but redirects all operations on the + wrapped node. This is used in particular to symbolically record operations + in the symbolic shape workflow. + + Unlike regular bools, regular boolean operators will force extra guards instead + of symbolically evaluate. Use the bitwise operators instead to handle this. + """ + + def __init__(self, node): + # This field MUST be named node; C++ binding code assumes that this + # class has a field named node that stores SymNode + self.node = node + + def __bool__(self): + return self.node.bool_() + + def __int__(self): + return builtins.int(self.node.bool_()) + + # Magic methods installed by torch.fx.experimental.sym_node + def __and__(self, other) -> "SymBool": + raise AssertionError("type stub not overridden") + + def __or__(self, other) -> "SymBool": + raise AssertionError("type stub not overridden") + + # We very carefully define __sym_not__, and not a number of other + # plausible alternatives: + # + # - We do not override __not__ because this is not a real magic + # method; you cannot override the meaning of the not builtin in + # Python. We use the name 'sym_not' to clarify that in user code you + # cannot use the builtin not or operator.not_ or operator.__not__ and + # hit this magic method; you must use our custom sym_not operator. + # + # - We do not override the __invert__ method because SymBool is + # meant to be usable in situations where bool is expected. However, + # bitwise negation ~a does the wrong thing with booleans (because + # bool is a subclass of int, so ~1 = -2 which is not falseish.) + # This would be a giant footgun, so we get around it by defining + # our own operator. Note that bitwise and/or do the right thing, + # so we reuse the conventional operators there for readability. 
+ # + def __sym_not__(self) -> "SymBool": + raise AssertionError("type stub not overridden") + + def __sym_ite__(self, then_val, else_val): + raise AssertionError("type stub not overridden") + + def __eq__(self, other) -> builtins.bool: + raise AssertionError("type stub not overridden") + + def __repr__(self): + return str(self.node) + + def __hash__(self): + if self.node.is_constant(): + return hash(self.node.bool_()) + else: + raise TypeError("unhashable type: SymBool") + +def sym_not(a): + r""" SymInt-aware utility for logical negation. + + Args: + a (SymBool or bool): Object to negate + """ + import sympy + from .overrides import has_torch_function_unary, handle_torch_function + + if has_torch_function_unary(a): + return handle_torch_function(sym_not, (a,), a) + if hasattr(a, '__sym_not__'): + return a.__sym_not__() + if isinstance(a, sympy.Basic): + return ~a # type: ignore[operator] + return not a + +def sym_float(a): + r""" SymInt-aware utility for float casting. + + Args: + a (SymInt, SymFloat, or object): Object to cast + """ + from .overrides import has_torch_function_unary, handle_torch_function + + if has_torch_function_unary(a): + return handle_torch_function(sym_float, (a,), a) + if isinstance(a, SymFloat): + return a + elif hasattr(a, '__sym_float__'): + return a.__sym_float__() + return py_float(a) # type: ignore[operator] + + +def sym_int(a): + r""" SymInt-aware utility for int casting. 
+ + Args: + a (SymInt, SymFloat, or object): Object to cast + """ + from .overrides import has_torch_function_unary, handle_torch_function + + if has_torch_function_unary(a): + return handle_torch_function(sym_int, (a,), a) + if isinstance(a, SymInt): + return a + elif isinstance(a, SymFloat): + return math.floor(a) if a >= 0 else math.ceil(a) # type: ignore[arg-type, call-overload] + return py_int(a) # type: ignore[operator] + +def sym_max(a, b): + """ SymInt-aware utility for max().""" + from .overrides import has_torch_function, handle_torch_function + + if has_torch_function((a, b)): + return handle_torch_function(sym_max, (a, b), a, b) + if isinstance(a, (SymInt, SymFloat)): + return a.__sym_max__(b) + elif isinstance(b, (SymInt, SymFloat)): + # NB: If you actually care about preserving output type exactly + # if you do something like max(0, 0.0), it is NOT sound to treat + # min/max as commutative + return b.__sym_max__(a) + return builtins.max(a, b) # type: ignore[operator] + +def sym_min(a, b): + """ SymInt-aware utility for max().""" + from .overrides import has_torch_function, handle_torch_function + + if has_torch_function((a, b)): + return handle_torch_function(sym_min, (a, b), a, b) + if isinstance(a, (SymInt, SymFloat)): + return a.__sym_min__(b) + elif isinstance(b, (SymInt, SymFloat)): + return b.__sym_min__(a) + return builtins.min(a, b) # type: ignore[operator] + +# Drop in replacement for math.sqrt, math.sin, math.cos etc +current_module = sys.modules[__name__] + +def _get_sym_math_fn(name): + def fn(a): + from .overrides import has_torch_function_unary, handle_torch_function + + if has_torch_function_unary(a): + return handle_torch_function(fn, (a,), a) + if hasattr(a, f"__sym_{name}__"): + return getattr(a, f"__sym_{name}__")() + return getattr(math, name)(a) + + return fn + +for name in ("sqrt", "cos", "cosh", "sin", "sinh", "tan", "tanh", "asin", "acos", "atan"): + sym_name = f"_sym_{name}" + fn = _get_sym_math_fn(name) + fn.__qualname__ = 
fn.__name__ = sym_name + setattr(current_module, sym_name, fn) + +# Adding temporary shortcut +sym_sqrt = current_module._sym_sqrt +__all__.append("sym_sqrt") + +del fn, name, sym_name, current_module # type: ignore[possibly-undefined] + + +def sym_ite(b, t, f): + from .overrides import has_torch_function, handle_torch_function + + if has_torch_function((b, t, f)): + return handle_torch_function(sym_ite, (b, t, f), b, t, f) + assert isinstance(b, (SymBool, builtins.bool)) and type(t) == type(f) + if isinstance(b, SymBool): + return b.__sym_ite__(t, f) + return t if b else f + +# Check to see if we can load C extensions, and if not provide some guidance +# on what the problem might be. +try: + # _initExtension is chosen (arbitrarily) as a sentinel. + from torch._C import _initExtension +except ImportError: + import torch._C as _C_for_compiled_check + + # The __file__ check only works for Python 3.7 and above. + if _C_for_compiled_check.__file__ is None: + raise ImportError(textwrap.dedent(''' + Failed to load PyTorch C extensions: + It appears that PyTorch has loaded the `torch/_C` folder + of the PyTorch repository rather than the C extensions which + are expected in the `torch._C` namespace. This can occur when + using the `install` workflow. e.g. + $ python setup.py install && python -c "import torch" + + This error can generally be solved using the `develop` workflow + $ python setup.py develop && python -c "import torch" # This should succeed + or by running Python from a different directory. + ''').strip()) from None + raise # If __file__ is not None the cause is unknown, so just re-raise. 
+ +for name in dir(_C): + if name[0] != '_' and not name.endswith('Base'): + __all__.append(name) + obj = getattr(_C, name) + if (isinstance(obj, Callable) or inspect.isclass(obj)): # type: ignore[arg-type] + if (obj.__module__ != 'torch'): + # TODO: fix their module from C++ side + if name not in ['DisableTorchFunctionSubclass', 'DisableTorchFunction', 'Generator']: + obj.__module__ = 'torch' + elif name == 'TensorBase': + # issue 109438 / pr 109940. Prevent TensorBase from being copied into torch. + delattr(sys.modules[__name__], name) + +if not TYPE_CHECKING: + # issue 38137 and python issue 43367. Submodules of a C extension are + # non-standard, and attributes of those submodules cannot be pickled since + # pickle expect to be able to import them as "from _C.sub import attr" + # which fails with "_C is not a package + for attr in dir(_C): + candidate = getattr(_C, attr) + if type(candidate) is type(_C): + # submodule + if f'torch._C.{attr}' not in sys.modules: + sys.modules[f'torch._C.{attr}'] = candidate + + +################################################################################ +# Define basic utilities +################################################################################ + + +def typename(o): + if isinstance(o, torch.Tensor): + return o.type() + + module = '' + class_name = '' + if hasattr(o, '__module__') and o.__module__ != 'builtins' \ + and o.__module__ != '__builtin__' and o.__module__ is not None: + module = o.__module__ + '.' + + if hasattr(o, '__qualname__'): + class_name = o.__qualname__ + elif hasattr(o, '__name__'): + class_name = o.__name__ + else: + class_name = o.__class__.__name__ + + return module + class_name + + +def is_tensor(obj): + r"""Returns True if `obj` is a PyTorch tensor. + + Note that this function is simply doing ``isinstance(obj, Tensor)``. + Using that ``isinstance`` check is better for typechecking with mypy, + and more explicit - so it's recommended to use that instead of + ``is_tensor``. 
+ + Args: + obj (Object): Object to test + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> torch.is_tensor(x) + True + + """ + return isinstance(obj, torch.Tensor) + + +def is_storage(obj): + r"""Returns True if `obj` is a PyTorch storage object. + + Args: + obj (Object): Object to test + """ + return type(obj) in _storage_classes + + +_GLOBAL_DEVICE_CONTEXT = threading.local() + + +def get_default_device() -> "torch.device": + r"""Gets the default ``torch.Tensor`` to be allocated on ``device``""" + global _GLOBAL_DEVICE_CONTEXT + if hasattr(_GLOBAL_DEVICE_CONTEXT, "device_context"): + device = _GLOBAL_DEVICE_CONTEXT.device_context.device + if device.index is not None: + return device + else: + # TODO: Call like get_device_index() method corresponding to + # each device type + return torch.tensor([]).device + else: + return torch.device("cpu") + + +def set_default_device(device): + """Sets the default ``torch.Tensor`` to be allocated on ``device``. This + does not affect factory function calls which are called with an explicit + ``device`` argument. Factory calls will be performed as if they + were passed ``device`` as an argument. + + To only temporarily change the default device instead of setting it + globally, use ``with torch.device(device):`` instead. + + The default device is initially ``cpu``. If you set the default tensor + device to another device (e.g., ``cuda``) without a device index, tensors + will be allocated on whatever the current device for the device type, + even after :func:`torch.cuda.set_device` is called. + + .. warning:: + + This function imposes a slight performance cost on every Python + call to the torch API (not just factory functions). If this + is causing problems for you, please comment on + https://github.com/pytorch/pytorch/issues/92701 + + .. 
note:: + + This doesn't affect functions that create tensors that share the same memory as the input, like: + :func:`torch.from_numpy` and :func:`torch.frombuffer` + + Args: + device (device or string): the device to set as default + + Example:: + + >>> # xdoctest: +SKIP("requires cuda, changes global state") + >>> torch.get_default_device() + device(type='cpu') + >>> torch.set_default_device('cuda') # current device is 0 + >>> torch.get_default_device() + device(type='cuda', index=0) + >>> torch.set_default_device('cuda') + >>> torch.cuda.set_device('cuda:1') # current device is 1 + >>> torch.get_default_device() + device(type='cuda', index=1) + >>> torch.set_default_device('cuda:1') + >>> torch.get_default_device() + device(type='cuda', index=1) + + """ + global _GLOBAL_DEVICE_CONTEXT + if hasattr(_GLOBAL_DEVICE_CONTEXT, "device_context"): + device_context = _GLOBAL_DEVICE_CONTEXT.device_context + if device_context is not None: + device_context.__exit__(None, None, None) + + if device is None: + device_context = None + else: + from torch.utils._device import DeviceContext + device_context = DeviceContext(device) + device_context.__enter__() + _GLOBAL_DEVICE_CONTEXT.device_context = device_context + + +def set_default_tensor_type(t): + r""" + .. warning:: + + This function is deprecated as of PyTorch 2.1, please use :func:`torch.set_default_dtype()` and + :func:`torch.set_default_device()` as alternatives. + + Sets the default ``torch.Tensor`` type to floating point tensor type + ``t``. This type will also be used as default floating point type for + type inference in :func:`torch.tensor`. + + The default floating point tensor type is initially ``torch.FloatTensor``. + + Args: + t (type or string): the floating point tensor type or its name + + Example:: + + >>> # xdoctest: +SKIP("Other tests may have changed the default type. 
Can we reset it?") + >>> torch.tensor([1.2, 3]).dtype # initial default for floating point is torch.float32 + torch.float32 + >>> torch.set_default_tensor_type(torch.DoubleTensor) + >>> torch.tensor([1.2, 3]).dtype # a new floating point tensor + torch.float64 + + """ + if isinstance(t, str): + t = _import_dotted_name(t) + _C._set_default_tensor_type(t) + + +def set_default_dtype(d): + r""" + + Sets the default floating point dtype to :attr:`d`. Supports torch.float32 + and torch.float64 as inputs. Other dtypes may be accepted without complaint + but are not supported and are unlikely to work as expected. + + When PyTorch is initialized its default floating point dtype is torch.float32, + and the intent of set_default_dtype(torch.float64) is to facilitate NumPy-like + type inference. The default floating point dtype is used to: + + 1. Implicitly determine the default complex dtype. When the default floating point + type is float32 the default complex dtype is complex64, and when the default + floating point type is float64 the default complex type is complex128. + 2. Infer the dtype for tensors constructed using Python floats or complex Python + numbers. See examples below. + 3. Determine the result of type promotion between bool and integer tensors and + Python floats and complex Python numbers. + + Args: + d (:class:`torch.dtype`): the floating point dtype to make the default. + Either torch.float32 or torch.float64. + + Example: + >>> # xdoctest: +SKIP("Other tests may have changed the default type. 
Can we reset it?") + >>> # initial default for floating point is torch.float32 + >>> # Python floats are interpreted as float32 + >>> torch.tensor([1.2, 3]).dtype + torch.float32 + >>> # initial default for floating point is torch.complex64 + >>> # Complex Python numbers are interpreted as complex64 + >>> torch.tensor([1.2, 3j]).dtype + torch.complex64 + + >>> torch.set_default_dtype(torch.float64) + + >>> # Python floats are now interpreted as float64 + >>> torch.tensor([1.2, 3]).dtype # a new floating point tensor + torch.float64 + >>> # Complex Python numbers are now interpreted as complex128 + >>> torch.tensor([1.2, 3j]).dtype # a new complex tensor + torch.complex128 + + """ + _C._set_default_dtype(d) + +def use_deterministic_algorithms(mode: builtins.bool, *, warn_only: builtins.bool = False) -> None: + r""" Sets whether PyTorch operations must use "deterministic" + algorithms. That is, algorithms which, given the same input, and when + run on the same software and hardware, always produce the same output. + When enabled, operations will use deterministic algorithms when available, + and if only nondeterministic algorithms are available they will throw a + :class:`RuntimeError` when called. + + .. note:: This setting alone is not always enough to make an application + reproducible. Refer to :ref:`reproducibility` for more information. + + .. note:: :func:`torch.set_deterministic_debug_mode` offers an alternative + interface for this feature. 
+ + The following normally-nondeterministic operations will act + deterministically when ``mode=True``: + + * :class:`torch.nn.Conv1d` when called on CUDA tensor + * :class:`torch.nn.Conv2d` when called on CUDA tensor + * :class:`torch.nn.Conv3d` when called on CUDA tensor + * :class:`torch.nn.ConvTranspose1d` when called on CUDA tensor + * :class:`torch.nn.ConvTranspose2d` when called on CUDA tensor + * :class:`torch.nn.ConvTranspose3d` when called on CUDA tensor + * :class:`torch.nn.ReplicationPad2d` when attempting to differentiate a CUDA tensor + * :func:`torch.bmm` when called on sparse-dense CUDA tensors + * :func:`torch.Tensor.__getitem__` when attempting to differentiate a CPU tensor + and the index is a list of tensors + * :func:`torch.Tensor.index_put` with ``accumulate=False`` + * :func:`torch.Tensor.index_put` with ``accumulate=True`` when called on a CPU + tensor + * :func:`torch.Tensor.put_` with ``accumulate=True`` when called on a CPU + tensor + * :func:`torch.Tensor.scatter_add_` when called on a CUDA tensor + * :func:`torch.gather` when called on a CUDA tensor that requires grad + * :func:`torch.index_add` when called on CUDA tensor + * :func:`torch.index_select` when attempting to differentiate a CUDA tensor + * :func:`torch.repeat_interleave` when attempting to differentiate a CUDA tensor + * :func:`torch.Tensor.index_copy` when called on a CPU or CUDA tensor + * :func:`torch.Tensor.scatter` when `src` type is Tensor and called on CUDA tensor + * :func:`torch.Tensor.scatter_reduce` when ``reduce='sum'`` or ``reduce='mean'`` and called on CUDA tensor + + The following normally-nondeterministic operations will throw a + :class:`RuntimeError` when ``mode=True``: + + * :class:`torch.nn.AvgPool3d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.AdaptiveAvgPool2d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.AdaptiveAvgPool3d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.MaxPool3d` when 
attempting to differentiate a CUDA tensor + * :class:`torch.nn.AdaptiveMaxPool2d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.FractionalMaxPool2d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.FractionalMaxPool3d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.MaxUnpool1d` + * :class:`torch.nn.MaxUnpool2d` + * :class:`torch.nn.MaxUnpool3d` + * :func:`torch.nn.functional.interpolate` when attempting to differentiate a CUDA tensor + and one of the following modes is used: + + - ``linear`` + - ``bilinear`` + - ``bicubic`` + - ``trilinear`` + + * :class:`torch.nn.ReflectionPad1d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.ReflectionPad2d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.ReflectionPad3d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.ReplicationPad1d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.ReplicationPad3d` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.NLLLoss` when called on a CUDA tensor + * :class:`torch.nn.CTCLoss` when attempting to differentiate a CUDA tensor + * :class:`torch.nn.EmbeddingBag` when attempting to differentiate a CUDA tensor when + ``mode='max'`` + * :func:`torch.Tensor.put_` when ``accumulate=False`` + * :func:`torch.Tensor.put_` when ``accumulate=True`` and called on a CUDA tensor + * :func:`torch.histc` when called on a CUDA tensor + * :func:`torch.bincount` when called on a CUDA tensor and ``weights`` + tensor is given + * :func:`torch.kthvalue` with called on a CUDA tensor + * :func:`torch.median` with indices output when called on a CUDA tensor + * :func:`torch.nn.functional.grid_sample` when attempting to differentiate a CUDA tensor + * :func:`torch.cumsum` when called on a CUDA tensor when dtype is floating point or complex + * :func:`torch.Tensor.scatter_reduce` when ``reduce='prod'`` and called on CUDA tensor + * :func:`torch.Tensor.resize_` 
when called with a quantized tensor + + In addition, several operations fill uninitialized memory when this setting + is turned on and when + :attr:`torch.utils.deterministic.fill_uninitialized_memory` is turned on. + See the documentation for that attribute for more information. + + A handful of CUDA operations are nondeterministic if the CUDA version is + 10.2 or greater, unless the environment variable ``CUBLAS_WORKSPACE_CONFIG=:4096:8`` + or ``CUBLAS_WORKSPACE_CONFIG=:16:8`` is set. See the CUDA documentation for more + details: ``_ + If one of these environment variable configurations is not set, a :class:`RuntimeError` + will be raised from these operations when called with CUDA tensors: + + * :func:`torch.mm` + * :func:`torch.mv` + * :func:`torch.bmm` + + Note that deterministic operations tend to have worse performance than + nondeterministic operations. + + .. note:: + + This flag does not detect or prevent nondeterministic behavior caused + by calling an inplace operation on a tensor with an internal memory + overlap or by giving such a tensor as the :attr:`out` argument for an + operation. In these cases, multiple writes of different data may target + a single memory location, and the order of writes is not guaranteed. + + Args: + mode (:class:`bool`): If True, makes potentially nondeterministic + operations switch to a deterministic algorithm or throw a runtime + error. If False, allows nondeterministic operations. + + Keyword args: + warn_only (:class:`bool`, optional): If True, operations that do not + have a deterministic implementation will throw a warning instead of + an error. Default: ``False`` + + Example:: + + >>> # xdoctest: +SKIP + >>> torch.use_deterministic_algorithms(True) + + # Forward mode nondeterministic error + >>> torch.randn(10, device='cuda').kthvalue(1) + ... + RuntimeError: kthvalue CUDA does not have a deterministic implementation... 
+ + # Backward mode nondeterministic error + >>> torch.nn.AvgPool3d(1)(torch.randn(3, 4, 5, 6, requires_grad=True).cuda()).sum().backward() + ... + RuntimeError: avg_pool3d_backward_cuda does not have a deterministic implementation... + """ + _C._set_deterministic_algorithms(mode, warn_only=warn_only) + +def are_deterministic_algorithms_enabled() -> builtins.bool: + r"""Returns True if the global deterministic flag is turned on. Refer to + :func:`torch.use_deterministic_algorithms` documentation for more details. + """ + return _C._get_deterministic_algorithms() + +def is_deterministic_algorithms_warn_only_enabled() -> builtins.bool: + r"""Returns True if the global deterministic flag is set to warn only. + Refer to :func:`torch.use_deterministic_algorithms` documentation for more + details. + """ + return _C._get_deterministic_algorithms_warn_only() + +def set_deterministic_debug_mode(debug_mode: Union[builtins.int, str]) -> None: + r"""Sets the debug mode for deterministic operations. + + .. note:: This is an alternative interface for + :func:`torch.use_deterministic_algorithms`. Refer to that function's + documentation for details about affected operations. + + Args: + debug_mode(str or int): If "default" or 0, don't error or warn on + nondeterministic operations. If "warn" or 1, warn on + nondeterministic operations. If "error" or 2, error on + nondeterministic operations. 
+ """ + + # NOTE: builtins.int is used here because int in this scope resolves + # to torch.int + if not isinstance(debug_mode, (builtins.int, str)): + raise TypeError(f'debug_mode must be str or int, but got {type(debug_mode)}') + + if isinstance(debug_mode, str): + if debug_mode == 'default': + debug_mode = 0 + elif debug_mode == 'warn': + debug_mode = 1 + elif debug_mode == 'error': + debug_mode = 2 + else: + raise RuntimeError( + 'invalid value of debug_mode, expected one of `default`, ' + f'`warn`, `error`, but got {debug_mode}') + + if debug_mode == 0: + _C._set_deterministic_algorithms(False) + elif debug_mode == 1: + _C._set_deterministic_algorithms(True, warn_only=True) + elif debug_mode == 2: + _C._set_deterministic_algorithms(True) + else: + raise RuntimeError( + 'invalid value of debug_mode, expected 0, 1, or 2, ' + f'but got {debug_mode}') + +def get_deterministic_debug_mode() -> builtins.int: + r"""Returns the current value of the debug mode for deterministic + operations. Refer to :func:`torch.set_deterministic_debug_mode` + documentation for more details. + """ + + if _C._get_deterministic_algorithms(): + if _C._get_deterministic_algorithms_warn_only(): + return 1 + else: + return 2 + else: + return 0 + +def get_float32_matmul_precision() -> builtins.str: + r"""Returns the current value of float32 matrix multiplication precision. Refer to + :func:`torch.set_float32_matmul_precision` documentation for more details. + """ + return _C._get_float32_matmul_precision() + +def set_float32_matmul_precision(precision: str) -> None: + r"""Sets the internal precision of float32 matrix multiplications. + + Running float32 matrix multiplications in lower precision may significantly increase + performance, and in some programs the loss of precision has a negligible impact. + + Supports three settings: + + * "highest", float32 matrix multiplications use the float32 datatype (24 mantissa + bits with 23 bits explicitly stored) for internal computations. 
+ * "high", float32 matrix multiplications either use the TensorFloat32 datatype (10 + mantissa bits explicitly stored) or treat each float32 number as the sum of two bfloat16 numbers + (approximately 16 mantissa bits with 14 bits explicitly stored), if the appropriate fast matrix multiplication + algorithms are available. Otherwise float32 matrix multiplications are computed + as if the precision is "highest". See below for more information on the bfloat16 + approach. + * "medium", float32 matrix multiplications use the bfloat16 datatype (8 mantissa + bits with 7 bits explicitly stored) for internal computations, if a fast matrix multiplication algorithm + using that datatype internally is available. Otherwise float32 + matrix multiplications are computed as if the precision is "high". + + When using "high" precision, float32 multiplications may use a bfloat16-based algorithm + that is more complicated than simply truncating to some smaller number mantissa bits + (e.g. 10 for TensorFloat32, 7 for bfloat16 explicitly stored). Refer to [Henry2019]_ for a complete + description of this algorithm. To briefly explain here, the first step is to realize + that we can perfectly encode a single float32 number as the sum of three bfloat16 + numbers (because float32 has 23 mantissa bits while bfloat16 has 7 explicitly stored, and both have the + same number of exponent bits). This means that the product of two float32 numbers can + be exactly given by the sum of nine products of bfloat16 numbers. We can then trade + accuracy for speed by dropping some of these products. The "high" precision algorithm + specifically keeps only the three most significant products, which conveniently excludes + all of the products involving the last 8 mantissa bits of either input. This means that + we can represent our inputs as the sum of two bfloat16 numbers rather than three. 
+ Because bfloat16 fused-multiply-add (FMA) instructions are typically >10x faster than + float32 ones, it's faster to do three multiplications and 2 additions with bfloat16 + precision than it is to do a single multiplication with float32 precision. + + .. [Henry2019] http://arxiv.org/abs/1904.06376 + + .. note:: + + This does not change the output dtype of float32 matrix multiplications, + it controls how the internal computation of the matrix multiplication is performed. + + .. note:: + + This does not change the precision of convolution operations. Other flags, + like `torch.backends.cudnn.allow_tf32`, may control the precision of convolution + operations. + + .. note:: + + This flag currently only affects one native device type: CUDA. + If "high" or "medium" are set then the TensorFloat32 datatype will be used + when computing float32 matrix multiplications, equivalent to setting + `torch.backends.cuda.matmul.allow_tf32 = True`. When "highest" (the default) + is set then the float32 datatype is used for internal computations, equivalent + to setting `torch.backends.cuda.matmul.allow_tf32 = False`. + + Args: + precision(str): can be set to "highest" (default), "high", or "medium" (see above). + + """ + _C._set_float32_matmul_precision(precision) + +def set_warn_always(b: builtins.bool) -> None: + r"""When this flag is False (default) then some PyTorch warnings may only + appear once per process. This helps avoid excessive warning information. + Setting it to True causes these warnings to always appear, which may be + helpful when debugging. + + Args: + b (:class:`bool`): If True, force warnings to always be emitted + If False, set to the default behaviour + """ + _C._set_warnAlways(b) + +def is_warn_always_enabled() -> builtins.bool: + r"""Returns True if the global warn_always flag is turned on. Refer to + :func:`torch.set_warn_always` documentation for more details. 
+ """ + return _C._get_warnAlways() + +################################################################################ +# Define error checking functions +################################################################################ + +# These error checking functions must be kept consistent with their C++ +# equivalents. Their C++ equivalents are mentioned where applicable. + +def _check_with(error_type, cond: Union[builtins.bool, SymBool], message: Callable[[], str]): # noqa: F811 + if not isinstance(cond, (builtins.bool, torch.SymBool)): + raise TypeError(f'cond must be a bool, but got {type(cond)}') + + from torch.fx.experimental.symbolic_shapes import expect_true + if expect_true(cond): + return + + # error_type must be a subclass of Exception and not subclass of Warning + assert issubclass(error_type, Exception) and not issubclass(error_type, Warning) + + if message is None: + message_evaluated = ( + 'Expected cond to be True, but got False. (Could this error ' + 'message be improved? If so, please report an enhancement request ' + 'to PyTorch.)') + + else: + if not callable(message): + raise TypeError('message must be a callable') + + message_evaluated = str(message()) + + raise error_type(message_evaluated) + +def _check(cond, message=None): # noqa: F811 + r"""Throws error containing an optional message if the specified condition + is False. + + Error type: ``RuntimeError`` + + C++ equivalent: ``TORCH_CHECK`` + + Args: + cond (:class:`bool`): If False, throw error + + message (Callable, optional): Callable that returns either a string or + an object that has a ``__str__()`` method to be used as the error + message. Default: ``None`` + """ + _check_with(RuntimeError, cond, message) + +def _check_is_size(i, message=None): + """Checks that a given integer is a valid size (i.e., is non-negative). 
+ You should use this over _check(i >= 0) because we can use the semantic + information (that i is a size) to make some further inferences in case + i is an unbacked SymInt. + + NB: Do NOT use this in contexts where a -1 size would be valid (indicating + to infer the size from context, or if you should wrap-around or truncate). + Only use this if the only valid value is an honest to goodness size. + """ + # This is responsible for the expect_true + _check(i >= 0, message) + from torch.fx.experimental.symbolic_shapes import _advise_is_size + _advise_is_size(i) + +def _check_index(cond, message=None): # noqa: F811 + r"""Throws error containing an optional message if the specified condition + is False. + + Error type: ``IndexError`` + + C++ equivalent: ``TORCH_CHECK_INDEX`` + + Args: + cond (:class:`bool`): If False, throw error + + message (Callable, optional): Callable that returns either a string or + an object that has a ``__str__()`` method to be used as the error + message. Default: ``None`` + """ + _check_with(IndexError, cond, message) + +def _check_value(cond, message=None): # noqa: F811 + r"""Throws error containing an optional message if the specified condition + is False. + + Error type: ``ValueError`` + + C++ equivalent: ``TORCH_CHECK_VALUE`` + + Args: + cond (:class:`bool`): If False, throw error + + message (Callable, optional): Callable that returns either a string or + an object that has a ``__str__()`` method to be used as the error + message. Default: ``None`` + """ + _check_with(ValueError, cond, message) + +def _check_type(cond, message=None): # noqa: F811 + r"""Throws error containing an optional message if the specified condition + is False. + + Error type: ``TypeError`` + + C++ equivalent: ``TORCH_CHECK_TYPE`` + + Args: + cond (:class:`bool`): If False, throw error + + message (Callable, optional): Callable that returns either a string or + an object that has a ``__str__()`` method to be used as the error + message. 
Default: ``None`` + """ + _check_with(TypeError, cond, message) + +def _check_not_implemented(cond, message=None): # noqa: F811 + r"""Throws error containing an optional message if the specified condition + is False. + + Error type: ``NotImplementedError`` + + C++ equivalent: ``TORCH_CHECK_NOT_IMPLEMENTED`` + + Args: + cond (:class:`bool`): If False, throw error + + message (Callable, optional): Callable that returns either a string or + an object that has a ``__str__()`` method to be used as the error + message. Default: ``None`` + """ + _check_with(NotImplementedError, cond, message) + +def _check_tensor_all_with(error_type, cond, message=None): # noqa: F811 + if not torch.is_tensor(cond): + raise TypeError(f'cond must be a tensor, but got {type(cond)}') + + if not cond.dtype == torch.bool: + raise TypeError( + f'cond tensor must have dtype torch.bool, but got {cond.dtype}') + + _check_with(error_type, cond._is_all_true().item(), message) + +# C++ equivalent: `TORCH_CHECK_TENSOR_ALL` +def _check_tensor_all(cond, message=None): # noqa: F811 + r"""Throws error containing an optional message if the specified condition + is False. + + Error type: ``RuntimeError`` + + C++ equivalent: ``TORCH_CHECK_TENSOR_ALL`` + + Args: + cond (:class:`torch.Tensor`): Tensor of dtype ``torch.bool``. If any + element is ``False``, throw error + + message (Callable, optional): Callable that returns either a string or + an object that has a ``__str__()`` method to be used as the error + message. 
Default: ``None`` + """ + _check_tensor_all_with(RuntimeError, cond, message) + +################################################################################ +# Define numeric constants +################################################################################ + +# For Python Array API (https://data-apis.org/array-api/latest/API_specification/constants.html) and +# NumPy consistency (https://numpy.org/devdocs/reference/constants.html) +from math import e , nan , inf , pi +__all__.extend(['e', 'pi', 'nan', 'inf']) + +################################################################################ +# Define Storage and Tensor classes +################################################################################ + +from ._tensor import Tensor +from .storage import _StorageBase, TypedStorage, _LegacyStorage, UntypedStorage, _warn_typed_storage_removal + +# NOTE: New Storage classes should never be added. When adding a new +# dtype, use torch.storage.TypedStorage directly. + +class ByteStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.uint8 + +class DoubleStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.double + +class FloatStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.float + +class HalfStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.half + +class LongStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.long + 
+class IntStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.int + +class ShortStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.short + +class CharStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.int8 + +class BoolStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.bool + +class BFloat16Storage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.bfloat16 + +class ComplexDoubleStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.cdouble + +class ComplexFloatStorage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.cfloat + +class QUInt8Storage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.quint8 + +class QInt8Storage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.qint8 + +class QInt32Storage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + 
return torch.qint32 + +class QUInt4x2Storage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.quint4x2 + +class QUInt2x4Storage(_LegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal(stacklevel=3) + return self._dtype + + @classproperty + def _dtype(self): + return torch.quint2x4 + +_storage_classes = { + UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, + ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage, + QUInt8Storage, QInt8Storage, QInt32Storage, BFloat16Storage, + ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage, QUInt2x4Storage, + TypedStorage +} + +# The _tensor_classes set is initialized by the call to initialize_python_bindings. +_tensor_classes: Set[Type] = set() + +# If you edit these imports, please update torch/__init__.py.in as well +from .random import set_rng_state, get_rng_state, manual_seed, initial_seed, seed +from .serialization import save, load +from ._tensor_str import set_printoptions + +################################################################################ +# Initialize extension +################################################################################ + +def manager_path(): + if _running_with_deploy() or platform.system() == 'Windows': + return b"" + path = get_file_path('torch', 'bin', 'torch_shm_manager') + prepare_multiprocessing_environment(get_file_path('torch')) + if not os.path.exists(path): + raise RuntimeError("Unable to find torch_shm_manager at " + path) + return path.encode('utf-8') + +from torch.amp import autocast, GradScaler + +# Initializing the extension shadows the built-in python float / int classes; +# store them for later use by SymInt / SymFloat. 
+py_float = float +py_int = int + +# Shared memory manager needs to know the exact location of manager executable +_C._initExtension(manager_path()) +del manager_path + +# Appease the type checker: it can't deal with direct setting of globals(). +# Note that we will see "too many" functions when reexporting this way; there +# is not a good way to fix this problem. Perhaps, try to redesign VariableFunctions +# so that this import is good enough +if TYPE_CHECKING: + # Some type signatures pulled in from _VariableFunctions here clash with + # signatures already imported. For now these clashes are ignored; see + # PR #43339 for details. + from torch._C._VariableFunctions import * # type: ignore[assignment, misc] # noqa: F403 + # Fixup segment_reduce visibility + _segment_reduce = segment_reduce + del segment_reduce # noqa: F821 + +# Ops not to be exposed in `torch` namespace, +# mostly helper ops. +PRIVATE_OPS = ( + 'unique_dim', +) + +for name in dir(_C._VariableFunctions): + if name.startswith('__') or name in PRIVATE_OPS: + continue + obj = getattr(_C._VariableFunctions, name) + obj.__module__ = 'torch' + # Hide some APIs that should not be public + if name == "segment_reduce": + # TODO: Once the undocumented FC window is passed, remove the line bellow + globals()[name] = obj + name = "_" + name + globals()[name] = obj + if not name.startswith("_"): + __all__.append(name) + + +################################################################################ +# Add torch.dtype instances to the public API +################################################################################ + +import torch + +for attribute in dir(torch): + if isinstance(getattr(torch, attribute), torch.dtype): + __all__.append(attribute) + +################################################################################ +# Import TorchDynamo's lazy APIs to avoid circular dependenices +################################################################################ + +# needs to be before 
from .functional import * to avoid circular dependencies +from ._compile import _disable_dynamo + +################################################################################ +# Import interface functions defined in Python +################################################################################ + +# needs to be after the above ATen bindings so we can overwrite from Python side +from .functional import * # noqa: F403 + + +################################################################################ +# Remove unnecessary members +################################################################################ + +del _StorageBase +del _LegacyStorage + +################################################################################ +# Define _assert +################################################################################ + +# needs to be before the submodule imports to avoid circular dependencies +def _assert(condition, message): + r"""A wrapper around Python's assert which is symbolically traceable. + """ + from .overrides import has_torch_function, handle_torch_function + + if type(condition) is not torch.Tensor and has_torch_function((condition,)): + return handle_torch_function(_assert, (condition,), condition, message) + assert condition, message + +################################################################################ +# Import most common subpackages +################################################################################ + +# Use the redundant form so that type checkers know that these are a part of +# the public API. The "regular" import lines are there solely for the runtime +# side effect of adding to the imported module's members for other users. 
+from torch import cuda as cuda +from torch import cpu as cpu +from torch import mps as mps +from torch import xpu as xpu +from torch import autograd as autograd +from torch.autograd import ( + no_grad as no_grad, + enable_grad as enable_grad, + set_grad_enabled as set_grad_enabled, + inference_mode as inference_mode, +) +from torch import fft as fft +from torch import futures as futures +from torch import _awaits as _awaits +from torch import nested as nested +from torch import nn as nn +from torch.signal import windows as windows +from torch import optim as optim +import torch.optim._multi_tensor +from torch import multiprocessing as multiprocessing +from torch import sparse as sparse +from torch import special as special +import torch.utils.backcompat +from torch import jit as jit +from torch import linalg as linalg +from torch import hub as hub +from torch import random as random +from torch import distributions as distributions +from torch import testing as testing +from torch import backends as backends +import torch.utils.data +from torch import __config__ as __config__ +from torch import __future__ as __future__ +from torch import profiler as profiler + +# Quantized, sparse, AO, etc. should be last to get imported, as nothing +# is expected to depend on them. +from torch import ao as ao +# nn.quant* depends on ao -- so should be after those. +import torch.nn.quantizable +import torch.nn.quantized +import torch.nn.qat +import torch.nn.intrinsic + +_C._init_names(list(torch._storage_classes)) + +# attach docstrings to torch and tensor functions +from . 
import _torch_docs, _tensor_docs, _storage_docs +del _torch_docs, _tensor_docs, _storage_docs + + +def compiled_with_cxx11_abi() -> builtins.bool: + r"""Returns whether PyTorch was built with _GLIBCXX_USE_CXX11_ABI=1""" + return _C._GLIBCXX_USE_CXX11_ABI + + +# Import the ops "namespace" +from torch._ops import ops +from torch._classes import classes +import torch._library + +# quantization depends on torch.fx +# Import quantization +from torch import quantization as quantization + +# Import the quasi random sampler +from torch import quasirandom as quasirandom + +# If you are seeing this, it means that this call site was not checked if +# the memory format could be preserved, and it was switched to old default +# behaviour of contiguous +legacy_contiguous_format = contiguous_format + +# Register fork handler to initialize OpenMP in child processes (see gh-28389) +from torch.multiprocessing._atfork import register_after_fork +register_after_fork(torch.get_num_threads) +del register_after_fork + +# Import tools that require fully imported torch (for applying +# torch.jit.script as a decorator, for instance): +from ._lobpcg import lobpcg as lobpcg + +# These were previously defined in native_functions.yaml and appeared on the +# `torch` namespace, but we moved them to c10 dispatch to facilitate custom +# class usage. We add these lines here to preserve backward compatibility. +quantized_lstm = torch.ops.aten.quantized_lstm +quantized_gru = torch.ops.aten.quantized_gru + +from torch.utils.dlpack import from_dlpack, to_dlpack + +# Import experimental masked operations support. See +# [RFC-0016](https://github.com/pytorch/rfcs/pull/27) for more +# information. +from . 
import masked + +# Import removed ops with error message about removal +from ._linalg_utils import ( # type: ignore[misc] + matrix_rank, + eig, + solve, + lstsq, +) +from ._linalg_utils import _symeig as symeig # type: ignore[misc] + +class _TorchCompileInductorWrapper: + compiler_name = "inductor" + + def __init__(self, mode, options, dynamic): + self.config: Dict[str, Any] = dict() + self.dynamic = dynamic + self.apply_mode(mode) + self.apply_options(options) + + if self.config.get("triton.cudagraphs", False): + os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1" + # FIXME: CUDA Graph does not work well with CUPTI teardown. + # 1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11) + # 2) crashes on 2nd non-lazy CUPTI re-init after teardown (CUDA 12) + # Workaround: turn off CUPTI teardown when using CUDA Graphs. + os.environ["TEARDOWN_CUPTI"] = "0" + + def __eq__(self, other): + return (isinstance(other, _TorchCompileInductorWrapper) and + self.config == other.config and + self.dynamic == other.dynamic) + + def apply_mode(self, mode: Optional[str]): + if mode is None or mode == "default": + pass + elif mode in ("reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"): + from torch._inductor import list_mode_options + self.apply_options(list_mode_options(mode, self.dynamic)) + else: + raise RuntimeError( + f"Unrecognized mode={mode}, should be one of: default, reduce-overhead, max-autotune, max-autotune-no-cudagraphs" + ) + + def apply_options(self, options: Optional[Dict[str, Any]]): + if not options: + return + + from torch._inductor import config + current_config: Dict[str, Any] = config.shallow_copy_dict() + + for key, val in options.items(): + attr_name = key.replace("-", "_") + if attr_name not in current_config: + raise RuntimeError( + f"Unexpected optimization option {key}, known options are {list(current_config.keys())}" + ) + if type(val) is not type(current_config[attr_name]): + val_type_str = type(val).__name__ + expected_type_str = 
type(current_config[attr_name]).__name__ + raise RuntimeError( + f"Unexpected type of attr {key}, got {val_type_str} should be {expected_type_str}" + ) + self.config[attr_name] = val + + def __call__(self, model_, inputs_): + from torch._inductor.compile_fx import compile_fx + + return compile_fx(model_, inputs_, config_patches=self.config) + + def get_compiler_config(self): + from torch._inductor.compile_fx import get_patched_config_dict + return get_patched_config_dict(config_patches=self.config) + + def reset(self): + from torch._inductor import config + if "triton.cudagraphs" in self.config or config.triton.cudagraphs: + if self.config.get("triton.cudagraphs", True): + from torch._inductor.cudagraph_trees import reset_cudagraph_trees + reset_cudagraph_trees() + +class _TorchCompileWrapper: + def __init__(self, backend, mode, options, dynamic): + from torch._dynamo.backends.registry import lookup_backend + + if isinstance(backend, str): + self.compiler_name = backend + elif hasattr(backend, "__name__"): + self.compiler_name = backend.__name__ + else: + self.compiler_name = str(backend) + self.dynamic = dynamic + self.compiler_fn = lookup_backend(backend) + self.kwargs = {} + # only pass the args if they non-empty + if mode and mode != "default": + self.kwargs["mode"] = mode + if options: + self.kwargs["options"] = options + + def __eq__(self, other): + return (isinstance(other, _TorchCompileWrapper) and + self.compiler_fn == other.compiler_fn and + self.kwargs == other.kwargs and + self.dynamic == other.dynamic) + + def __call__(self, model_, inputs_): + return self.compiler_fn(model_, inputs_, **self.kwargs) + + def reset(self): + if hasattr(self.compiler_fn, "reset"): + self.compiler_fn.reset() + + +def compile(model: Optional[Callable] = None, *, + fullgraph: builtins.bool = False, + dynamic: Optional[builtins.bool] = None, + backend: Union[str, Callable] = "inductor", + mode: Union[str, None] = None, + options: Optional[Dict[str, Union[str, builtins.int, 
builtins.bool]]] = None, + disable: builtins.bool = False) -> Callable: + """ + Optimizes given model/function using TorchDynamo and specified backend. + + Concretely, for every frame executed within the compiled region, we will attempt + to compile it and cache the compiled result on the code object for future + use. A single frame may be compiled multiple times if previous compiled + results are not applicable for subsequent calls (this is called a "guard + failure), you can use TORCH_LOGS=guards to debug these situations. + Multiple compiled results can be associated with a frame up to + ``torch._dynamo.config.cache_size_limit``, which defaults to 64; at which + point we will fall back to eager. Note that compile caches are per + *code object*, not frame; if you dynamically create multiple copies of a + function, they will all share the same code cache. + + Args: + model (Callable): Module/function to optimize + fullgraph (bool): If False (default), torch.compile attempts to discover compileable regions + in the function that it will optimize. If True, then we require that the entire function be + capturable into a single graph. If this is not possible (that is, if there are graph breaks), + then this will raise an error. + dynamic (bool or None): Use dynamic shape tracing. When this is True, we will up-front attempt + to generate a kernel that is as dynamic as possible to avoid recompilations when + sizes change. This may not always work as some operations/optimizations will + force specialization; use TORCH_LOGS=dynamic to debug overspecialization. + When this is False, we will NEVER generate dynamic kernels, we will always specialize. + By default (None), we automatically detect if dynamism has occurred and compile a more + dynamic kernel upon recompile. 
+ backend (str or Callable): backend to be used + + - "inductor" is the default backend, which is a good balance between performance and overhead + + - Non experimental in-tree backends can be seen with `torch._dynamo.list_backends()` + + - Experimental or debug in-tree backends can be seen with `torch._dynamo.list_backends(None)` + + - To register an out-of-tree custom backend: https://pytorch.org/docs/main/compile/custom-backends.html + mode (str): Can be either "default", "reduce-overhead", "max-autotune" or "max-autotune-no-cudagraphs" + + - "default" is the default mode, which is a good balance between performance and overhead + + - "reduce-overhead" is a mode that reduces the overhead of python with CUDA graphs, + useful for small batches. Reduction of overhead can come at the cost of more memory + usage, as we will cache the workspace memory required for the invocation so that we + do not have to reallocate it on subsequent runs. Reduction of overhead is not guaranteed + to work; today, we only reduce overhead for CUDA only graphs which do not mutate inputs. + There are other circumstances where CUDA graphs are not applicable; use TORCH_LOG=perf_hints + to debug. + + - "max-autotune" is a mode that leverages Triton based matrix multiplications and convolutions + It enables CUDA graphs by default. + + - "max-autotune-no-cudagraphs" is a mode similar to "max-autotune" but without CUDA graphs + + - To see the exact configs that each mode sets you can call `torch._inductor.list_mode_options()` + + options (dict): A dictionary of options to pass to the backend. Some notable ones to try out are + + - `epilogue_fusion` which fuses pointwise ops into templates. 
Requires `max_autotune` to also be set + + - `max_autotune` which will profile to pick the best matmul configuration + + - `fallback_random` which is useful when debugging accuracy issues + + - `shape_padding` which pads matrix shapes to better align loads on GPUs especially for tensor cores + + - `triton.cudagraphs` which will reduce the overhead of python with CUDA graphs + + - `trace.enabled` which is the most useful debugging flag to turn on + + - `trace.graph_diagram` which will show you a picture of your graph after fusion + + - For inductor you can see the full list of configs that it supports by calling `torch._inductor.list_options()` + disable (bool): Turn torch.compile() into a no-op for testing + + Example:: + + @torch.compile(options={"triton.cudagraphs": True}, fullgraph=True) + def foo(x): + return torch.sin(x) + torch.cos(x) + + """ + _C._log_api_usage_once("torch.compile") + # Temporary until we get proper support for python 3.12 + if sys.version_info >= (3, 12): + raise RuntimeError("Dynamo is not supported on Python 3.12+") + + # Decorator mode + if model is None: + def fn(model: Callable): + if model is None: + raise RuntimeError("Model can't be None") + return compile(model, + fullgraph=fullgraph, + dynamic=dynamic, + backend=backend, + mode=mode, + options=options, + disable=disable) + return fn + + if mode is not None and options is not None: + raise RuntimeError("Either mode or options can be specified, but both can't be specified at the same time.") + if mode is None and options is None: + mode = "default" + if backend == "inductor": + backend = _TorchCompileInductorWrapper(mode, options, dynamic) + else: + backend = _TorchCompileWrapper(backend, mode, options, dynamic) + + return torch._dynamo.optimize(backend=backend, nopython=fullgraph, dynamic=dynamic, disable=disable)(model) + + +from torch import export as export + +from torch._higher_order_ops import cond + +def _register_device_module(device_type, module): + r"""Register an external 
runtime module of the specific :attr:`device_type` + supported by torch. + + After the :attr:`module` is registered correctly, the user can refer + the external runtime module as part of torch with attribute torch.xxx. + """ + # Make sure the device_type represent a supported device type for torch. + device_type = torch.device(device_type).type + m = sys.modules[__name__] + if hasattr(m, device_type): + raise RuntimeError(f"The runtime module of '{device_type}' has already " + f"been registered with '{getattr(m, device_type)}'") + setattr(m, device_type, module) + torch_module_name = '.'.join([__name__, device_type]) + sys.modules[torch_module_name] = module + +# expose return_types +from . import return_types +from . import library +if not TYPE_CHECKING: + from . import _meta_registrations + +# Enable CUDA Sanitizer +if 'TORCH_CUDA_SANITIZER' in os.environ: + import torch.cuda._sanitizer as csan + + csan.enable_cuda_sanitizer() + +# Populate magic methods on SymInt and SymFloat +import torch.fx.experimental.sym_node + +from torch import func as func +from torch.func import vmap + + +# The function _sparse_coo_tensor_unsafe is removed from PyTorch +# Python API (v. 1.13), here we temporarily provide its replacement +# with a deprecation warning. +# TODO: remove the function for PyTorch v 1.15. 
+def _sparse_coo_tensor_unsafe(*args, **kwargs): + import warnings + warnings.warn('torch._sparse_coo_tensor_unsafe is deprecated, ' + 'use torch.sparse_coo_tensor(..., check_invariants=False) instead.') + kwargs['check_invariants'] = False + return torch.sparse_coo_tensor(*args, **kwargs) + +# Register MPS specific decomps +torch.backends.mps._init() + +if not _running_with_deploy(): + from torch import compiler as compiler + + class _TritonLibrary: + lib = torch.library.Library("triton", "DEF") + ops_table: Dict[Tuple[str, str], Callable] = {} + + @classmethod + def registerOp(cls, op_key, full_schema, op_impl, dispatch_key): + if (op_key, dispatch_key) not in cls.ops_table: + cls.lib.define(full_schema) + cls.lib.impl("triton::" + op_key, op_impl, dispatch_key) + cls.ops_table[(op_key, dispatch_key)] = op_impl + + return cls.ops_table[(op_key, dispatch_key)] + + +# Deprecated attributes +_deprecated_attrs = { + "has_mps": torch.backends.mps.is_built, + "has_cuda": torch.backends.cuda.is_built, + "has_cudnn": torch.backends.cudnn.is_available, + "has_mkldnn": torch.backends.mkldnn.is_available, +} + +if TYPE_CHECKING: + # Import the following modules during type checking to enable code intelligence features, + # such as auto-completion in tools like pylance, even when these modules are not explicitly + # imported in user code. 
+ from torch import _dynamo as _dynamo + from torch import _inductor as _inductor + from torch import onnx as onnx + +else: + _lazy_modules = { + "_dynamo", + "_inductor", + "_export", + # ONNX must be imported after _dynamo, _ops, _subclasses, fx, func and jit + "onnx", + } + + def __getattr__(name): + # Deprecated attrs + replacement = _deprecated_attrs.get(name) + if replacement is not None: + import warnings + warnings.warn(f"'{name}' is deprecated, please use '{replacement.__module__}.{replacement.__name__}()'", stacklevel=2) + return replacement() + + # Lazy modules + if name in _lazy_modules: + import importlib + return importlib.import_module(f".{name}", __name__) + + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + + +def _constrain_as_value(symbol, min: Optional[builtins.int] = None, max: Optional[builtins.int] = None): + """ + Add min/max constraint on the intermediate symbol at tracing time. If called in eager mode, + it will still check if the input value is within the specified range. + """ + torch.sym_constrain_range(symbol, min=min, max=max) + + +def _constrain_as_size(symbol, min: Optional[builtins.int] = None, max: Optional[builtins.int] = None): + """ + This indicates that a given int is size-like, and can be used in any context where a size is expected. + You will typically use this when reading out integers from Tensors, e.g., max.item() or lengths.tolist() + which then need to be used as tensor constructors. Providing these assertions to PyTorch can help resolve + GuardOnDataDependentSymNode errors upon export, since we cannot guard on unbacked SymInts. + + This function has unusual semantics which distinguish it from + constrain_as_value. Specifically, in some circumstances in framework + code, we will treat this int as >= 2 (when we do a size-oblivious guard). 
+ This makes it easier to This makes it easier to use the unbacked int in + size contexts, as we will often attempt to guard on a size being zero/one + (e.g., when computing the contiguity of a tensor, or testing if + broadcasting can occur), which will not work on unbacked SymInts. + However, if we conservatively assume that the size is not zero/one, we will + end up with a graph that will still work even if the size is zero/one. + + For more details, see https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit + ``` + """ + torch.sym_constrain_range_for_size(symbol, min=min, max=max) + + +from . import _logging +_logging._init_logs() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe6f3a4603169f622e953a793f606f7fa74a9cbb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/__init__.py @@ -0,0 +1,16 @@ +# torch.ao is a package with a lot of interdependencies. +# We will use lazy import to avoid cyclic dependencies here. + + +__all__ = [ + "nn", + "ns", + "quantization", + "pruning", +] + +def __getattr__(name): + if name in __all__: + import importlib + return importlib.import_module("." 
+ name, __name__) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6da38a6c27328910a9300a47fdf70b59362390c Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f711ad5d4a946c86dd6b59045237695c9d2c456 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/conv.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..2b588d84a74e009c567d4a2d0ede6ebca1a3d11e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/conv.py @@ -0,0 +1,270 @@ +import torch +import torch.nn as nn +from torch.nn.modules.utils import _single, _pair, _triple +from torch.ao.nn.intrinsic import _FusedModule +from typing import Tuple, TypeVar, Union +from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t + +__all__ = [ + "Conv1d", + "Conv2d", + "Conv3d" +] + +MOD = TypeVar('MOD', bound=nn.modules.conv._ConvNd) + +class 
_ConvNd(nn.modules.conv._ConvNd): + + _FLOAT_MODULE = MOD + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Tuple[int, ...], + stride: Tuple[int, ...], + padding: Tuple[int, ...], + dilation: Tuple[int, ...], + transposed: bool, + output_padding: Tuple[int, ...], + groups: int, + bias: bool, + padding_mode: str, + qconfig=None, + device=None, + dtype=None) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + nn.modules.conv._ConvNd.__init__(self, in_channels, out_channels, kernel_size, + stride, padding, dilation, transposed, + output_padding, groups, bias, padding_mode, **factory_kwargs) + assert qconfig, 'qconfig must be provided for QAT module' + self.qconfig = qconfig + self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs) + + def forward(self, input): + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + + @staticmethod + def from_float(cls, mod): + r"""Create a qat module from a float module + + Args: + `mod`: a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + assert type(mod) == cls._FLOAT_MODULE, ( + "qat." 
+ + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ # type: ignore[attr-defined] + ) + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' + if issubclass(type(mod), _FusedModule): + mod = mod[0] # type: ignore[index] + qconfig = mod.qconfig + qat_conv = cls(mod.in_channels, mod.out_channels, mod.kernel_size, + stride=mod.stride, padding=mod.padding, dilation=mod.dilation, + groups=mod.groups, bias=mod.bias is not None, + padding_mode=mod.padding_mode, qconfig=qconfig) + qat_conv.weight = mod.weight + qat_conv.bias = mod.bias + return qat_conv + + def to_float(self): + """ This works for both single qat conv, and the qat conv - relu modules + to convert the qat module to a floating point module + """ + cls = type(self) + conv = cls._FLOAT_CONV_MODULE( # type: ignore[attr-defined, operator] + self.in_channels, + self.out_channels, + self.kernel_size, # type: ignore[arg-type] + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.dilation, # type: ignore[arg-type] + self.groups, + self.bias is not None, + self.padding_mode) + conv.weight = torch.nn.Parameter(self.weight.detach()) + if self.bias is not None: + conv.bias = torch.nn.Parameter(self.bias.detach()) + # conv relu + if issubclass(cls, _FusedModule): + modules = [conv] + assert hasattr(cls, "_FLOAT_RELU_MODULE") + relu = cls._FLOAT_RELU_MODULE() # type: ignore[attr-defined] + modules.append(relu) + fused = cls._FLOAT_MODULE(*modules) # type: ignore[arg-type, attr-defined, operator] + fused.train(self.training) + return fused + else: + return conv + +class Conv1d(_ConvNd, nn.Conv1d): + r""" + A Conv1d module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as :class:`~torch.nn.Conv1d` + + Similar to :class:`~torch.nn.Conv2d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + """ + _FLOAT_MODULE = nn.Conv1d + _FLOAT_CONV_MODULE = nn.Conv1d + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: Union[str, _size_1_t] = 0, + dilation: _size_1_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = 'zeros', + qconfig=None, + device=None, + dtype=None) -> None: + kernel_size_ = _single(kernel_size) + stride_ = _single(stride) + padding_ = padding if isinstance(padding, str) else _single(padding) + dilation_ = _single(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride=stride_, + padding=padding_, + dilation=dilation_, + transposed=False, + output_padding=_single(0), + groups=groups, + bias=bias, + padding_mode=padding_mode, + qconfig=qconfig, + device=device, + dtype=dtype) + + @classmethod + def from_float(cls, mod): + return super().from_float(cls, mod) + +class Conv2d(_ConvNd, nn.Conv2d): + r""" + A Conv2d module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.Conv2d`, please see + https://pytorch.org/docs/stable/nn.html?highlight=conv2d#torch.nn.Conv2d + for documentation. + + Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + """ + _FLOAT_MODULE = nn.Conv2d + _FLOAT_CONV_MODULE = nn.Conv2d + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: _size_2_t, + stride: _size_2_t = 1, + padding: Union[str, _size_2_t] = 0, + dilation: _size_2_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = 'zeros', + qconfig=None, + device=None, + dtype=None) -> None: + kernel_size_ = _pair(kernel_size) + stride_ = _pair(stride) + padding_ = padding if isinstance(padding, str) else _pair(padding) + dilation_ = _pair(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride=stride_, + padding=padding_, + dilation=dilation_, + transposed=False, + output_padding=_pair(0), + groups=groups, + bias=bias, + padding_mode=padding_mode, + qconfig=qconfig, + device=device, + dtype=dtype) + + def forward(self, input): + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + + @classmethod + def from_float(cls, mod): + return super().from_float(cls, mod) + +class Conv3d(_ConvNd, nn.Conv3d): + r""" + A Conv3d module attached with FakeQuantize modules for weight, + used for quantization aware training. + + We adopt the same interface as `torch.nn.Conv3d`, please see + https://pytorch.org/docs/stable/nn.html?highlight=conv3d#torch.nn.Conv3d + for documentation. + + Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to + default. 
+ + Attributes: + weight_fake_quant: fake quant module for weight + """ + _FLOAT_MODULE = nn.Conv3d + _FLOAT_CONV_MODULE = nn.Conv3d + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: Union[str, _size_3_t] = 0, + dilation: _size_3_t = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = 'zeros', + qconfig=None, + device=None, + dtype=None) -> None: + kernel_size_ = _triple(kernel_size) + stride_ = _triple(stride) + padding_ = padding if isinstance(padding, str) else _triple(padding) + dilation_ = _triple(dilation) + super().__init__( + in_channels, + out_channels, + kernel_size_, + stride=stride_, + padding=padding_, + dilation=dilation_, + transposed=False, + output_padding=_triple(0), + groups=groups, + bias=bias, + padding_mode=padding_mode, + qconfig=qconfig, + device=device, + dtype=dtype) + + def forward(self, input): + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) + + @classmethod + def from_float(cls, mod): + return super().from_float(cls, mod) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..99d43ed3f6c22dbf08c50d6cd30195e2f482172e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/qat/modules/linear.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.ao.nn.intrinsic import LinearReLU +from torch.nn.utils.parametrize import ( + is_parametrized, + type_before_parametrizations, + transfer_parametrizations_and_params, +) + +__all__ = [ + "Linear" +] + +class Linear(nn.Linear): + r""" + A linear module attached with FakeQuantize modules for weight, + used for quantization aware training. 
+ + We adopt the same interface as `torch.nn.Linear`, please see + https://pytorch.org/docs/stable/nn.html#torch.nn.Linear + for documentation. + + Similar to `torch.nn.Linear`, with FakeQuantize modules initialized to + default. + + Attributes: + weight: fake quant module for weight + """ + _FLOAT_MODULE = nn.Linear + + def __init__(self, in_features, out_features, bias=True, + qconfig=None, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__(in_features, out_features, bias, **factory_kwargs) + assert qconfig, 'qconfig must be provided for QAT module' + self.qconfig = qconfig + self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs) + + def forward(self, input): + return F.linear(input, self.weight_fake_quant(self.weight), self.bias) + + @classmethod + def from_float(cls, mod): + r"""Create a qat module from a float module or qparams_dict + Args: `mod` a float module, either produced by torch.ao.quantization utilities + or directly from user + """ + assert type_before_parametrizations(mod) == cls._FLOAT_MODULE, ( + " qat." 
+ + cls.__name__ + + ".from_float only works for " + + cls._FLOAT_MODULE.__name__ + ) + assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined" + assert mod.qconfig, "Input float module must have a valid qconfig" + if type_before_parametrizations(mod) == LinearReLU: + mod = mod[0] + + qconfig = mod.qconfig + qat_linear = cls(mod.in_features, mod.out_features, bias=mod.bias is not None, qconfig=qconfig) + + if is_parametrized(mod, "weight"): + transfer_parametrizations_and_params(mod, qat_linear, "weight") + else: + qat_linear.weight = mod.weight + + if is_parametrized(mod, "bias"): + transfer_parametrizations_and_params(mod, qat_linear, "bias") + else: + qat_linear.bias = mod.bias + + return qat_linear + + def to_float(self): + linear = torch.nn.Linear(self.in_features, self.out_features, self.bias is not None) + linear.weight = torch.nn.Parameter(self.weight.detach()) + if self.bias is not None: + linear.bias = torch.nn.Parameter(self.bias.detach()) + linear.train(self.training) + return linear diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/linear.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..a6c720e9b6094f0f44ef74d3f50f5c282f1738c7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/linear.py @@ -0,0 +1,303 @@ +from collections.abc import Iterable +import torch + +import torch.nn as nn +import torch.ao.nn.intrinsic as nni +import torch.ao.nn.intrinsic.qat as nniqat +from torch.nn.utils.fusion import fuse_linear_bn_weights +from torch.nn.utils.parametrize import type_before_parametrizations + +from typing import Optional + +from .utils import _quantize_weight, _hide_packed_params_repr, WeightedQuantizedModule + +__all__ = ['LinearPackedParams', 'Linear'] + + +class 
LinearPackedParams(torch.nn.Module): + _version = 3 + + def __init__(self, dtype=torch.qint8): + super().__init__() + self.dtype = dtype + if self.dtype == torch.qint8: + wq = torch._empty_affine_quantized([1, 1], scale=1.0, zero_point=0, dtype=torch.qint8) + elif self.dtype == torch.float16: + wq = torch.zeros([1, 1], dtype=torch.float) + self.set_weight_bias(wq, None) # type: ignore[possibly-undefined] + + @torch.jit.export + def set_weight_bias(self, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> None: + if self.dtype == torch.qint8: + self._packed_params = torch.ops.quantized.linear_prepack(weight, bias) + elif self.dtype == torch.float16: + self._packed_params = torch.ops.quantized.linear_prepack_fp16(weight, bias) + else: + raise RuntimeError('Unsupported dtype on dynamic quantized linear!') + + + @torch.jit.export + def _weight_bias(self): + if self.dtype == torch.qint8: + return torch.ops.quantized.linear_unpack(self._packed_params) + elif self.dtype == torch.float16: + return torch.ops.quantized.linear_unpack_fp16(self._packed_params) + else: + raise RuntimeError('Unsupported dtype on dynamic quantized linear!') + + def forward(self, x): + return x + + # Version 1 + # self + # |--- weight : Tensor + # |--- bias : Tensor + # + # Version 2 + # self + # |--- weight : Tensor + # |--- bias : Tensor + # |--- dtype : torch.dtype + # + # Version 3 + # self + # |--- _packed_params : (Tensor, Tensor) representing (weight, bias) + # of LinearPackedParams + # |--- dtype : torch.dtype + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + 'dtype'] = self.dtype + destination[prefix + '_packed_params'] = self._weight_bias() + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + if version is None or version < 2: + self.dtype = torch.qint8 + else: + 
self.dtype = state_dict[prefix + 'dtype'] + state_dict.pop(prefix + 'dtype') + + if version is None or version < 3: + self.set_weight_bias(state_dict[prefix + 'weight'], state_dict[prefix + 'bias']) + state_dict.pop(prefix + 'weight') + state_dict.pop(prefix + 'bias') + + if version == 3: + weight, bias = state_dict[prefix + '_packed_params'] + state_dict.pop(prefix + '_packed_params') + self.set_weight_bias(weight, bias) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, False, + missing_keys, unexpected_keys, error_msgs) + + + def __repr__(self): + return self._weight_bias().__repr__() + + +class Linear(WeightedQuantizedModule): + r""" + A quantized linear module with quantized tensor as inputs and outputs. + We adopt the same interface as `torch.nn.Linear`, please see + https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation. + + Similar to :class:`~torch.nn.Linear`, attributes will be randomly + initialized at module creation time and will be overwritten later + + Attributes: + weight (Tensor): the non-learnable quantized weights of the module of + shape :math:`(\text{out\_features}, \text{in\_features})`. + bias (Tensor): the non-learnable bias of the module of shape :math:`(\text{out\_features})`. + If :attr:`bias` is ``True``, the values are initialized to zero. 
+ scale: `scale` parameter of output Quantized Tensor, type: double + zero_point: `zero_point` parameter for output Quantized Tensor, type: long + + Examples:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE) + >>> m = nn.quantized.Linear(20, 30) + >>> input = torch.randn(128, 20) + >>> # xdoctest: +SKIP + >>> input = torch.quantize_per_tensor(input, 1.0, 0, torch.quint8) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + _version = 3 + _FLOAT_MODULE = (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear) + + def __init__(self, in_features, out_features, bias_=True, + dtype=torch.qint8): + super().__init__() + # We don't muck around with buffers or attributes or anything here + # to keep the module simple. *everything* is simply a Python attribute. + # Serialization logic is explicitly handled in the below serialization and + # deserialization modules + self.in_features = in_features + self.out_features = out_features + bias = None + if bias_: + bias = torch.zeros(out_features, dtype=torch.float) + + if dtype == torch.qint8: + qweight = torch._empty_affine_quantized( + [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8) + elif dtype == torch.float16: + qweight = torch.zeros([out_features, in_features], dtype=torch.float) + else: + raise RuntimeError('Unsupported dtype specified for quantized Linear!') + + self._packed_params = LinearPackedParams(dtype) + self._packed_params.set_weight_bias(qweight, bias) + self.scale = 1.0 + self.zero_point = 0 + + def _get_name(self): + return 'QuantizedLinear' + + def extra_repr(self): + return 'in_features={}, out_features={}, scale={}, zero_point={}, qscheme={}'.format( + self.in_features, self.out_features, self.scale, self.zero_point, self.weight().qscheme() + ) + + def __repr__(self): + return _hide_packed_params_repr(self, LinearPackedParams) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.quantized.linear( + x, 
self._packed_params._packed_params, self.scale, self.zero_point) + + # ===== Serialization methods ===== + # The special consideration here is that we have to unpack the weights into their + # regular QTensor form for serialization. Packed weights should not live + # outside the process in which they were created, rather they should be derived + # from the QTensor weight. + # + # Version 1 + # self + # |--- scale : float + # |--- zero_point : int + # |--- weight : Tensor + # |--- bias : Tensor + # + # Version 2 + # self + # |--- scale : float + # |--- zero_point : int + # |--- _packed_params : Module + # |--- weight : Tensor + # |--- bias : Tensor + # + # Version 3 + # self + # |--- scale : float + # |--- zero_point : int + # |--- _packed_params : Module + # |--- _packed_params : (Tensor, Tensor) representing weight, bias + # of LinearPackedParams C++ struct + # + def _save_to_state_dict(self, destination, prefix, keep_vars): + super()._save_to_state_dict(destination, prefix, keep_vars) + destination[prefix + 'scale'] = torch.tensor(self.scale) + destination[prefix + 'zero_point'] = torch.tensor(self.zero_point) + + # ===== Deserialization methods ===== + # Counterpart to the serialization methods, we must pack the serialized QTensor + # weight into its packed format for use by the FBGEMM ops. 
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + self.scale = float(state_dict[prefix + 'scale']) + state_dict.pop(prefix + 'scale') + + self.zero_point = int(state_dict[prefix + 'zero_point']) + state_dict.pop(prefix + 'zero_point') + + version = local_metadata.get('version', None) + + if version is None or version == 1: + # We moved the parameters into a LinearPackedParameters submodule + weight = state_dict.pop(prefix + 'weight') + bias = state_dict.pop(prefix + 'bias') + state_dict.update({prefix + '_packed_params.weight': weight, + prefix + '_packed_params.bias': bias}) + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, False, + missing_keys, unexpected_keys, error_msgs) + + # Function rather than property to make sure that JIT serialization doesn't + # register this as an attribute + def _weight_bias(self): + return self._packed_params._weight_bias() + + def weight(self): + return self._weight_bias()[0] + + def bias(self): + return self._weight_bias()[1] + + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: + self._packed_params.set_weight_bias(w, b) + + @classmethod + def from_float(cls, mod): + r"""Create a quantized module from an observed float module + + Args: + mod (Module): a float module, either produced by torch.ao.quantization + utilities or provided by the user + """ + if hasattr(mod, 'weight_fake_quant'): + if type_before_parametrizations(mod) == nniqat.LinearBn1d: + mod.weight, mod.bias = fuse_linear_bn_weights( + mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var, + mod.bn.eps, mod.bn.weight, mod.bn.bias) + weight_post_process = mod.weight_fake_quant + activation_post_process = mod.activation_post_process + else: + # This function does not participate in JIT, so it is OK to ignore + # the type mismatch in assignment. 
Also, mypy has an issue with + # iterables not being implemented, so we are ignoring those too. + if not isinstance(cls._FLOAT_MODULE, Iterable): + cls._FLOAT_MODULE = [cls._FLOAT_MODULE] # type: ignore[assignment] + supported_modules = ', '.join([float_mod.__name__ for float_mod in cls._FLOAT_MODULE]) # type: ignore[attr-defined] + error_msg = f'nnq.{cls.__name__}.from_float only works for {supported_modules}, but got: {type(mod)}' + assert type_before_parametrizations(mod) in cls._FLOAT_MODULE, error_msg.format() # type: ignore[attr-defined] + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + activation_post_process = mod.activation_post_process + if type_before_parametrizations(mod) == nni.LinearReLU: + mod = mod[0] + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + dtype = weight_post_process.dtype + act_scale, act_zp = activation_post_process.calculate_qparams() + assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8' + qweight = _quantize_weight(mod.weight.float(), weight_post_process) + qlinear = cls(mod.in_features, + mod.out_features, + dtype=dtype) + qlinear.set_weight_bias(qweight, mod.bias) + qlinear.scale = float(act_scale) + qlinear.zero_point = int(act_zp) + return qlinear + + @classmethod + def from_reference(cls, ref_qlinear, output_scale, output_zero_point): + r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module + + Args: + ref_qlinear (Module): a reference quantized linear module, either produced by torch.ao.quantization + utilities or provided by the user + output_scale (float): scale for output Tensor + output_zero_point (int): zero point for output Tensor + """ + qlinear = cls( + ref_qlinear.in_features, + ref_qlinear.out_features) + qweight = ref_qlinear.get_quantized_weight() + qlinear.set_weight_bias(qweight, ref_qlinear.bias) + + qlinear.scale = float(output_scale) + qlinear.zero_point = int(output_zero_point) + return qlinear diff 
--git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/rnn.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..deb14856a9ef92a56c5988f1534df4330952133f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/nn/quantized/modules/rnn.py @@ -0,0 +1,51 @@ +import torch + +__all__ = [ + "LSTM", +] + +class LSTM(torch.ao.nn.quantizable.LSTM): + r"""A quantized long short-term memory (LSTM). + + For the description and the argument types, please, refer to :class:`~torch.nn.LSTM` + + Attributes: + layers : instances of the `_LSTMLayer` + + .. note:: + To access the weights and biases, you need to access them per layer. + See examples in :class:`~torch.ao.nn.quantizable.LSTM` + + Examples:: + >>> # xdoctest: +SKIP + >>> custom_module_config = { + ... 'float_to_observed_custom_module_class': { + ... nn.LSTM: nn.quantizable.LSTM, + ... }, + ... 'observed_to_quantized_custom_module_class': { + ... nn.quantizable.LSTM: nn.quantized.LSTM, + ... } + ... } + >>> tq.prepare(model, prepare_custom_module_class=custom_module_config) + >>> tq.convert(model, convert_custom_module_class=custom_module_config) + """ + _FLOAT_MODULE = torch.ao.nn.quantizable.LSTM # type: ignore[assignment] + + def _get_name(self): + return 'QuantizedLSTM' + + @classmethod + def from_float(cls, *args, **kwargs): + # The whole flow is float -> observed -> quantized + # This class does observed -> quantized only + raise NotImplementedError("It looks like you are trying to convert a " + "non-observed LSTM module. 
Please, see " + "the examples on quantizable LSTMs.") + + @classmethod + def from_observed(cls, other): + assert type(other) == cls._FLOAT_MODULE # type: ignore[has-type] + converted = torch.ao.quantization.convert(other, inplace=False, + remove_qconfig=True) + converted.__class__ = cls + return converted diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc844bc56bf6863094a458443f363dc2142659e8 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e37ee9d99226a35ede2db90ccbfea07be2e2bb6e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cca0620b821037fb4f2f24a2317cf88754ecff3 
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f09d86a7745c6f2c6642b036b2d31a27583944dc Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35e588fcebd095a2bc7a2d2f1e4d586047533885 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/INSTALLER b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/INSTALLER new file mode 100644 index 
0000000000000000000000000000000000000000..a1b589e38a32041e49332e5e81c2d363dc418d68 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..46495911f5842491de81ee47a0b00a96fa463edc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/typing_extensions-4.9.0.dist-info/RECORD @@ -0,0 +1,7 @@ +__pycache__/typing_extensions.cpython-311.pyc,, +typing_extensions-4.9.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +typing_extensions-4.9.0.dist-info/LICENSE,sha256=Oy-B_iHRgcSZxZolbI4ZaEVdZonSaaqFNzv7avQdo78,13936 +typing_extensions-4.9.0.dist-info/METADATA,sha256=ebx5L9BIL5U8F_82bApE9QHP5izlymctyyI4Ey1bTck,2966 +typing_extensions-4.9.0.dist-info/RECORD,, +typing_extensions-4.9.0.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81 +typing_extensions.py,sha256=R1TPIKi5cxfmdVZfNaDB7WjKgEY4deP5D2CS3XR3hcQ,110125